diff --git "a/slurm.out" "b/slurm.out"
new file mode 100644
--- /dev/null
+++ "b/slurm.out"
@@ -0,0 +1,2061 @@
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792]
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792] *****************************************
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792] *****************************************
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792]
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792] *****************************************
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792] *****************************************
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792]
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792] *****************************************
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792] *****************************************
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792]
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792] *****************************************
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
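Note: torchrun's warning above is generic; OMP_NUM_THREADS=1 per rank is a safe default, but if CPU-side work (tokenization, dataloading) bottlenecks a job it can be raised before torch initializes its thread pools. A minimal sketch, assuming you control the Python entrypoint; the value 8 is illustrative, not taken from this run:

    # set the intra-op thread count before torch spawns its thread pools (illustrative value)
    import os
    os.environ.setdefault("OMP_NUM_THREADS", "8")

    import torch
    torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))  # intra-op parallelism per rank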
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792] *****************************************
+0: [2025-09-02 19:37:04,095] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3622631] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
+0: [2025-09-02 19:37:04,095] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3622631] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
+3: [2025-09-02 19:37:05,674] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:2169992] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
+3: [2025-09-02 19:37:05,674] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:2169992] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
+1: [2025-09-02 19:37:05,843] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3051647] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
+1: [2025-09-02 19:37:05,843] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3051647] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
+2: [2025-09-02 19:37:05,901] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3738989] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
+2: [2025-09-02 19:37:05,902] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3738989] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
+0: [2025-09-02 19:37:07,766] [INFO] [axolotl.cli.config.load_cfg:245] [PID:3622631] [RANK:0] config:
+0: {
+0:   "activation_offloading": false,
+0:   "auto_resume_from_checkpoints": true,
+0:   "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1756826506457874101.yaml",
+0:   "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning",
+0:   "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning",
+0:   "batch_size": 16,
+0:   "bf16": true,
+0:   "capabilities": {
+0:     "bf16": true,
+0:     "compute_capability": "sm_90",
+0:     "fp8": false,
+0:     "n_gpu": 16,
+0:     "n_node": 1
+0:   },
+0:   "chat_template": "qwen_25",
+0:   "context_parallel_size": 1,
+0:   "dataloader_num_workers": 16,
+0:   "dataloader_pin_memory": true,
+0:   "dataloader_prefetch_factor": 256,
+0:   "dataset_prepared_path": "/lustre/fsn1/projects/rech/dgo/udv55np/dataset_math/Qwen3-235B-A22B/1",
+0:   "dataset_processes": 192,
+0:   "datasets": [
+0:     {
+0:       "chat_template": "tokenizer_default",
+0:       "field_messages": "conversations",
+0:       "message_property_mappings": {
+0:         "content": "content",
+0:         "role": "role"
+0:       },
+0:       "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/math/hf/thinking_text/generator/default-68225543a18d39ac/0.0.0",
+0:       "trust_remote_code": false,
+0:       "type": "chat_template"
+0:     }
+0:   ],
+0:   "ddp": true,
+0:   "deepspeed": {
+0:     "bf16": {
+0:       "enabled": true
+0:     },
+0:     "gradient_accumulation_steps": "auto",
+0:     "gradient_clipping": "auto",
+0:     "train_batch_size": "auto",
+0:     "train_micro_batch_size_per_gpu": "auto",
+0:     "wall_clock_breakdown": false,
+0:     "zero_optimization": {
+0:       "contiguous_gradients": true,
+0:       "overlap_comm": true,
+0:       "reduce_bucket_size": "auto",
+0:       "stage": 3,
+0:       "stage3_gather_16bit_weights_on_model_save": true,
+0:       "stage3_param_persistence_threshold": "auto",
"stage3_prefetch_bucket_size": "auto", +0: "sub_group_size": 0 +0: } +0: }, +0: "device": "cuda:0", +0: "device_map": { +0: "": 0 +0: }, +0: "dion_rank_fraction": 1.0, +0: "dion_rank_multiple_of": 1, +0: "env_capabilities": { +0: "torch_version": "2.6.0" +0: }, +0: "eval_batch_size": 1, +0: "eval_causal_lm_metrics": [ +0: "sacrebleu", +0: "comet", +0: "ter", +0: "chrf" +0: ], +0: "eval_max_new_tokens": 128, +0: "eval_sample_packing": true, +0: "eval_table_size": 0, +0: "evals_per_epoch": 0, +0: "flash_attention": true, +0: "fp16": false, +0: "gradient_accumulation_steps": 1, +0: "gradient_checkpointing": true, +0: "gradient_checkpointing_kwargs": { +0: "use_reentrant": true +0: }, +0: "learning_rate": 2e-05, +0: "lisa_layers_attribute": "model.layers", +0: "load_best_model_at_end": false, +0: "load_in_4bit": false, +0: "load_in_8bit": false, +0: "local_rank": 0, +0: "logging_steps": 10, +0: "lora_dropout": 0.0, +0: "loraplus_lr_embedding": 1e-06, +0: "lr_scheduler": "warmup_stable_decay", +0: "lr_scheduler_kwargs": { +0: "min_lr_ratio": 0.1, +0: "num_decay_steps": 300 +0: }, +0: "max_prompt_len": 512, +0: "mean_resizing_embeddings": false, +0: "micro_batch_size": 1, +0: "model_config_type": "qwen2", +0: "num_epochs": 1.0, +0: "optimizer": "adamw_torch_fused", +0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1", +0: "pad_to_sequence_len": true, +0: "pretrain_multipack_attn": true, +0: "pretrain_multipack_buffer_size": 10000, +0: "profiler_steps_start": 0, +0: "qlora_sharded_model_loading": false, +0: "ray_num_workers": 1, +0: "resources_per_worker": { +0: "GPU": 1 +0: }, +0: "sample_packing": true, +0: "sample_packing_bin_size": 200, +0: "sample_packing_group_size": 100000, +0: "save_only_model": false, +0: "save_safetensors": true, +0: "save_steps": 0.2, +0: "save_total_limit": 20, +0: "sequence_len": 16384, +0: "shuffle_before_merging_datasets": false, +0: "shuffle_merged_datasets": true, +0: "skip_prepare_dataset": false, +0: "special_tokens": { +0: "bos_token": "<|im_start|>", +0: "eos_token": "<|im_end|>", +0: "pad_token": "<|endoftext|>" +0: }, +0: "strict": false, +0: "tensor_parallel_size": 1, +0: "tf32": false, +0: "tiled_mlp_use_original_mlp": true, +0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning", +0: "torch_dtype": "torch.bfloat16", +0: "train_on_inputs": false, +0: "trl": { +0: "log_completions": false, +0: "mask_truncated_completions": false, +0: "ref_model_mixup_alpha": 0.9, +0: "ref_model_sync_steps": 64, +0: "scale_rewards": true, +0: "sync_ref_model": false, +0: "use_vllm": false, +0: "vllm_server_host": "0.0.0.0", +0: "vllm_server_port": 8000 +0: }, +0: "use_ray": false, +0: "use_tensorboard": true, +0: "val_set_size": 0.0, +0: "vllm": { +0: "device": "auto", +0: "dtype": "auto", +0: "gpu_memory_utilization": 0.9, +0: "host": "0.0.0.0", +0: "port": 8000 +0: }, +0: "warmup_steps": 150, +0: "weight_decay": 0.0, +0: "world_size": 16 +0: } +0: [2025-09-02 19:37:07,768] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:3622631] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used. 
+0: [2025-09-02 19:37:08,143] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:478] [PID:3622631] [RANK:0] Unable to find prepared dataset in /lustre/fsn1/projects/rech/dgo/udv55np/dataset_math/Qwen3-235B-A22B/1/21006e43c19b80ce1023552634abc92d
+0: [2025-09-02 19:37:08,143] [INFO] [axolotl.utils.data.sft._load_raw_datasets:314] [PID:3622631] [RANK:0] Loading raw datasets...
+0: [2025-09-02 19:37:08,143] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:316] [PID:3622631] [RANK:0] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.
+0: [2025-09-02 19:37:08,415] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:88] [PID:3622631] [RANK:0] Loading dataset: /lustre/fswork/projects/rech/qwv/udv55np/dataset/math/hf/thinking_text/generator/default-68225543a18d39ac/0.0.0 with base_type: chat_template and prompt_style: None
+0: [2025-09-02 19:37:08,421] [INFO] [axolotl.prompt_strategies.chat_template.__call__:957] [PID:3622631] [RANK:0] Using chat template:
+0: ---
+0: {%- if tools %}
+0:     {{- '<|im_start|>system\n' }}
+0:     {%- if messages[0]['role'] == 'system' %}
+0:         {{- messages[0]['content'] }}
+0:     {%- else %}
+0:         {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+0:     {%- endif %}
+0:     {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+0:     {%- for tool in tools %}
+0:         {{- "\n" }}
+0:         {{- tool | tojson }}
+0:     {%- endfor %}
+0:     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+0: {%- else %}
+0:     {%- if messages[0]['role'] == 'system' %}
+0:         {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+0:     {%- else %}
+0:         {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+0:     {%- endif %}
+0: {%- endif %}
+0: {%- for message in messages %}
+0:     {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+0:         {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+0:     {%- elif message.role == "assistant" %}
+0:         {{- '<|im_start|>' + message.role }}
+0:         {%- if message.content %}
+0:             {{- '\n' + message.content }}
+0:         {%- endif %}
+0:         {%- for tool_call in message.tool_calls %}
+0:             {%- if tool_call.function is defined %}
+0:                 {%- set tool_call = tool_call.function %}
+0:             {%- endif %}
+0:             {{- '\n<tool_call>\n{"name": "' }}
+0:             {{- tool_call.name }}
+0:             {{- '", "arguments": ' }}
+0:             {{- tool_call.arguments | tojson }}
+0:             {{- '}\n</tool_call>' }}
+0:         {%- endfor %}
+0:         {{- '<|im_end|>\n' }}
+0:     {%- elif message.role == "tool" %}
+0:         {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+0:             {{- '<|im_start|>user' }}
+0:         {%- endif %}
+0:         {{- '\n<tool_response>\n' }}
+0:         {{- message.content }}
+0:         {{- '\n</tool_response>' }}
+0:         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+0:             {{- '<|im_end|>\n' }}
+0:         {%- endif %}
+0:     {%- endif %}
+0: {%- endfor %}
+0: {%- if add_generation_prompt %}
+0:     {{- '<|im_start|>assistant\n' }}
+0: {%- endif %}
+0:
+0: ---
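The template above matches the stock Qwen2.5 chat template shipped with the tokenizer. A minimal sketch of rendering one conversation through it, assuming only transformers and the tokenizer path from the config dump; the messages are a toy example, not from the training set:

    # render a toy conversation through the tokenizer's chat template
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(
        "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning")
    messages = [
        {"role": "user", "content": "What is 2 + 2?"},
        {"role": "assistant", "content": "4"},
    ]
    print(tok.apply_chat_template(messages, tokenize=False))  # <|im_start|>...<|im_end|> blocks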
+0: Tokenizing Prompts (num_proc=192): 100%|██████████| 322369/322369 [timing lost; intermediate tqdm frames garbled in capture]
+0: Dropping Long Sequences (>16384) (num_proc=192): 100%|██████████| 322369/322369 [00:15<00:00, 21177.10 examples/s]
+0: [2025-09-02 19:40:53,734] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:251] [PID:3622631] [RANK:0] Dropped 31920 samples from dataset
+0: Drop Samples with Zero Trainable Tokens (num_proc=192):   0%|          | 0/290449 [00:00<?, ? examples/s]