diff --git "a/slurm.out" "b/slurm.out"
new file mode 100644--- /dev/null
+++ "b/slurm.out"
@@ -0,0 +1,2061 @@
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792]
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792] *****************************************
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+1: W0902 19:36:38.892000 3051572 torch/distributed/run.py:792] *****************************************
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792]
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792] *****************************************
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+0: W0902 19:36:39.195000 3622552 torch/distributed/run.py:792] *****************************************
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792]
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792] *****************************************
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+3: W0902 19:36:39.240000 2169916 torch/distributed/run.py:792] *****************************************
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792]
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792] *****************************************
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+2: W0902 19:36:39.704000 3738913 torch/distributed/run.py:792] *****************************************
+0: [2025-09-02 19:37:04,095] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3622631] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m
+0: [2025-09-02 19:37:04,095] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3622631] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m
+3: [2025-09-02 19:37:05,674] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:2169992] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m
+3: [2025-09-02 19:37:05,674] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:2169992] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m
+1: [2025-09-02 19:37:05,843] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3051647] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m
+1: [2025-09-02 19:37:05,843] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3051647] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m
+2: [2025-09-02 19:37:05,901] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3738989] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m
+2: [2025-09-02 19:37:05,902] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3738989] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m
+0: [2025-09-02 19:37:07,766] [INFO] [axolotl.cli.config.load_cfg:245] [PID:3622631] [RANK:0] config:
+0: {
+0: "activation_offloading": false,
+0: "auto_resume_from_checkpoints": true,
+0: "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1756826506457874101.yaml",
+0: "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning",
+0: "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning",
+0: "batch_size": 16,
+0: "bf16": true,
+0: "capabilities": {
+0: "bf16": true,
+0: "compute_capability": "sm_90",
+0: "fp8": false,
+0: "n_gpu": 16,
+0: "n_node": 1
+0: },
+0: "chat_template": "qwen_25",
+0: "context_parallel_size": 1,
+0: "dataloader_num_workers": 16,
+0: "dataloader_pin_memory": true,
+0: "dataloader_prefetch_factor": 256,
+0: "dataset_prepared_path": "/lustre/fsn1/projects/rech/dgo/udv55np/dataset_math/Qwen3-235B-A22B/1",
+0: "dataset_processes": 192,
+0: "datasets": [
+0: {
+0: "chat_template": "tokenizer_default",
+0: "field_messages": "conversations",
+0: "message_property_mappings": {
+0: "content": "content",
+0: "role": "role"
+0: },
+0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/math/hf/thinking_text/generator/default-68225543a18d39ac/0.0.0",
+0: "trust_remote_code": false,
+0: "type": "chat_template"
+0: }
+0: ],
+0: "ddp": true,
+0: "deepspeed": {
+0: "bf16": {
+0: "enabled": true
+0: },
+0: "gradient_accumulation_steps": "auto",
+0: "gradient_clipping": "auto",
+0: "train_batch_size": "auto",
+0: "train_micro_batch_size_per_gpu": "auto",
+0: "wall_clock_breakdown": false,
+0: "zero_optimization": {
+0: "contiguous_gradients": true,
+0: "overlap_comm": true,
+0: "reduce_bucket_size": "auto",
+0: "stage": 3,
+0: "stage3_gather_16bit_weights_on_model_save": true,
+0: "stage3_param_persistence_threshold": "auto",
+0: "stage3_prefetch_bucket_size": "auto",
+0: "sub_group_size": 0
+0: }
+0: },
+0: "device": "cuda:0",
+0: "device_map": {
+0: "": 0
+0: },
+0: "dion_rank_fraction": 1.0,
+0: "dion_rank_multiple_of": 1,
+0: "env_capabilities": {
+0: "torch_version": "2.6.0"
+0: },
+0: "eval_batch_size": 1,
+0: "eval_causal_lm_metrics": [
+0: "sacrebleu",
+0: "comet",
+0: "ter",
+0: "chrf"
+0: ],
+0: "eval_max_new_tokens": 128,
+0: "eval_sample_packing": true,
+0: "eval_table_size": 0,
+0: "evals_per_epoch": 0,
+0: "flash_attention": true,
+0: "fp16": false,
+0: "gradient_accumulation_steps": 1,
+0: "gradient_checkpointing": true,
+0: "gradient_checkpointing_kwargs": {
+0: "use_reentrant": true
+0: },
+0: "learning_rate": 2e-05,
+0: "lisa_layers_attribute": "model.layers",
+0: "load_best_model_at_end": false,
+0: "load_in_4bit": false,
+0: "load_in_8bit": false,
+0: "local_rank": 0,
+0: "logging_steps": 10,
+0: "lora_dropout": 0.0,
+0: "loraplus_lr_embedding": 1e-06,
+0: "lr_scheduler": "warmup_stable_decay",
+0: "lr_scheduler_kwargs": {
+0: "min_lr_ratio": 0.1,
+0: "num_decay_steps": 300
+0: },
+0: "max_prompt_len": 512,
+0: "mean_resizing_embeddings": false,
+0: "micro_batch_size": 1,
+0: "model_config_type": "qwen2",
+0: "num_epochs": 1.0,
+0: "optimizer": "adamw_torch_fused",
+0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1",
+0: "pad_to_sequence_len": true,
+0: "pretrain_multipack_attn": true,
+0: "pretrain_multipack_buffer_size": 10000,
+0: "profiler_steps_start": 0,
+0: "qlora_sharded_model_loading": false,
+0: "ray_num_workers": 1,
+0: "resources_per_worker": {
+0: "GPU": 1
+0: },
+0: "sample_packing": true,
+0: "sample_packing_bin_size": 200,
+0: "sample_packing_group_size": 100000,
+0: "save_only_model": false,
+0: "save_safetensors": true,
+0: "save_steps": 0.2,
+0: "save_total_limit": 20,
+0: "sequence_len": 16384,
+0: "shuffle_before_merging_datasets": false,
+0: "shuffle_merged_datasets": true,
+0: "skip_prepare_dataset": false,
+0: "special_tokens": {
+0: "bos_token": "<|im_start|>",
+0: "eos_token": "<|im_end|>",
+0: "pad_token": "<|endoftext|>"
+0: },
+0: "strict": false,
+0: "tensor_parallel_size": 1,
+0: "tf32": false,
+0: "tiled_mlp_use_original_mlp": true,
+0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B_reasoning",
+0: "torch_dtype": "torch.bfloat16",
+0: "train_on_inputs": false,
+0: "trl": {
+0: "log_completions": false,
+0: "mask_truncated_completions": false,
+0: "ref_model_mixup_alpha": 0.9,
+0: "ref_model_sync_steps": 64,
+0: "scale_rewards": true,
+0: "sync_ref_model": false,
+0: "use_vllm": false,
+0: "vllm_server_host": "0.0.0.0",
+0: "vllm_server_port": 8000
+0: },
+0: "use_ray": false,
+0: "use_tensorboard": true,
+0: "val_set_size": 0.0,
+0: "vllm": {
+0: "device": "auto",
+0: "dtype": "auto",
+0: "gpu_memory_utilization": 0.9,
+0: "host": "0.0.0.0",
+0: "port": 8000
+0: },
+0: "warmup_steps": 150,
+0: "weight_decay": 0.0,
+0: "world_size": 16
+0: }[39m
+0: [2025-09-02 19:37:07,768] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:3622631] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used.[39m
+0: [2025-09-02 19:37:08,143] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:478] [PID:3622631] [RANK:0] Unable to find prepared dataset in /lustre/fsn1/projects/rech/dgo/udv55np/dataset_math/Qwen3-235B-A22B/1/21006e43c19b80ce1023552634abc92d[39m
+0: [2025-09-02 19:37:08,143] [INFO] [axolotl.utils.data.sft._load_raw_datasets:314] [PID:3622631] [RANK:0] Loading raw datasets...[39m
+0: [33m[2025-09-02 19:37:08,143] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:316] [PID:3622631] [RANK:0] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`.[39m
+0: [2025-09-02 19:37:08,415] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:88] [PID:3622631] [RANK:0] Loading dataset: /lustre/fswork/projects/rech/qwv/udv55np/dataset/math/hf/thinking_text/generator/default-68225543a18d39ac/0.0.0 with base_type: chat_template and prompt_style: None[39m
+0: [2025-09-02 19:37:08,421] [INFO] [axolotl.prompt_strategies.chat_template.__call__:957] [PID:3622631] [RANK:0] Using chat template:
+0: ---
+0: {%- if tools %}
+0: {{- '<|im_start|>system\n' }}
+0: {%- if messages[0]['role'] == 'system' %}
+0: {{- messages[0]['content'] }}
+0: {%- else %}
+0: {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+0: {%- endif %}
+0: {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+0: {%- for tool in tools %}
+0: {{- "\n" }}
+0: {{- tool | tojson }}
+0: {%- endfor %}
+0: {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+0: {%- else %}
+0: {%- if messages[0]['role'] == 'system' %}
+0: {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+0: {%- else %}
+0: {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+0: {%- endif %}
+0: {%- endif %}
+0: {%- for message in messages %}
+0: {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+0: {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+0: {%- elif message.role == "assistant" %}
+0: {{- '<|im_start|>' + message.role }}
+0: {%- if message.content %}
+0: {{- '\n' + message.content }}
+0: {%- endif %}
+0: {%- for tool_call in message.tool_calls %}
+0: {%- if tool_call.function is defined %}
+0: {%- set tool_call = tool_call.function %}
+0: {%- endif %}
+0: {{- '\n\n{"name": "' }}
+0: {{- tool_call.name }}
+0: {{- '", "arguments": ' }}
+0: {{- tool_call.arguments | tojson }}
+0: {{- '}\n' }}
+0: {%- endfor %}
+0: {{- '<|im_end|>\n' }}
+0: {%- elif message.role == "tool" %}
+0: {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+0: {{- '<|im_start|>user' }}
+0: {%- endif %}
+0: {{- '\n\n' }}
+0: {{- message.content }}
+0: {{- '\n' }}
+0: {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+0: {{- '<|im_end|>\n' }}
+0: {%- endif %}
+0: {%- endif %}
+0: {%- endfor %}
+0: {%- if add_generation_prompt %}
+0: {{- '<|im_start|>assistant\n' }}
+0: {%- endif %}
+0:
+0: ---[39m
+0:
Tokenizing Prompts (num_proc=192): 0%| | 0/322369 [00:00, ? examples/s]
Tokenizing Prompts (num_proc=192): 0%| | 1000/322369 [01:20<7:09:06, 12.48 examples/s]
Tokenizing Prompts (num_proc=192): 1%| | 2000/322369 [01:23<3:05:28, 28.79 examples/s]
Tokenizing Prompts (num_proc=192): 1%| | 3000/322369 [01:23<1:42:35, 51.89 examples/s]
Tokenizing Prompts (num_proc=192): 1%| | 4000/322369 [01:24<1:02:58, 84.27 examples/s]
Tokenizing Prompts (num_proc=192): 2%|▏ | 5000/322369 [01:25<41:51, 126.37 examples/s]
Tokenizing Prompts (num_proc=192): 2%|▏ | 6000/322369 [01:25<27:58, 188.50 examples/s]
Tokenizing Prompts (num_proc=192): 2%|▏ | 7000/322369 [01:25<19:04, 275.47 examples/s]
Tokenizing Prompts (num_proc=192): 2%|▏ | 8000/322369 [01:27<15:23, 340.26 examples/s]
Tokenizing Prompts (num_proc=192): 3%|▎ | 9000/322369 [01:28<12:07, 430.48 examples/s]
Tokenizing Prompts (num_proc=192): 3%|▎
+0: | 11000/322369 [01:28<06:33, 791.66 examples/s]
Tokenizing Prompts (num_proc=192): 4%|▍ | 14000/322369 [01:28<03:27, 1488.24 examples/s]
Tokenizing Prompts (num_proc=192): 5%|▍ | 16000/322369 [01:28<02:30, 2038.45 examples/s]
Tokenizing Prompts (num_proc=192): 5%|▌ | 17000/322369 [01:29<02:22, 2145.67 examples/s]
Tokenizing Prompts (num_proc=192): 6%|▌ | 18000/322369 [01:29<02:19, 2179.32 examples/s]
Tokenizing Prompts (num_proc=192): 6%|▌ | 19000/322369 [01:29<01:56, 2595.18 examples/s]
Tokenizing Prompts (num_proc=192): 6%|▌ | 20000/322369 [01:30<01:43, 2920.29 examples/s]
Tokenizing Prompts (num_proc=192): 7%|▋ | 22000/322369 [01:30<01:11, 4214.34 examples/s]
Tokenizing Prompts (num_proc=192): 7%|▋ | 24000/322369 [01:30<00:50, 5916.12 examples/s]
Tokenizing Prompts (num_proc=192): 8%|▊ | 26000/322369 [01:31<01:24, 3516.48 examples/s]
Tokenizing Prompts (num_proc=192): 8%|▊ | 27000/32236
+0: 9 [01:31<01:42, 2886.42 examples/s]
Tokenizing Prompts (num_proc=192): 9%|▊ | 28000/322369 [01:32<01:31, 3224.42 examples/s]
Tokenizing Prompts (num_proc=192): 9%|▉ | 29000/322369 [01:32<01:25, 3433.63 examples/s]
Tokenizing Prompts (num_proc=192): 10%|▉ | 31000/322369 [01:32<00:59, 4938.24 examples/s]
Tokenizing Prompts (num_proc=192): 10%|▉ | 32000/322369 [01:32<00:53, 5440.46 examples/s]
Tokenizing Prompts (num_proc=192): 11%|█ | 34000/322369 [01:33<00:56, 5081.97 examples/s]
Tokenizing Prompts (num_proc=192): 11%|█ | 35000/322369 [01:33<00:50, 5693.53 examples/s]
Tokenizing Prompts (num_proc=192): 11%|█ | 36000/322369 [01:33<00:49, 5817.66 examples/s]
Tokenizing Prompts (num_proc=192): 12%|█▏ | 38000/322369 [01:33<00:57, 4907.51 examples/s]
Tokenizing Prompts (num_proc=192): 12%|█▏ | 39000/322369 [01:34<01:04, 4368.61 examples/s]
Tokenizing Prompts (num_proc=192): 12%|█▏ | 40000/322369 [01:34
+0: <01:34, 3002.35 examples/s]
Tokenizing Prompts (num_proc=192): 13%|█▎ | 42000/322369 [01:35<01:36, 2914.97 examples/s]
Tokenizing Prompts (num_proc=192): 13%|█▎ | 43000/322369 [01:35<01:21, 3407.60 examples/s]
Tokenizing Prompts (num_proc=192): 14%|█▍ | 46000/322369 [01:35<00:51, 5351.20 examples/s]
Tokenizing Prompts (num_proc=192): 15%|█▍ | 48000/322369 [01:35<00:42, 6457.06 examples/s]
Tokenizing Prompts (num_proc=192): 16%|█▌ | 50000/322369 [01:36<00:34, 7925.12 examples/s]
Tokenizing Prompts (num_proc=192): 16%|█▌ | 52000/322369 [01:36<00:29, 9214.74 examples/s]
Tokenizing Prompts (num_proc=192): 17%|█▋ | 54000/322369 [01:36<00:25, 10360.06 examples/s]
Tokenizing Prompts (num_proc=192): 17%|█▋ | 56000/322369 [01:36<00:21, 12128.63 examples/s]
Tokenizing Prompts (num_proc=192): 18%|█▊ | 58000/322369 [01:36<00:21, 12271.54 examples/s]
Tokenizing Prompts (num_proc=192): 19%|█▊ | 60000/3223
+0: 69 [01:37<00:31, 8400.65 examples/s]
Tokenizing Prompts (num_proc=192): 19%|█▉ | 62000/322369 [01:37<00:39, 6517.05 examples/s]
Tokenizing Prompts (num_proc=192): 20%|█▉ | 63000/322369 [01:37<00:44, 5781.79 examples/s]
Tokenizing Prompts (num_proc=192): 20%|█▉ | 64000/322369 [01:38<01:06, 3902.11 examples/s]
Tokenizing Prompts (num_proc=192): 20%|██ | 65000/322369 [01:38<01:03, 4038.62 examples/s]
Tokenizing Prompts (num_proc=192): 21%|██ | 68000/322369 [01:38<00:48, 5211.08 examples/s]
Tokenizing Prompts (num_proc=192): 21%|██▏ | 69000/322369 [01:39<01:26, 2914.51 examples/s]
Tokenizing Prompts (num_proc=192): 22%|██▏ | 70000/322369 [01:41<02:04, 2024.42 examples/s]
Tokenizing Prompts (num_proc=192): 22%|██▏ | 71000/322369 [01:41<02:01, 2064.83 examples/s]
Tokenizing Prompts (num_proc=192): 23%|██▎ | 75000/322369 [01:41<01:00, 4121.57 examples/s]
Tokenizing Prompts (num_proc=192): 24%|██▎
+0: | 76000/322369 [01:42<01:04, 3831.39 examples/s]
Tokenizing Prompts (num_proc=192): 24%|██▍ | 77000/322369 [01:42<01:01, 3974.69 examples/s]
Tokenizing Prompts (num_proc=192): 25%|██▍ | 80000/322369 [01:42<00:43, 5573.63 examples/s]
Tokenizing Prompts (num_proc=192): 25%|██▌ | 81000/322369 [01:43<01:22, 2922.27 examples/s]
Tokenizing Prompts (num_proc=192): 26%|██▌ | 83000/322369 [01:44<01:22, 2892.49 examples/s]
Tokenizing Prompts (num_proc=192): 27%|██▋ | 86000/322369 [01:44<00:51, 4578.26 examples/s]
Tokenizing Prompts (num_proc=192): 27%|██▋ | 88000/322369 [01:45<01:04, 3639.73 examples/s]
Tokenizing Prompts (num_proc=192): 28%|██▊ | 89000/322369 [01:46<01:42, 2266.96 examples/s]
Tokenizing Prompts (num_proc=192): 28%|██▊ | 91000/322369 [01:46<01:16, 3006.51 examples/s]
Tokenizing Prompts (num_proc=192): 29%|██▊ | 92000/322369 [01:47<01:42, 2236.97 examples/s]
Tokenizing Prompts (num_p
+0: roc=192): 29%|██▉ | 93000/322369 [01:47<01:28, 2594.98 examples/s]
Tokenizing Prompts (num_proc=192): 29%|██▉ | 94000/322369 [01:48<01:29, 2538.26 examples/s]
Tokenizing Prompts (num_proc=192): 29%|██▉ | 95000/322369 [01:48<01:42, 2226.68 examples/s]
Tokenizing Prompts (num_proc=192): 30%|███ | 97000/322369 [01:49<01:05, 3434.52 examples/s]
Tokenizing Prompts (num_proc=192): 30%|███ | 98000/322369 [01:49<01:21, 2736.22 examples/s]
Tokenizing Prompts (num_proc=192): 31%|███ | 99000/322369 [01:50<01:20, 2789.64 examples/s]
Tokenizing Prompts (num_proc=192): 31%|███▏ | 101000/322369 [01:50<01:08, 3248.62 examples/s]
Tokenizing Prompts (num_proc=192): 32%|███▏ | 102000/322369 [01:50<00:59, 3711.17 examples/s]
Tokenizing Prompts (num_proc=192): 32%|███▏ | 104000/322369 [01:51<00:52, 4172.21 examples/s]
Tokenizing Prompts (num_proc=192): 33%|███▎ | 106000/322369 [01:51<00:38, 5665.57
+0: examples/s]
Tokenizing Prompts (num_proc=192): 33%|███▎ | 107000/322369 [01:51<00:38, 5591.44 examples/s]
Tokenizing Prompts (num_proc=192): 34%|███▍ | 109000/322369 [01:51<00:34, 6115.34 examples/s]
Tokenizing Prompts (num_proc=192): 34%|███▍ | 110000/322369 [01:51<00:35, 5938.83 examples/s]
Tokenizing Prompts (num_proc=192): 35%|███▍ | 112000/322369 [01:52<00:31, 6604.75 examples/s]
Tokenizing Prompts (num_proc=192): 35%|███▌ | 114000/322369 [01:52<00:24, 8503.75 examples/s]
Tokenizing Prompts (num_proc=192): 36%|███▋ | 117000/322369 [01:52<00:20, 10034.27 examples/s]
Tokenizing Prompts (num_proc=192): 37%|███▋ | 119000/322369 [01:52<00:22, 9228.05 examples/s]
Tokenizing Prompts (num_proc=192): 38%|███▊ | 121000/322369 [01:52<00:24, 8064.63 examples/s]
Tokenizing Prompts (num_proc=192): 38%|███▊ | 122000/322369 [01:53<00:28, 6911.45 examples/s]
Tokenizing Prompts (num_proc=192): 3
+0: 8%|███▊ | 123000/322369 [01:53<00:32, 6098.10 examples/s]
Tokenizing Prompts (num_proc=192): 38%|███▊ | 124000/322369 [01:53<00:39, 4987.41 examples/s]
Tokenizing Prompts (num_proc=192): 39%|███▉ | 127000/322369 [01:54<00:31, 6154.36 examples/s]
Tokenizing Prompts (num_proc=192): 40%|████ | 130000/322369 [01:55<00:43, 4430.66 examples/s]
Tokenizing Prompts (num_proc=192): 41%|████ | 131000/322369 [01:55<00:47, 3992.37 examples/s]
Tokenizing Prompts (num_proc=192): 41%|████ | 132000/322369 [01:55<00:48, 3915.99 examples/s]
Tokenizing Prompts (num_proc=192): 41%|████▏ | 133000/322369 [01:55<00:42, 4481.16 examples/s]
Tokenizing Prompts (num_proc=192): 42%|████▏ | 134000/322369 [01:56<00:55, 3375.06 examples/s]
Tokenizing Prompts (num_proc=192): 42%|████▏ | 135000/322369 [01:56<00:50, 3679.29 examples/s]
Tokenizing Prompts (num_proc=192): 42%|████▏ | 136000/322369 [01:56<
+0: 00:43, 4327.05 examples/s]
Tokenizing Prompts (num_proc=192): 42%|████▏ | 137000/322369 [01:57<01:01, 2993.11 examples/s]
Tokenizing Prompts (num_proc=192): 43%|████▎ | 138000/322369 [01:57<00:49, 3711.41 examples/s]
Tokenizing Prompts (num_proc=192): 44%|████▎ | 141000/322369 [01:57<00:30, 5920.88 examples/s]
Tokenizing Prompts (num_proc=192): 44%|████▍ | 143000/322369 [01:58<00:32, 5596.41 examples/s]
Tokenizing Prompts (num_proc=192): 45%|████▍ | 144000/322369 [01:58<00:31, 5670.96 examples/s]
Tokenizing Prompts (num_proc=192): 45%|████▍ | 145000/322369 [01:58<00:32, 5534.38 examples/s]
Tokenizing Prompts (num_proc=192): 46%|████▌ | 147000/322369 [01:58<00:27, 6405.72 examples/s]
Tokenizing Prompts (num_proc=192): 47%|████▋ | 150000/322369 [01:58<00:19, 8814.45 examples/s]
Tokenizing Prompts (num_proc=192): 47%|████▋ | 152000/322369 [01:59<00:21, 7769.71 examples/s]
Tokeniz
+0: ing Prompts (num_proc=192): 47%|████▋ | 153000/322369 [01:59<00:23, 7174.25 examples/s]
Tokenizing Prompts (num_proc=192): 48%|████▊ | 154000/322369 [01:59<00:22, 7417.28 examples/s]
Tokenizing Prompts (num_proc=192): 48%|████▊ | 155000/322369 [01:59<00:32, 5222.98 examples/s]
Tokenizing Prompts (num_proc=192): 49%|████▉ | 159000/322369 [02:00<00:21, 7571.02 examples/s]
Tokenizing Prompts (num_proc=192): 50%|████▉ | 160000/322369 [02:00<00:21, 7542.33 examples/s]
Tokenizing Prompts (num_proc=192): 50%|████▉ | 161000/322369 [02:00<00:31, 5120.95 examples/s]
Tokenizing Prompts (num_proc=192): 50%|█████ | 162000/322369 [02:01<00:33, 4839.42 examples/s]
Tokenizing Prompts (num_proc=192): 51%|█████ | 163000/322369 [02:01<00:57, 2781.70 examples/s]
Tokenizing Prompts (num_proc=192): 51%|█████ | 164000/322369 [02:02<00:54, 2901.14 examples/s]
Tokenizing Prompts (num_proc=192): 51%|�
+0: ��████▏ | 166000/322369 [02:02<00:49, 3158.81 examples/s]
Tokenizing Prompts (num_proc=192): 52%|█████▏ | 167000/322369 [02:03<00:56, 2772.72 examples/s]
Tokenizing Prompts (num_proc=192): 52%|█████▏ | 168000/322369 [02:03<00:46, 3296.43 examples/s]
Tokenizing Prompts (num_proc=192): 53%|█████▎ | 170000/322369 [02:03<00:43, 3489.36 examples/s]
Tokenizing Prompts (num_proc=192): 53%|█████▎ | 171000/322369 [02:04<00:47, 3190.27 examples/s]
Tokenizing Prompts (num_proc=192): 54%|█████▎ | 173000/322369 [02:04<00:32, 4646.90 examples/s]
Tokenizing Prompts (num_proc=192): 54%|█████▍ | 174000/322369 [02:04<00:28, 5170.71 examples/s]
Tokenizing Prompts (num_proc=192): 55%|█████▍ | 176000/322369 [02:05<00:54, 2669.22 examples/s]
Tokenizing Prompts (num_proc=192): 56%|█████▌ | 179000/322369 [02:06<00:35, 4016.72 examples/s]
Tokenizing Prompts (num_proc=192): 56%|█████�
+0: � | 180000/322369 [02:06<00:46, 3068.50 examples/s]
Tokenizing Prompts (num_proc=192): 56%|█████▌ | 181000/322369 [02:07<00:44, 3159.62 examples/s]
Tokenizing Prompts (num_proc=192): 56%|█████▋ | 182000/322369 [02:07<00:42, 3331.19 examples/s]
Tokenizing Prompts (num_proc=192): 57%|█████▋ | 183000/322369 [02:08<00:51, 2701.06 examples/s]
Tokenizing Prompts (num_proc=192): 57%|█████▋ | 184000/322369 [02:08<00:42, 3274.82 examples/s]
Tokenizing Prompts (num_proc=192): 57%|█████▋ | 185000/322369 [02:09<01:12, 1907.38 examples/s]
Tokenizing Prompts (num_proc=192): 58%|█████▊ | 186000/322369 [02:09<01:02, 2189.64 examples/s]
Tokenizing Prompts (num_proc=192): 58%|█████▊ | 187000/322369 [02:11<01:59, 1129.81 examples/s]
Tokenizing Prompts (num_proc=192): 58%|█████▊ | 188000/322369 [02:11<01:38, 1360.10 examples/s]
Tokenizing Prompts (num_proc=192): 59%|█████▊ | 189000/32
+0: 2369 [02:12<01:18, 1692.07 examples/s]
Tokenizing Prompts (num_proc=192): 59%|█████▉ | 190000/322369 [02:21<07:07, 309.68 examples/s]
Tokenizing Prompts (num_proc=192): 59%|█████▉ | 190679/322369 [02:28<10:13, 214.82 examples/s]
Tokenizing Prompts (num_proc=192): 59%|█████▉ | 191679/322369 [02:29<07:34, 287.34 examples/s]
Tokenizing Prompts (num_proc=192): 60%|█████▉ | 192358/322369 [02:30<07:06, 305.10 examples/s]
Tokenizing Prompts (num_proc=192): 60%|█████▉ | 193037/322369 [02:31<05:27, 395.50 examples/s]
Tokenizing Prompts (num_proc=192): 60%|██████ | 194395/322369 [02:33<04:48, 443.35 examples/s]
Tokenizing Prompts (num_proc=192): 61%|██████ | 195753/322369 [02:34<03:16, 643.36 examples/s]
Tokenizing Prompts (num_proc=192): 61%|██████ | 196753/322369 [02:34<02:29, 837.99 examples/s]
Tokenizing Prompts (num_proc=192): 61%|██████ | 197432/322369 [02:34<02:08, 974.
+0: 10 examples/s]
Tokenizing Prompts (num_proc=192): 61%|██████▏ | 198111/322369 [02:37<03:23, 610.37 examples/s]
Tokenizing Prompts (num_proc=192): 62%|██████▏ | 199469/322369 [02:37<02:19, 883.86 examples/s]
Tokenizing Prompts (num_proc=192): 62%|██████▏ | 200827/322369 [02:38<01:43, 1174.94 examples/s]
Tokenizing Prompts (num_proc=192): 63%|██████▎ | 201506/322369 [02:38<01:30, 1331.26 examples/s]
Tokenizing Prompts (num_proc=192): 63%|██████▎ | 202185/322369 [02:38<01:20, 1484.66 examples/s]
Tokenizing Prompts (num_proc=192): 63%|██████▎ | 202864/322369 [02:39<01:12, 1650.05 examples/s]
Tokenizing Prompts (num_proc=192): 63%|██████▎ | 203543/322369 [02:39<01:03, 1865.72 examples/s]
Tokenizing Prompts (num_proc=192): 63%|██████▎ | 204222/322369 [02:39<00:54, 2173.02 examples/s]
Tokenizing Prompts (num_proc=192): 64%|██████▎ | 204901/322369 [02:40<01:29, 1310.
+0: 63 examples/s]
Tokenizing Prompts (num_proc=192): 64%|██████▍ | 205580/322369 [02:40<01:20, 1453.66 examples/s]
Tokenizing Prompts (num_proc=192): 64%|██████▍ | 206938/322369 [02:41<00:54, 2104.70 examples/s]
Tokenizing Prompts (num_proc=192): 64%|██████▍ | 207617/322369 [02:41<00:47, 2441.03 examples/s]
Tokenizing Prompts (num_proc=192): 65%|██████▍ | 208296/322369 [02:41<00:54, 2079.42 examples/s]
Tokenizing Prompts (num_proc=192): 65%|██████▍ | 208975/322369 [02:41<00:48, 2317.15 examples/s]
Tokenizing Prompts (num_proc=192): 65%|██████▌ | 210333/322369 [02:42<00:37, 3022.59 examples/s]
Tokenizing Prompts (num_proc=192): 66%|██████▌ | 212370/322369 [02:42<00:28, 3869.34 examples/s]
Tokenizing Prompts (num_proc=192): 67%|██████▋ | 214407/322369 [02:42<00:19, 5464.99 examples/s]
Tokenizing Prompts (num_proc=192): 67%|██████▋ | 215765/322369 [02:42<00:18, 571
+0: 7.52 examples/s]
Tokenizing Prompts (num_proc=192): 67%|██████▋ | 217123/322369 [02:43<00:21, 4927.01 examples/s]
Tokenizing Prompts (num_proc=192): 68%|██████▊ | 217802/322369 [02:43<00:34, 3066.15 examples/s]
Tokenizing Prompts (num_proc=192): 68%|██████▊ | 219160/322369 [02:44<00:27, 3762.83 examples/s]
Tokenizing Prompts (num_proc=192): 68%|██████▊ | 219839/322369 [02:44<00:33, 3030.82 examples/s]
Tokenizing Prompts (num_proc=192): 68%|██████▊ | 220518/322369 [02:45<00:55, 1830.84 examples/s]
Tokenizing Prompts (num_proc=192): 69%|██████▊ | 221197/322369 [02:45<00:59, 1689.49 examples/s]
Tokenizing Prompts (num_proc=192): 69%|██████▉ | 221876/322369 [02:46<01:18, 1285.50 examples/s]
Tokenizing Prompts (num_proc=192): 69%|██████▉ | 222555/322369 [02:47<01:14, 1341.28 examples/s]
Tokenizing Prompts (num_proc=192): 70%|██████▉ | 224592/322369 [02:47<00:40, 2
+0: 422.22 examples/s]
Tokenizing Prompts (num_proc=192): 70%|██████▉ | 225271/322369 [02:47<00:42, 2307.77 examples/s]
Tokenizing Prompts (num_proc=192): 70%|███████ | 225950/322369 [02:48<00:45, 2098.76 examples/s]
Tokenizing Prompts (num_proc=192): 70%|███████ | 226629/322369 [02:48<00:40, 2339.96 examples/s]
Tokenizing Prompts (num_proc=192): 71%|███████ | 227987/322369 [02:48<00:29, 3147.62 examples/s]
Tokenizing Prompts (num_proc=192): 71%|███████ | 228666/322369 [02:48<00:27, 3426.70 examples/s]
Tokenizing Prompts (num_proc=192): 71%|███████ | 229345/322369 [02:49<00:30, 3095.42 examples/s]
Tokenizing Prompts (num_proc=192): 71%|███████▏ | 230024/322369 [02:49<00:37, 2432.54 examples/s]
Tokenizing Prompts (num_proc=192): 72%|███████▏ | 230703/322369 [02:49<00:32, 2807.09 examples/s]
Tokenizing Prompts (num_proc=192): 72%|███████▏ | 232061/322369 [02:49<
+0: 00:23, 3809.93 examples/s]
Tokenizing Prompts (num_proc=192): 72%|███████▏ | 232740/322369 [02:51<01:01, 1461.67 examples/s]
Tokenizing Prompts (num_proc=192): 72%|███████▏ | 233419/322369 [02:51<00:53, 1652.12 examples/s]
Tokenizing Prompts (num_proc=192): 73%|███████▎ | 234098/322369 [02:51<00:49, 1791.81 examples/s]
Tokenizing Prompts (num_proc=192): 73%|███████▎ | 235456/322369 [02:52<00:36, 2380.17 examples/s]
Tokenizing Prompts (num_proc=192): 73%|███████▎ | 236135/322369 [02:52<00:43, 1986.08 examples/s]
Tokenizing Prompts (num_proc=192): 74%|███████▎ | 237493/322369 [02:53<00:34, 2444.88 examples/s]
Tokenizing Prompts (num_proc=192): 74%|███████▍ | 238172/322369 [02:53<00:31, 2696.56 examples/s]
Tokenizing Prompts (num_proc=192): 74%|███████▍ | 238851/322369 [02:53<00:29, 2845.25 examples/s]
Tokenizing Prompts (num_proc=192): 74%|███████▍ | 2
+0: 39530/322369 [02:53<00:30, 2696.42 examples/s]
Tokenizing Prompts (num_proc=192): 75%|███████▍ | 240209/322369 [02:53<00:26, 3044.02 examples/s]
Tokenizing Prompts (num_proc=192): 75%|███████▍ | 241567/322369 [02:54<00:18, 4286.57 examples/s]
Tokenizing Prompts (num_proc=192): 75%|███████▌ | 242247/322369 [02:54<00:21, 3809.80 examples/s]
Tokenizing Prompts (num_proc=192): 75%|███████▌ | 242926/322369 [02:54<00:18, 4233.84 examples/s]
Tokenizing Prompts (num_proc=192): 76%|███████▌ | 244284/322369 [02:54<00:14, 5406.25 examples/s]
Tokenizing Prompts (num_proc=192): 76%|███████▌ | 244963/322369 [02:54<00:14, 5350.62 examples/s]
Tokenizing Prompts (num_proc=192): 76%|███████▋ | 246321/322369 [02:54<00:13, 5760.84 examples/s]
Tokenizing Prompts (num_proc=192): 77%|███████▋ | 247000/322369 [02:55<00:15, 4732.75 examples/s]
Tokenizing Prompts (num_proc=192): 77%|███
+0: ████▋ | 249037/322369 [02:55<00:18, 4004.32 examples/s]
Tokenizing Prompts (num_proc=192): 77%|███████▋ | 249716/322369 [02:55<00:18, 3943.41 examples/s]
Tokenizing Prompts (num_proc=192): 78%|███████▊ | 251074/322369 [02:56<00:17, 4173.74 examples/s]
Tokenizing Prompts (num_proc=192): 78%|███████▊ | 251753/322369 [02:56<00:18, 3918.07 examples/s]
Tokenizing Prompts (num_proc=192): 78%|███████▊ | 252432/322369 [02:56<00:18, 3784.54 examples/s]
Tokenizing Prompts (num_proc=192): 79%|███████▊ | 253111/322369 [02:56<00:16, 4163.78 examples/s]
Tokenizing Prompts (num_proc=192): 79%|███████▉ | 254469/322369 [02:56<00:14, 4601.64 examples/s]
Tokenizing Prompts (num_proc=192): 79%|███████▉ | 255148/322369 [02:57<00:17, 3875.70 examples/s]
Tokenizing Prompts (num_proc=192): 79%|███████▉ | 255827/322369 [02:57<00:16, 4062.51 examples/s]
Tokenizing Prompts (num_proc=
+0: 192): 80%|███████▉ | 257185/322369 [02:57<00:14, 4412.07 examples/s]
Tokenizing Prompts (num_proc=192): 80%|████████ | 258543/322369 [02:57<00:12, 5313.58 examples/s]
Tokenizing Prompts (num_proc=192): 80%|████████ | 259222/322369 [02:57<00:11, 5414.51 examples/s]
Tokenizing Prompts (num_proc=192): 81%|████████ | 259901/322369 [02:58<00:11, 5259.63 examples/s]
Tokenizing Prompts (num_proc=192): 81%|████████▏ | 261938/322369 [02:58<00:10, 5849.39 examples/s]
Tokenizing Prompts (num_proc=192): 81%|████████▏ | 262617/322369 [02:58<00:10, 5501.34 examples/s]
Tokenizing Prompts (num_proc=192): 82%|████████▏ | 264654/322369 [02:58<00:08, 6537.00 examples/s]
Tokenizing Prompts (num_proc=192): 82%|████████▏ | 265333/322369 [02:59<00:13, 4257.74 examples/s]
Tokenizing Prompts (num_proc=192): 83%|████████▎ | 266691/322369 [02:59<00:10, 5317.27 examples/s]
+0:
Tokenizing Prompts (num_proc=192): 83%|████████▎ | 268728/322369 [02:59<00:11, 4688.72 examples/s]
Tokenizing Prompts (num_proc=192): 84%|████████▍ | 270086/322369 [02:59<00:09, 5369.20 examples/s]
Tokenizing Prompts (num_proc=192): 84%|████████▍ | 270765/322369 [03:00<00:09, 5205.89 examples/s]
Tokenizing Prompts (num_proc=192): 84%|████████▍ | 271444/322369 [03:00<00:13, 3685.47 examples/s]
Tokenizing Prompts (num_proc=192): 85%|████████▍ | 272802/322369 [03:00<00:11, 4178.81 examples/s]
Tokenizing Prompts (num_proc=192): 85%|████████▌ | 274160/322369 [03:00<00:09, 5210.99 examples/s]
Tokenizing Prompts (num_proc=192): 85%|████████▌ | 274839/322369 [03:01<00:08, 5357.08 examples/s]
Tokenizing Prompts (num_proc=192): 85%|████████▌ | 275518/322369 [03:01<00:08, 5603.95 examples/s]
Tokenizing Prompts (num_proc=192): 86%|████████▋ | 278234/32
+0: 2369 [03:01<00:06, 7255.63 examples/s]
Tokenizing Prompts (num_proc=192): 87%|████████▋ | 279592/322369 [03:01<00:08, 5327.40 examples/s]
Tokenizing Prompts (num_proc=192): 87%|████████▋ | 280271/322369 [03:01<00:07, 5366.46 examples/s]
Tokenizing Prompts (num_proc=192): 87%|████████▋ | 280950/322369 [03:02<00:07, 5295.13 examples/s]
Tokenizing Prompts (num_proc=192): 88%|████████▊ | 282987/322369 [03:02<00:08, 4833.28 examples/s]
Tokenizing Prompts (num_proc=192): 88%|████████▊ | 283666/322369 [03:02<00:07, 4933.09 examples/s]
Tokenizing Prompts (num_proc=192): 88%|████████▊ | 284345/322369 [03:03<00:11, 3359.59 examples/s]
Tokenizing Prompts (num_proc=192): 88%|████████▊ | 285024/322369 [03:03<00:11, 3263.62 examples/s]
Tokenizing Prompts (num_proc=192): 89%|████████▊ | 285703/322369 [03:03<00:16, 2279.12 examples/s]
Tokenizing Prompts (num_proc=192): 89%|�
+0: ��███████▉ | 286382/322369 [03:04<00:21, 1665.02 examples/s]
Tokenizing Prompts (num_proc=192): 89%|████████▉ | 287740/322369 [03:04<00:14, 2442.36 examples/s]
Tokenizing Prompts (num_proc=192): 89%|████████▉ | 288419/322369 [03:04<00:12, 2809.62 examples/s]
Tokenizing Prompts (num_proc=192): 90%|████████▉ | 289098/322369 [03:05<00:11, 2960.04 examples/s]
Tokenizing Prompts (num_proc=192): 90%|████████▉ | 289777/322369 [03:05<00:17, 1827.52 examples/s]
Tokenizing Prompts (num_proc=192): 90%|█████████ | 290456/322369 [03:06<00:16, 1972.81 examples/s]
Tokenizing Prompts (num_proc=192): 91%|█████████ | 291814/322369 [03:06<00:09, 3092.81 examples/s]
Tokenizing Prompts (num_proc=192): 91%|█████████▏| 294530/322369 [03:06<00:05, 4727.89 examples/s]
Tokenizing Prompts (num_proc=192): 92%|█████████▏| 295209/322369 [03:06<00:06, 3931.43 examples/s]
+0:
Tokenizing Prompts (num_proc=192): 92%|█████████▏| 295888/322369 [03:07<00:07, 3542.11 examples/s]
Tokenizing Prompts (num_proc=192): 92%|█████████▏| 296567/322369 [03:07<00:07, 3525.33 examples/s]
Tokenizing Prompts (num_proc=192): 92%|█████████▏| 297925/322369 [03:07<00:06, 3800.61 examples/s]
Tokenizing Prompts (num_proc=192): 93%|█████████▎| 299283/322369 [03:08<00:05, 4307.85 examples/s]
Tokenizing Prompts (num_proc=192): 93%|█████████▎| 301320/322369 [03:08<00:04, 4855.89 examples/s]
Tokenizing Prompts (num_proc=192): 94%|█████████▎| 301999/322369 [03:08<00:04, 4105.00 examples/s]
Tokenizing Prompts (num_proc=192): 94%|█████████▍| 302678/322369 [03:08<00:05, 3478.05 examples/s]
Tokenizing Prompts (num_proc=192): 94%|█████████▍| 304036/322369 [03:09<00:06, 2793.45 examples/s]
Tokenizing Prompts (num_proc=192): 95%|███████�
+0: �█▍| 304715/322369 [03:10<00:07, 2492.44 examples/s]
Tokenizing Prompts (num_proc=192): 95%|█████████▌| 306752/322369 [03:10<00:03, 4052.01 examples/s]
Tokenizing Prompts (num_proc=192): 96%|█████████▌| 308110/322369 [03:10<00:02, 4832.91 examples/s]
Tokenizing Prompts (num_proc=192): 96%|█████████▌| 309468/322369 [03:10<00:03, 4206.48 examples/s]
Tokenizing Prompts (num_proc=192): 96%|█████████▌| 310147/322369 [03:10<00:02, 4310.54 examples/s]
Tokenizing Prompts (num_proc=192): 97%|█████████▋| 311505/322369 [03:11<00:02, 5298.84 examples/s]
Tokenizing Prompts (num_proc=192): 97%|█████████▋| 312184/322369 [03:11<00:02, 4237.44 examples/s]
Tokenizing Prompts (num_proc=192): 97%|█████████▋| 312863/322369 [03:11<00:02, 4246.83 examples/s]
Tokenizing Prompts (num_proc=192): 97%|█████��███▋| 313542/322369 [03:11<00:02, 3132.99 examples/s]
Tokeniz
+0: ing Prompts (num_proc=192): 97%|█████████▋| 314221/322369 [03:12<00:02, 2904.33 examples/s]
Tokenizing Prompts (num_proc=192): 98%|█████████▊| 315579/322369 [03:12<00:02, 3285.91 examples/s]
Tokenizing Prompts (num_proc=192): 98%|█████████▊| 316258/322369 [03:12<00:01, 3254.45 examples/s]
Tokenizing Prompts (num_proc=192): 99%|█████████▊| 317616/322369 [03:13<00:01, 2837.23 examples/s]
Tokenizing Prompts (num_proc=192): 99%|█████████▊| 318295/322369 [03:13<00:01, 2193.08 examples/s]
Tokenizing Prompts (num_proc=192): 99%|█████████▉| 318974/322369 [03:14<00:02, 1592.02 examples/s]
Tokenizing Prompts (num_proc=192): 99%|█████████▉| 319653/322369 [03:15<00:01, 1552.20 examples/s]
Tokenizing Prompts (num_proc=192): 99%|█████████▉| 320332/322369 [03:16<00:01, 1185.50 examples/s]
Tokenizing Prompts (num_proc=192): 100%|█████████▉|
+0: 321011/322369 [03:16<00:01, 1028.53 examples/s]
Tokenizing Prompts (num_proc=192): 100%|█████████▉| 321690/322369 [03:19<00:01, 617.73 examples/s]
Tokenizing Prompts (num_proc=192): 100%|██████████| 322369/322369 [03:20<00:00, 572.60 examples/s]
Tokenizing Prompts (num_proc=192): 100%|██████████| 322369/322369 [03:21<00:00, 1601.32 examples/s]
+0: [2025-09-02 19:40:36,936] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:209] [PID:3622631] [RANK:0] min_input_len: 385[39m
+0: [2025-09-02 19:40:36,937] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:211] [PID:3622631] [RANK:0] max_input_len: 16413[39m
+0:
Dropping Long Sequences (>16384) (num_proc=192): 0%| | 0/322369 [00:00, ? examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 0%| | 1000/322369 [00:06<36:09, 148.14 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 1%| | 4000/322369 [00:06<07:00, 757.58 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 2%|▏ | 6000/322369 [00:07<04:10, 1261.67 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 3%|▎ | 10000/322369 [00:07<01:56, 2692.18 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 5%|▍ | 15000/322369 [00:07<01:01, 5034.59 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 6%|▌ | 19000/322369 [00:07<00:42, 7217.43 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 8%|▊ | 25000/322369 [00:07<00:25, 11484.18 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 10%|█ | 33000/322369 [00:07<00:15, 18799.94 examples/s]
Dropping Lo
+0: ng Sequences (>16384) (num_proc=192): 13%|█▎ | 43000/322369 [00:07<00:09, 28769.90 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 15%|█▌ | 49000/322369 [00:07<00:08, 33450.33 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 18%|█▊ | 59000/322369 [00:08<00:05, 44377.24 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 20%|██ | 66000/322369 [00:08<00:05, 47104.39 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 23%|██▎ | 73000/322369 [00:08<00:05, 44052.55 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 25%|██▍ | 79000/322369 [00:08<00:05, 46681.30 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 26%|██▋ | 85000/322369 [00:08<00:06, 36246.85 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 28%|██▊ | 90000/322369 [00:09<00:07, 30121.31 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 29%|██▉ | 94000/32236
+0: 9 [00:09<00:08, 25538.92 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 30%|███ | 98000/322369 [00:09<00:08, 25422.39 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 32%|███▏ | 103000/322369 [00:09<00:07, 28478.49 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 33%|███▎ | 107000/322369 [00:09<00:07, 30388.17 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 35%|███▍ | 112000/322369 [00:09<00:06, 32943.40 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 36%|███▌ | 116000/322369 [00:09<00:06, 31420.77 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 40%|████ | 129000/322369 [00:10<00:03, 52910.28 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 42%|████▏ | 137000/322369 [00:10<00:03, 58067.13 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 45%|████▍ | 144000/322369 [00:10<00:02, 60217.10 examples/s]
Dr
+0: opping Long Sequences (>16384) (num_proc=192): 47%|████▋ | 151000/322369 [00:10<00:02, 61892.45 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 49%|████▉ | 158000/322369 [00:10<00:02, 63424.25 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 51%|█████ | 165000/322369 [00:10<00:02, 59524.72 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 53%|█████▎ | 172000/322369 [00:10<00:02, 61209.42 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 56%|█████▌ | 180000/322369 [00:10<00:02, 65579.19 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 58%|█████▊ | 187000/322369 [00:10<00:02, 62434.71 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 60%|██████ | 194037/322369 [00:12<00:07, 16980.70 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 62%|██████▏ | 198790/322369 [00:12<00:07, 16811.26 examples/s]
Dropping Long
+0: Sequences (>16384) (num_proc=192): 63%|██████▎ | 202864/322369 [00:12<00:06, 17545.05 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 64%|██████▍ | 206259/322369 [00:12<00:06, 19099.19 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 66%|██████▌ | 211691/322369 [00:12<00:04, 22159.56 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 67%|██████▋ | 217123/322369 [00:12<00:03, 27099.77 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 69%|██████▉ | 221876/322369 [00:13<00:03, 29819.74 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 72%|███████▏ | 230703/322369 [00:13<00:02, 40026.85 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 73%|███████▎ | 236136/322369 [00:13<00:02, 38594.30 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 75%|███████▍ | 240889/322369 [00:13<00:02, 40423.40 examples/s]
+0: Dropping Long Sequences (>16384) (num_proc=192): 76%|███████▌ | 245642/322369 [00:13<00:01, 40269.97 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 78%|███████▊ | 251753/322369 [00:13<00:01, 44576.83 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 80%|███████▉ | 257185/322369 [00:13<00:01, 43078.37 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 81%|████████▏ | 261938/322369 [00:13<00:01, 40329.01 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 84%|████████▍ | 272123/322369 [00:14<00:00, 54881.05 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 88%|████████▊ | 282308/322369 [00:14<00:00, 65008.56 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 91%|█████████ | 291814/322369 [00:14<00:00, 72724.62 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 93%|█████████▎| 300641/322369
+0: [00:14<00:00, 75828.80 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 96%|█████████▌| 308789/322369 [00:14<00:00, 68320.98 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 98%|█████████▊| 316258/322369 [00:14<00:00, 67013.11 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 100%|██████████| 322369/322369 [00:15<00:00, 21177.10 examples/s]
+0: [33m[2025-09-02 19:40:53,734] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:251] [PID:3622631] [RANK:0] Dropped 31920 samples from dataset[39m
+0:
Drop Samples with Zero Trainable Tokens (num_proc=192): 0%| | 0/290449 [00:00, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 0%| | 1000/290449 [00:04<23:22, 206.35 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 1%| | 3000/290449 [00:05<06:25, 744.87 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 2%|▏ | 5000/290449 [00:05<03:15, 1460.49 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 2%|▏ | 7000/290449 [00:05<02:22, 1992.67 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 3%|▎ | 9000/290449 [00:05<01:39, 2824.31 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 4%|▍ | 11000/290449 [00:06<01:16, 3651.34 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 5%|▌ | 15000/290449 [00:06<00:41, 6601.62 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 6%|▌ |
+0: 17000/290449 [00:06<00:36, 7471.80 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 7%|▋ | 21000/290449 [00:06<00:27, 9638.40 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 9%|▊ | 25000/290449 [00:06<00:23, 11302.53 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 10%|▉ | 28000/290449 [00:07<00:36, 7138.81 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 18%|█▊ | 52000/290449 [00:07<00:08, 27187.59 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 20%|██ | 59000/290449 [00:08<00:08, 28468.98 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 22%|██▏ | 64513/290449 [00:08<00:08, 27877.83 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 24%|██▍ | 69539/290449 [00:08<00:07, 29651.97 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 25%|██▌ | 74052/290449 [00:08<00:07
+0: , 30046.55 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 27%|██▋ | 78565/290449 [00:08<00:06, 31590.66 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 28%|██▊ | 82565/290449 [00:09<00:08, 23362.12 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 30%|██▉ | 86591/290449 [00:09<00:07, 25486.99 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 31%|███ | 90130/290449 [00:09<00:09, 20457.83 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 32%|███▏ | 93156/290449 [00:09<00:10, 18160.10 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 33%|███▎ | 96182/290449 [00:09<00:10, 17945.14 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 34%|███▍ | 99721/290449 [00:10<00:10, 18799.21 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 36%|███▌ | 104260/290449 [00:10<00:0
+0: 8, 23127.42 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 38%|███▊ | 110260/290449 [00:10<00:06, 29836.06 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 41%|████ | 118312/290449 [00:10<00:04, 38736.04 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 43%|████▎ | 124825/290449 [00:10<00:03, 44326.33 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 45%|████▍ | 129877/290449 [00:10<00:03, 45214.17 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 46%|████▋ | 134903/290449 [00:10<00:03, 40051.01 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 48%|████▊ | 139442/290449 [00:10<00:03, 39949.29 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 50%|█████ | 145442/290449 [00:10<00:03, 44883.21 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 53%|█████▎
+0: | 153494/290449 [00:11<00:02, 53804.06 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 56%|█████▌ | 162033/290449 [00:11<00:02, 55773.00 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 59%|█████▉ | 171572/290449 [00:11<00:01, 64690.97 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 63%|██████▎ | 181702/290449 [00:11<00:01, 69984.89 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 65%|██████▌ | 189780/290449 [00:11<00:01, 68142.13 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 68%|██████▊ | 198858/290449 [00:11<00:01, 68007.39 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 71%|███████ | 205910/290449 [00:11<00:01, 64318.70 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 73%|███████▎ | 212962/290449 [00:11<00:01, 64574.79 examples/s]
Drop Samples with Ze
+0: ro Trainable Tokens (num_proc=192): 76%|███████▌ | 219527/290449 [00:12<00:01, 63669.07 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 78%|███████▊ | 227605/290449 [00:12<00:00, 64273.58 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 81%|████████ | 234657/290449 [00:12<00:00, 59830.77 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 83%|████████▎ | 240786/290449 [00:12<00:00, 52711.05 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 85%|████████▍ | 246403/290449 [00:12<00:00, 52530.09 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 87%|████████▋ | 252019/290449 [00:12<00:00, 44269.30 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 89%|████████▊ | 257144/290449 [00:12<00:00, 42447.89 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 90%|███
+0: ██████ | 262271/290449 [00:12<00:00, 44477.86 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 92%|█████████▏| 267910/290449 [00:13<00:00, 46665.26 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 94%|█████████▍| 273037/290449 [00:13<00:00, 37991.27 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 96%|█████████▌| 277647/290449 [00:13<00:00, 34992.04 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 97%|█████████▋| 281745/290449 [00:13<00:00, 35592.32 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 98%|█████████▊| 285841/290449 [00:13<00:00, 24333.80 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 100%|██████████| 290449/290449 [00:14<00:00, 26388.32 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 100%|██████████| 290449/29
+0: 0449 [00:14<00:00, 19726.00 examples/s]
+0:
Add position_id column (Sample Packing) (num_proc=192): 0%| | 0/290449 [00:00, ? examples/s]
Add position_id column (Sample Packing) (num_proc=192): 0%| | 1000/290449 [00:09<46:32, 103.64 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 1%| | 2000/290449 [00:09<19:25, 247.59 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 1%|▏ | 4000/290449 [00:10<07:33, 631.40 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 3%|▎ | 8000/290449 [00:10<02:52, 1641.84 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 4%|▍ | 11000/290449 [00:10<01:44, 2661.99 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 4%|▍ | 13000/290449 [00:10<01:21, 3404.08 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 5%|▌ | 15000/290449 [00:10<01:07, 4101.46 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 8%|▊ |
+0: 23000/290449 [00:10<00:27, 9634.71 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 9%|▉ | 27000/290449 [00:11<00:21, 12258.64 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 12%|█▏ | 34000/290449 [00:11<00:13, 18957.47 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 14%|█▍ | 41000/290449 [00:11<00:09, 25116.32 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 17%|█▋ | 49000/290449 [00:11<00:07, 33467.07 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 19%|█▉ | 55000/290449 [00:11<00:06, 36362.44 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 21%|██ | 61000/290449 [00:11<00:06, 37929.95 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 23%|██▎ | 66000/290449 [00:11<00:05, 40072.06 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 24%|██▍ | 71000/290449 [00:11<00
+0: :06, 33671.80 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 26%|██▌ | 76000/290449 [00:12<00:06, 33393.67 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 28%|██▊ | 80000/290449 [00:12<00:06, 30105.93 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 29%|██▉ | 85000/290449 [00:12<00:06, 31678.33 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 31%|███▏ | 91000/290449 [00:12<00:05, 37398.19 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 33%|███▎ | 96000/290449 [00:12<00:05, 36817.33 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 35%|███▍ | 101000/290449 [00:12<00:05, 37372.23 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 36%|███▌ | 105000/290449 [00:12<00:05, 35776.81 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 38%|███▊ | 110000/290449 [00:
+0: 13<00:04, 39152.57 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 41%|████ | 119000/290449 [00:13<00:03, 50660.90 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 43%|████▎ | 125000/290449 [00:13<00:03, 50728.93 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 46%|████▌ | 133000/290449 [00:13<00:02, 55477.66 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 50%|████▉ | 144000/290449 [00:13<00:02, 65647.22 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 53%|█████▎ | 155000/290449 [00:13<00:01, 73832.19 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 56%|█████▌ | 163000/290449 [00:13<00:01, 74120.94 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 59%|█████▉ | 172000/290449 [00:13<00:01, 71201.32 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 62%|█�
+0: ��████▏ | 180000/290449 [00:14<00:01, 66816.76 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 64%|██████▍ | 187000/290449 [00:14<00:01, 58713.85 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 67%|██████▋ | 193539/290449 [00:15<00:07, 13034.93 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 68%|██████▊ | 198156/290449 [00:16<00:08, 11172.80 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 69%|██████▉ | 201747/290449 [00:16<00:07, 11333.11 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 71%|███████ | 204825/290449 [00:16<00:06, 12691.77 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 72%|███████▏ | 207903/290449 [00:17<00:06, 13665.36 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 72%|███████▏ | 210468/290449 [00:17<00:05, 14883.74 examples/
+0: s]
Add position_id column (Sample Packing) (num_proc=192): 74%|███████▎ | 213546/290449 [00:17<00:04, 17013.55 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 75%|███████▌ | 218676/290449 [00:17<00:03, 21439.94 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 76%|███████▋ | 221754/290449 [00:17<00:03, 21484.45 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 77%|███████▋ | 224832/290449 [00:17<00:03, 19974.78 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 78%|███████▊ | 227397/290449 [00:17<00:03, 20065.88 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 79%|███████▉ | 230474/290449 [00:18<00:03, 18805.57 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 80%|████████ | 233039/290449 [00:18<00:02, 19263.13 examples/s]
Add position_id column (Sample Packing) (num_proc=192):
+0: 82%|████████▏ | 237143/290449 [00:18<00:02, 22462.76 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 83%|████████▎ | 239708/290449 [00:18<00:02, 21513.18 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 84%|████████▍ | 245351/290449 [00:18<00:01, 29425.82 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 86%|████████▌ | 248942/290449 [00:18<00:01, 30212.40 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 87%|████████▋ | 254069/290449 [00:18<00:01, 34929.20 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 90%|████████▉ | 260731/290449 [00:18<00:00, 41536.13 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 91%|█████████▏| 265345/290449 [00:18<00:00, 42154.18 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 94%|█████████▍| 2740
+0: 59/290449 [00:19<00:00, 52481.73 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 96%|█████████▋| 280207/290449 [00:19<00:00, 47475.38 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 98%|█████████▊| 285329/290449 [00:19<00:00, 46971.34 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 100%|██████████| 290449/290449 [00:19<00:00, 35087.67 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 100%|██████████| 290449/290449 [00:20<00:00, 14390.52 examples/s]
+0:
Saving the dataset (0/192 shards): 0%| | 0/290449 [00:00, ? examples/s]
Saving the dataset (0/192 shards): 0%| | 1000/290449 [00:02<10:37, 453.92 examples/s]
Saving the dataset (1/192 shards): 11%|█ | 30513/290449 [00:02<09:32, 453.92 examples/s]
Saving the dataset (2/192 shards): 11%|█ | 32026/290449 [00:02<09:29, 453.92 examples/s]
Saving the dataset (3/192 shards): 16%|█▌ | 45539/290449 [00:02<08:59, 453.92 examples/s]
Saving the dataset (4/192 shards): 16%|█▌ | 47052/290449 [00:02<08:56, 453.92 examples/s]
Saving the dataset (5/192 shards): 16%|█▋ | 47565/290449 [00:02<08:55, 453.92 examples/s]
Saving the dataset (6/192 shards): 17%|█▋ | 50078/290449 [00:02<08:49, 453.92 examples/s]
Saving the dataset (7/192 shards): 18%|█▊ | 52591/290449 [00:02<08:44, 453.92 examples/s]
Saving the dataset (8/192 shards): 19%|█▊ | 54104/290449 [00:02<08:40, 453.92 examples/s]
Saving the dataset (9/192 sha
+0: rds): 19%|█▉ | 54617/290449 [00:02<08:39, 453.92 examples/s]
Saving the dataset (10/192 shards): 21%|██ | 61130/290449 [00:02<08:25, 453.92 examples/s]
Saving the dataset (11/192 shards): 21%|██ | 61643/290449 [00:02<08:24, 453.92 examples/s]
Saving the dataset (12/192 shards): 22%|██▏ | 65156/290449 [00:02<08:16, 453.92 examples/s]
Saving the dataset (13/192 shards): 23%|██▎ | 67669/290449 [00:02<08:10, 453.92 examples/s]
Saving the dataset (13/192 shards): 23%|██▎ | 68182/290449 [00:02<00:05, 41326.47 examples/s]
Saving the dataset (14/192 shards): 23%|██▎ | 68182/290449 [00:02<00:05, 41326.47 examples/s]
Saving the dataset (15/192 shards): 25%|██▍ | 71695/290449 [00:02<00:05, 41326.47 examples/s]
Saving the dataset (16/192 shards): 25%|██▍ | 72208/290449 [00:02<00:05, 41326.47 examples/s]
Saving the dataset (17/192 shards): 25%|██▌ | 73721/290449 [00:02<00:05, 41326.47 examples/s]
+0: Saving the dataset (18/192 shards): 26%|██▌ | 76234/290449 [00:02<00:05, 41326.47 examples/s]
Saving the dataset (19/192 shards): 27%|██▋ | 78747/290449 [00:02<00:05, 41326.47 examples/s]
Saving the dataset (20/192 shards): 27%|██▋ | 79260/290449 [00:02<00:05, 41326.47 examples/s]
Saving the dataset (21/192 shards): 28%|██▊ | 81773/290449 [00:02<00:05, 41326.47 examples/s]
Saving the dataset (22/192 shards): 29%|██▉ | 85286/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (23/192 shards): 30%|███ | 87799/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (24/192 shards): 32%|███▏ | 94312/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (25/192 shards): 33%|███▎ | 95338/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (26/192 shards): 33%|███▎ | 95338/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (27/192 shards): 33%|███▎
+0: | 96851/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (28/192 shards): 34%|███▎ | 97364/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (29/192 shards): 34%|███▍ | 98390/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (30/192 shards): 34%|███▍ | 98390/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (31/192 shards): 34%|███▍ | 98903/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (32/192 shards): 35%|███▍ | 100416/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (33/192 shards): 36%|███▋ | 105929/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (34/192 shards): 37%|███▋ | 107442/290449 [00:02<00:04, 41326.47 examples/s]
Saving the dataset (34/192 shards): 37%|███▋ | 108442/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (35/192 shards): 38%|███▊ | 110468/290449 [00:02<00:02, 66634
+0: .79 examples/s]
Saving the dataset (36/192 shards): 38%|███▊ | 111468/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (37/192 shards): 40%|███▉ | 115981/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (38/192 shards): 40%|████ | 117494/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (39/192 shards): 41%|████ | 119520/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (40/192 shards): 41%|████ | 119520/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (41/192 shards): 41%|████▏ | 120033/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (42/192 shards): 43%|████▎ | 124546/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (43/192 shards): 43%|████▎ | 125059/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (44/192 shards): 44%|████▎ | 126572/290449 [00:02<00:02, 66634.79 examples/s]
Saving the
+0: dataset (45/192 shards): 44%|████▍ | 128085/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (46/192 shards): 45%|████▍ | 130111/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (47/192 shards): 45%|████▍ | 130624/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (48/192 shards): 45%|████▍ | 130624/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (49/192 shards): 45%|████▌ | 131137/290449 [00:02<00:02, 66634.79 examples/s]
Saving the dataset (49/192 shards): 49%|████▊ | 141137/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (50/192 shards): 50%|████▉ | 144650/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (51/192 shards): 51%|█████ | 148163/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (52/192 shards): 51%|█████ | 148676/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (53/192
+0: shards): 52%|█████▏ | 151189/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (54/192 shards): 52%|█████▏ | 151702/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (55/192 shards): 53%|█████▎ | 155215/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (56/192 shards): 54%|█████▎ | 155728/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (57/192 shards): 54%|█████▍ | 157241/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (58/192 shards): 55%|█████▍ | 159267/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (59/192 shards): 55%|█████▍ | 159267/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (60/192 shards): 56%|█████▌ | 162780/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (61/192 shards): 56%|█████▌ | 163293/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (62/1
+0: 92 shards): 57%|█████▋ | 165806/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (63/192 shards): 58%|█████▊ | 168319/290449 [00:02<00:01, 86510.55 examples/s]
Saving the dataset (63/192 shards): 58%|█████▊ | 168832/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (64/192 shards): 58%|█████▊ | 168832/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (65/192 shards): 58%|█████▊ | 169345/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (66/192 shards): 59%|█████▉ | 171371/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (67/192 shards): 59%|█████▉ | 171371/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (68/192 shards): 59%|█████▉ | 171884/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (69/192 shards): 60%|█████▉ | 173910/290449 [00:02<00:01, 106577.04 examples/s]
Saving the data
+0: set (70/192 shards): 60%|█████▉ | 173910/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (71/192 shards): 60%|██████ | 174423/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (72/192 shards): 61%|██████ | 176936/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (73/192 shards): 61%|██████ | 177449/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (74/192 shards): 61%|██████▏ | 177962/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (75/192 shards): 62%|██████▏ | 178988/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (76/192 shards): 62%|██████▏ | 179988/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (77/192 shards): 62%|██████▏ | 180501/290449 [00:02<00:01, 106577.04 examples/s]
Saving the dataset (78/192 shards): 62%|██████▏ | 181014/290449 [00:02<00:01, 106577.04 exampl
+0: es/s]
Saving the dataset (79/192 shards): 64%|██████▎ | 184527/290449 [00:02<00:00, 106577.04 examples/s]
Saving the dataset (80/192 shards): 64%|██████▍ | 187040/290449 [00:02<00:00, 106577.04 examples/s]
Saving the dataset (81/192 shards): 65%|██████▍ | 188066/290449 [00:02<00:00, 106577.04 examples/s]
Saving the dataset (82/192 shards): 65%|██████▍ | 188579/290449 [00:02<00:00, 106577.04 examples/s]
Saving the dataset (83/192 shards): 65%|██████▍ | 188579/290449 [00:02<00:00, 106577.04 examples/s]
Saving the dataset (84/192 shards): 65%|██████▌ | 190092/290449 [00:02<00:00, 106577.04 examples/s]
Saving the dataset (85/192 shards): 66%|██████▋ | 192605/290449 [00:02<00:00, 106577.04 examples/s]
Saving the dataset (85/192 shards): 67%|██████▋ | 195118/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (86/192 shards): 67%|██████▋ | 195631/290449 [
+0: 00:02<00:00, 128094.20 examples/s]
Saving the dataset (87/192 shards): 67%|██████▋ | 195631/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (88/192 shards): 68%|██████▊ | 197144/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (89/192 shards): 69%|██████▉ | 201657/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (90/192 shards): 70%|██████▉ | 202170/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (91/192 shards): 70%|███████ | 203683/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (92/192 shards): 71%|███████▏ | 207196/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (93/192 shards): 72%|███████▏ | 208709/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (94/192 shards): 73%|███████▎ | 211222/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (95/192 shards): 75%|██
+0: █████▍ | 217248/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (96/192 shards): 75%|███████▍ | 217248/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (97/192 shards): 75%|███████▌ | 218761/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (98/192 shards): 77%|███████▋ | 224274/290449 [00:02<00:00, 128094.20 examples/s]
Saving the dataset (98/192 shards): 78%|███████▊ | 225274/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (99/192 shards): 78%|███████▊ | 227300/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (100/192 shards): 78%|███████▊ | 227300/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (101/192 shards): 79%|███████▉ | 228813/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (102/192 shards): 79%|███████▉ | 230326/290449 [00:02<00:00, 156727.22 examples/
+0: s]
Saving the dataset (103/192 shards): 81%|████████ | 233865/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (104/192 shards): 81%|████████ | 233865/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (105/192 shards): 81%|████████ | 233865/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (106/192 shards): 81%|████████ | 235404/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (107/192 shards): 81%|████████ | 235404/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (108/192 shards): 81%|████████ | 235404/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (109/192 shards): 81%|████████ | 235917/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (110/192 shards): 82%|████████▏ | 238943/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (111/192 shards): 82%|█████�
+0: �██▏ | 238943/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (112/192 shards): 83%|████████▎ | 241969/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (113/192 shards): 83%|████████▎ | 241969/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (114/192 shards): 83%|████████▎ | 242482/290449 [00:02<00:00, 156727.22 examples/s]
Saving the dataset (115/192 shards): 84%|████████▍ | 243508/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (116/192 shards): 84%|████████▍ | 243508/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (117/192 shards): 84%|████████▍ | 244021/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (118/192 shards): 84%|████████▍ | 244534/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (119/192 shards): 85%|████████▍ | 246047/290449 [00:03<00:00, 156727.
+0: 22 examples/s]
Saving the dataset (120/192 shards): 85%|████████▍ | 246560/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (121/192 shards): 85%|████████▌ | 247073/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (122/192 shards): 86%|████████▌ | 248586/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (123/192 shards): 86%|████████▌ | 249099/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (124/192 shards): 86%|████████▌ | 249612/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (125/192 shards): 86%|████████▌ | 250124/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (126/192 shards): 86%|████████▋ | 251150/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (127/192 shards): 86%|████████▋ | 251150/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (128/192 shard
+0: s): 87%|████████▋ | 252663/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (129/192 shards): 87%|████████▋ | 253175/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (130/192 shards): 87%|████████▋ | 253688/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (131/192 shards): 88%|████████▊ | 254201/290449 [00:03<00:00, 156727.22 examples/s]
Saving the dataset (131/192 shards): 88%|████████▊ | 254714/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (132/192 shards): 88%|████████▊ | 254714/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (133/192 shards): 88%|████████▊ | 255226/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (134/192 shards): 88%|████████▊ | 255739/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (135/192 shards): 88%|████████▊ | 256252/29
+0: 0449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (136/192 shards): 89%|████████▊ | 257278/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (137/192 shards): 89%|████████▊ | 257278/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (138/192 shards): 89%|████████▉ | 258791/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (139/192 shards): 90%|████████▉ | 260329/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (140/192 shards): 90%|████████▉ | 261329/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (141/192 shards): 90%|████████▉ | 261329/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (142/192 shards): 90%|█████████ | 261842/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (143/192 shards): 90%|█████████ | 262354/290449 [00:03<00:00, 182681.32 examples/s]
Saving
+0: the dataset (144/192 shards): 91%|█████████▏| 265378/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (145/192 shards): 91%|█████████▏| 265378/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (146/192 shards): 92%|█████████▏| 265890/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (147/192 shards): 92%|█████████▏| 266403/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (148/192 shards): 92%|█████████▏| 266915/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (149/192 shards): 92%|█████████▏| 267427/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (150/192 shards): 92%|█████████▏| 267940/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (151/192 shards): 92%|█████████▏| 268452/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (152/192 shards): 9
+0: 3%|█████████▎| 269476/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (153/192 shards): 93%|█████████▎| 270500/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (154/192 shards): 93%|█████████▎| 270500/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (155/192 shards): 93%|█████████▎| 270500/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (156/192 shards): 93%|█████████▎| 271525/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (157/192 shards): 93%|█████████▎| 271525/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (158/192 shards): 94%|█████████▎| 272038/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (159/192 shards): 94%|█████████▍| 273063/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (160/192 shards): 94%|█████████�
+0: �| 273575/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (161/192 shards): 94%|█████████▍| 274088/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (162/192 shards): 94%|█████████▍| 274088/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (163/192 shards): 95%|█████████▍| 275113/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (164/192 shards): 95%|█████████▍| 275113/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (165/192 shards): 95%|█████████▍| 275625/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (166/192 shards): 95%|█████████▌| 276649/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (167/192 shards): 96%|█████████▌| 277673/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (168/192 shards): 96%|█████████▌| 279185/290449 [00:03<00:00, 1
+0: 82681.32 examples/s]
Saving the dataset (169/192 shards): 96%|█████████▌| 279185/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (170/192 shards): 96%|█████████▌| 279185/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (171/192 shards): 96%|█████████▋| 279697/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (172/192 shards): 96%|█████████▋| 280209/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (173/192 shards): 97%|█████████▋| 280721/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (174/192 shards): 97%|█████████▋| 281233/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (175/192 shards): 97%|█████████▋| 282769/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (176/192 shards): 97%|█████████▋| 282769/290449 [00:03<00:00, 182681.32 examples/s]
Saving the
+0: dataset (177/192 shards): 97%|█████████▋| 282769/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (178/192 shards): 98%|█████████▊| 283793/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (179/192 shards): 98%|█████████▊| 283793/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (180/192 shards): 98%|█████████▊| 284305/290449 [00:03<00:00, 182681.32 examples/s]
Saving the dataset (180/192 shards): 98%|█████████▊| 284817/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (181/192 shards): 98%|█████████▊| 284817/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (182/192 shards): 98%|█████████▊| 285841/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (183/192 shards): 98%|█████████▊| 285841/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (184/192 shards): 99%|�
+0: ��████████▉| 286865/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (185/192 shards): 99%|█████████▉| 286865/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (186/192 shards): 99%|█████████▉| 287377/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (187/192 shards): 99%|█████████▉| 287889/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (188/192 shards): 99%|█████████▉| 288401/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (189/192 shards): 99%|█████████▉| 288913/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (190/192 shards): 100%|█████████▉| 289425/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (191/192 shards): 100%|█████████▉| 289937/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (192/192 shards): 100%|██████████| 2
+0: 90449/290449 [00:03<00:00, 207417.75 examples/s]
Saving the dataset (192/192 shards): 100%|██████████| 290449/290449 [00:03<00:00, 88496.31 examples/s]
+0: [2025-09-02 19:43:59,224] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:3622631] [RANK:0] gather_len_batches: [156401, 156323, 156360, 156342, 156327, 156381, 156371, 156340, 156377, 156377, 156382, 156350, 156392, 156337, 156379, 156365][39m
+0: [2025-09-02 19:43:59,634] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:495] [PID:3622631] [RANK:0] sample_packing_eff_est across ranks: [0.977264940738678, 0.977090060710907, 0.9777962565422058, 0.9773836731910706, 0.9768965244293213, 0.9772837162017822, 0.9774523973464966, 0.9769526720046997, 0.9774523973464966, 0.9775961637496948, 0.9775086641311646, 0.9766343832015991, 0.9773149490356445, 0.9774774312973022, 0.9774898886680603, 0.9774399399757385][39m
+0: [2025-09-02 19:43:59,650] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:127] [PID:3622631] [RANK:0] Maximum number of steps set at 9770[39m
+0: [2025-09-02 19:44:00,072] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:110] [PID:3622631] [RANK:0] Patched Trainer.evaluation_loop with nanmean loss calculation[39m
+0: [2025-09-02 19:44:00,073] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:164] [PID:3622631] [RANK:0] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation[39m
+0: [2025-09-02 19:44:05,320] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:3622631] [RANK:0] Converting modules to torch.bfloat16[39m
+0: [2025-09-02 19:44:42,778] [INFO] [axolotl.train.save_initial_configs:416] [PID:3622631] [RANK:0] Pre-saving tokenizer to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1...[39m
+0: [2025-09-02 19:44:43,248] [INFO] [axolotl.train.save_initial_configs:419] [PID:3622631] [RANK:0] Pre-saving model config to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1...[39m
+0: [2025-09-02 19:44:43,258] [INFO] [axolotl.train.execute_training:203] [PID:3622631] [RANK:0] Starting trainer...[39m
+0: [2025-09-02 19:55:57,577] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:3622631] [RANK:0] gather_len_batches: [156401, 156401, 156401, 156401, 156401, 156401, 156401, 156401, 156401, 156401, 156401, 156401, 156401, 156401, 156401, 156401][39m
+0: Parameter Offload - Persistent parameters statistics: param_count = 121, numel = 71552
+0: {'loss': 0.7713, 'grad_norm': 0.6373713397101597, 'learning_rate': 3.08e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.0}
+0:
0%| | 0/9770 [00:00, ?it/s]
0%| | 1/9770 [03:13<523:53:08, 193.06s/it]
0%| | 2/9770 [03:17<222:37:50, 82.05s/it]
0%| | 3/9770 [03:18<122:21:38, 45.10s/it]
0%| | 4/9770 [03:19<75:14:46, 27.74s/it]
0%| | 5/9770 [03:20<49:08:31, 18.12s/it]
0%| | 6/9770 [03:21<33:02:00, 12.18s/it]
0%| | 7/9770 [03:22<22:48:10, 8.41s/it]
0%| | 8/9770 [03:22<16:06:36, 5.94s/it]
0%| | 9/9770 [03:23<11:37:08, 4.29s/it]
0%| | 10/9770 [03:23<8:35:23, 3.17s/it]
0%| | 10/9770 [03:23<8:35:23, 3.17s/it]
0%| | 11/9770 [03:24<6:30:35, 2.40s/it]
0%| | 12/9770 [03:25<5:03:26, 1.87s/it]
0%| | 13/9770 [03:25<4:03:13, 1.50s/it]
0%| | 14/9770 [03:26<3:21:51, 1.24s/it]
0%| | 15/9770 [03:27<2:53:37, 1.07s/it]
0%| | 16/9770 [03:27<2:33:58, 1.06it/s]
0%| | 17/9770 [03:28<2:20:02
+0: {'loss': 0.7882, 'grad_norm': 0.6298029794696478, 'learning_rate': 4.2800000000000005e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.0}
+0: {'loss': 0.777, 'grad_norm': 0.6642386775767615, 'learning_rate': 5.480000000000001e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.0}
+0: , 1.16it/s]
0%| | 18/9770 [03:29<2:09:06, 1.26it/s]
0%| | 19/9770 [03:29<2:04:28, 1.31it/s]
0%| | 20/9770 [03:30<1:59:48, 1.36it/s]
0%| | 20/9770 [03:30<1:59:48, 1.36it/s]
0%| | 21/9770 [03:31<1:55:50, 1.40it/s]
0%| | 22/9770 [03:31<1:52:59, 1.44it/s]
0%| | 23/9770 [03:32<1:50:05, 1.48it/s]
0%| | 24/9770 [03:33<1:49:35, 1.48it/s]
0%| | 25/9770 [03:33<1:48:23, 1.50it/s]
0%| | 26/9770 [03:34<1:48:06, 1.50it/s]
0%| | 27/9770 [03:35<1:47:15, 1.51it/s]
0%| | 28/9770 [03:35<1:47:53, 1.50it/s]
0%| | 29/9770 [03:36<1:49:09, 1.49it/s]
0%| | 30/9770 [03:37<1:50:50, 1.46it/s]
0%| | 30/9770 [03:37<1:50:50, 1.46it/s]
0%| | 31/9770 [03:37<1:49:52, 1.48it/s]
0%| | 32/9770 [03:38<1:48:38, 1.49it/s]
0%| | 33/9
+0: {'loss': 0.7812, 'grad_norm': 0.6098133550634027, 'learning_rate': 6.680000000000001e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.0}
+0: 770 [03:39<1:48:19, 1.50it/s]
0%| | 34/9770 [03:39<1:46:37, 1.52it/s]
0%| | 35/9770 [03:40<1:45:57, 1.53it/s]
0%| | 36/9770 [03:41<1:47:10, 1.51it/s]
0%| | 37/9770 [03:41<1:47:26, 1.51it/s]
0%| | 38/9770 [03:42<1:48:45, 1.49it/s]
0%| | 39/9770 [03:43<1:49:42, 1.48it/s]
0%| | 40/9770 [03:43<1:48:37, 1.49it/s]
0%| | 40/9770 [03:43<1:48:37, 1.49it/s]
0%| | 41/9770 [03:44<1:46:05, 1.53it/s]
0%| | 42/9770 [03:45<1:46:21, 1.52it/s]
0%| | 43/9770 [03:45<1:46:00, 1.53it/s]
0%| | 44/9770 [03:46<1:45:19, 1.54it/s]
0%| | 45/9770 [03:47<1:45:19, 1.54it/s]
0%| | 46/9770 [03:47<1:46:05, 1.53it/s]
0%| | 47/9770 [03:48<1:46:53, 1.52it/s]
0%| | 48/9770 [03:49<1:45:56, 1.53it/s]
1%| | 49/9770 [03:49<1:44:58, 1.54it/s]
1%| | 50/9770 [03:50<1:44:52, 1.54it/s]
+0: {'loss': 0.7572, 'grad_norm': 0.698355356840049, 'learning_rate': 7.88e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.01}
+0: {'loss': 0.7607, 'grad_norm': 0.6345132113651546, 'learning_rate': 9.08e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.01}
+0:
1%| | 50/9770 [03:50<1:44:52, 1.54it/s]
1%| | 51/9770 [03:50<1:43:56, 1.56it/s]
1%| | 52/9770 [03:51<1:44:15, 1.55it/s]
1%| | 53/9770 [03:52<1:45:22, 1.54it/s]
1%| | 54/9770 [03:52<1:44:45, 1.55it/s]
1%| | 55/9770 [03:53<1:44:59, 1.54it/s]
1%| | 56/9770 [03:54<1:45:54, 1.53it/s]
1%| | 57/9770 [03:54<1:45:31, 1.53it/s]
1%| | 58/9770 [03:55<1:47:10, 1.51it/s]
1%| | 59/9770 [03:56<1:46:09, 1.52it/s]
1%| | 60/9770 [03:56<1:45:38, 1.53it/s]
1%| | 60/9770 [03:56<1:45:38, 1.53it/s]
1%| | 61/9770 [03:57<1:45:32, 1.53it/s]
1%| | 62/9770 [03:58<1:45:41, 1.53it/s]
1%| | 63/9770 [03:58<1:45:57, 1.53it/s]
1%| | 64/9770 [03:59<1:46:32, 1.52it/s]
1%| | 65/9770 [04:00<1:45:44, 1.53it/s]
1%| | 66/9770 [04:00<1:46:3
+0: {'loss': 0.7718, 'grad_norm': 0.7738322924474639, 'learning_rate': 1.0280000000000002e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.01}
+0: {'loss': 0.7649, 'grad_norm': 0.6784661053627246, 'learning_rate': 1.148e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.01}
+0: 6, 1.52it/s]
1%| | 67/9770 [04:01<1:45:18, 1.54it/s]
1%| | 68/9770 [04:02<1:45:24, 1.53it/s]
1%| | 69/9770 [04:02<1:44:32, 1.55it/s]
1%| | 70/9770 [04:03<1:44:56, 1.54it/s]
1%| | 70/9770 [04:03<1:44:56, 1.54it/s]
1%| | 71/9770 [04:04<1:47:32, 1.50it/s]
1%| | 72/9770 [04:04<1:47:37, 1.50it/s]
1%| | 73/9770 [04:05<1:47:36, 1.50it/s]
1%| | 74/9770 [04:06<1:46:59, 1.51it/s]
1%| | 75/9770 [04:06<1:46:14, 1.52it/s]
1%| | 76/9770 [04:07<1:45:41, 1.53it/s]
1%| | 77/9770 [04:08<1:44:49, 1.54it/s]
1%| | 78/9770 [04:08<1:47:30, 1.50it/s]
1%| | 79/9770 [04:09<1:47:08, 1.51it/s]
1%| | 80/9770 [04:10<1:47:15, 1.51it/s]
1%| | 80/9770 [04:10<1:47:15, 1.51it/s]
1%| | 81/9770 [04:10<1:46:25, 1.52it/s]
1%| | 82/
+0: {'loss': 0.7384, 'grad_norm': 0.6481618613494908, 'learning_rate': 1.268e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.01}
+0: 9770 [04:11<1:46:26, 1.52it/s]
1%| | 83/9770 [04:12<1:47:32, 1.50it/s]
1%| | 84/9770 [04:12<1:46:36, 1.51it/s]
1%| | 85/9770 [04:13<1:47:51, 1.50it/s]
1%| | 86/9770 [04:14<1:48:17, 1.49it/s]
1%| | 87/9770 [04:14<1:48:14, 1.49it/s]
1%| | 88/9770 [04:15<1:48:52, 1.48it/s]
1%| | 89/9770 [04:16<1:47:43, 1.50it/s]
1%| | 90/9770 [04:16<1:46:32, 1.51it/s]
1%| | 90/9770 [04:16<1:46:32, 1.51it/s]
1%| | 91/9770 [04:17<1:46:26, 1.52it/s]
1%| | 92/9770 [04:17<1:46:20, 1.52it/s]
1%| | 93/9770 [04:18<1:48:33, 1.49it/s]
1%| | 94/9770 [04:19<1:47:53, 1.49it/s]
1%| | 95/9770 [04:20<1:46:50, 1.51it/s]
1%| | 96/9770 [04:20<1:45:49, 1.52it/s]
1%| | 97/9770 [04:21<1:48:30, 1.49it/s]
1%| | 98/9770 [04:22<1:48:17, 1.49it/s]
1%| | 99/9770 [04:22<1:47:20, 1.50it/s]
1
+0: {'loss': 0.7464, 'grad_norm': 0.7340683217536735, 'learning_rate': 1.3880000000000003e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.01}
+0: {'loss': 0.7763, 'grad_norm': 0.8330098034721529, 'learning_rate': 1.5080000000000001e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.01}
+0: %| | 100/9770 [04:23<1:47:08, 1.50it/s]
1%| | 100/9770 [04:23<1:47:08, 1.50it/s]
1%| | 101/9770 [04:24<1:47:13, 1.50it/s]
1%| | 102/9770 [04:24<1:46:01, 1.52it/s]
1%| | 103/9770 [04:25<1:45:52, 1.52it/s]
1%| | 104/9770 [04:25<1:45:07, 1.53it/s]
1%| | 105/9770 [04:26<1:44:25, 1.54it/s]
1%| | 106/9770 [04:27<1:45:38, 1.52it/s]
1%| | 107/9770 [04:27<1:47:24, 1.50it/s]
1%| | 108/9770 [04:28<1:47:13, 1.50it/s]
1%| | 109/9770 [04:29<1:45:34, 1.53it/s]
1%| | 110/9770 [04:29<1:44:52, 1.54it/s]
1%| | 110/9770 [04:29<1:44:52, 1.54it/s]
1%| | 111/9770 [04:30<1:45:36, 1.52it/s]
1%| | 112/9770 [04:31<1:46:35, 1.51it/s]
1%| | 113/9770 [04:31<1:45:15, 1.53it/s]
1%| | 114/9770 [04:32<1:45:39, 1.52it/s]
1%| | 1
+0: {'loss': 0.7715, 'grad_norm': 0.7542528426325983, 'learning_rate': 1.628e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.01}
+0: {'loss': 0.7613, 'grad_norm': 0.6839745730828016, 'learning_rate': 1.7480000000000002e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.01}
+0: 15/9770 [04:33<1:45:48, 1.52it/s]
1%| | 116/9770 [04:33<1:45:03, 1.53it/s]
1%| | 117/9770 [04:34<1:44:45, 1.54it/s]
1%| | 118/9770 [04:35<1:44:54, 1.53it/s]
1%| | 119/9770 [04:35<1:47:23, 1.50it/s]
1%| | 120/9770 [04:36<1:46:56, 1.50it/s]
1%| | 120/9770 [04:36<1:46:56, 1.50it/s]
1%| | 121/9770 [04:37<1:47:02, 1.50it/s]
1%| | 122/9770 [04:37<1:46:52, 1.50it/s]
1%|▏ | 123/9770 [04:38<1:45:42, 1.52it/s]
1%|▏ | 124/9770 [04:39<1:45:38, 1.52it/s]
1%|▏ | 125/9770 [04:39<1:45:58, 1.52it/s]
1%|▏ | 126/9770 [04:40<1:45:25, 1.52it/s]
1%|▏ | 127/9770 [04:41<1:47:00, 1.50it/s]
1%|▏ | 128/9770 [04:41<1:46:53, 1.50it/s]
1%|▏ | 129/9770 [04:42<1:46:26, 1.51it/s]
1%|▏ | 130/9770 [04:43<1:45:53, 1.52it/s]
1%|▏
+0: {'loss': 0.7713, 'grad_norm': 0.7564292058760052, 'learning_rate': 1.868e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.01}
+0: | 130/9770 [04:43<1:45:53, 1.52it/s]
1%|▏ | 131/9770 [04:43<1:46:42, 1.51it/s]
1%|▏ | 132/9770 [04:44<1:45:11, 1.53it/s]
1%|▏ | 133/9770 [04:45<1:45:09, 1.53it/s]
1%|▏ | 134/9770 [04:45<1:44:53, 1.53it/s]
1%|▏ | 135/9770 [04:46<1:46:01, 1.51it/s]
1%|▏ | 136/9770 [04:47<1:46:07, 1.51it/s]
1%|▏ | 137/9770 [04:47<1:46:17, 1.51it/s]
1%|▏ | 138/9770 [04:48<1:46:06, 1.51it/s]
1%|▏ | 139/9770 [04:49<1:46:02, 1.51it/s]
1%|▏ | 140/9770 [04:49<1:46:28, 1.51it/s]
1%|▏ | 140/9770 [04:49<1:46:28, 1.51it/s]
1%|▏ | 141/9770 [04:50<1:46:15, 1.51it/s]
1%|▏ | 142/9770 [04:50<1:44:42, 1.53it/s]
1%|▏ | 143/9770 [04:51<1:44:43, 1.53it/s]
1%|▏ | 144/9770 [04:52<1:45:48, 1.52it/s]
1%|▏ | 145/9770 [04:52<1:44:41, 1.53it/s]
1%|▏ | 146/9770 [04:53<1:43:52, 1.54it/
+0: {'loss': 0.7643, 'grad_norm': 0.7226619797998896, 'learning_rate': 1.9880000000000003e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.02}
+0: {'loss': 0.7492, 'grad_norm': 0.7613324127555616, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.02}
+0: s]
2%|▏ | 147/9770 [04:54<1:44:20, 1.54it/s]
2%|▏ | 148/9770 [04:54<1:43:59, 1.54it/s]
2%|▏ | 149/9770 [04:55<1:44:02, 1.54it/s]
2%|▏ | 150/9770 [04:56<1:44:07, 1.54it/s]
2%|▏ | 150/9770 [04:56<1:44:07, 1.54it/s]
2%|▏ | 151/9770 [04:56<1:44:38, 1.53it/s]
2%|▏ | 152/9770 [04:57<1:46:01, 1.51it/s]
2%|▏ | 153/9770 [04:58<1:44:37, 1.53it/s]
2%|▏ | 154/9770 [04:58<1:47:00, 1.50it/s]
2%|▏ | 155/9770 [04:59<1:47:45, 1.49it/s]
2%|▏ | 156/9770 [05:00<1:45:51, 1.51it/s]
2%|▏ | 157/9770 [05:00<1:45:55, 1.51it/s]
2%|▏ | 158/9770 [05:01<1:46:08, 1.51it/s]
2%|▏ | 159/9770 [05:02<1:45:53, 1.51it/s]
2%|▏ | 160/9770 [05:02<1:46:13, 1.51it/s]
2%|▏ | 160/9770 [05:02<1:46:13, 1.51it/s]
2%|▏ | 161/9770 [05:0
+0: {'loss': 0.7834, 'grad_norm': 0.7665650060528073, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.02}
+0: 3<1:45:40, 1.52it/s]
2%|▏ | 162/9770 [05:04<1:45:51, 1.51it/s]
2%|▏ | 163/9770 [05:04<1:43:51, 1.54it/s]
2%|▏ | 164/9770 [05:05<1:44:18, 1.53it/s]
2%|▏ | 165/9770 [05:06<1:46:11, 1.51it/s]
2%|▏ | 166/9770 [05:06<1:45:37, 1.52it/s]
2%|▏ | 167/9770 [05:07<1:45:49, 1.51it/s]
2%|▏ | 168/9770 [05:08<1:45:30, 1.52it/s]
2%|▏ | 169/9770 [05:08<1:46:01, 1.51it/s]
2%|▏ | 170/9770 [05:09<1:44:34, 1.53it/s]
2%|▏ | 170/9770 [05:09<1:44:34, 1.53it/s]
2%|▏ | 171/9770 [05:10<1:44:35, 1.53it/s]
2%|▏ | 172/9770 [05:10<1:44:52, 1.53it/s]
2%|▏ | 173/9770 [05:11<1:45:12, 1.52it/s]
2%|▏ | 174/9770 [05:12<1:45:12, 1.52it/s]
2%|▏ | 175/9770 [05:12<1:45:20, 1.52it/s]
2%|▏ | 176/9770 [05:13<1:45:43, 1.51it/s]
2%|▏ | 177/9770 [05:14<1:45:29, 1.52it/s]
2%|▏
+0: {'loss': 0.7583, 'grad_norm': 0.7789047641431397, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.02}
+0: {'loss': 0.7455, 'grad_norm': 0.7569259235834734, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.02}
+0: | 178/9770 [05:14<1:45:48, 1.51it/s]
2%|▏ | 179/9770 [05:15<1:43:56, 1.54it/s]
2%|▏ | 180/9770 [05:15<1:45:00, 1.52it/s]
2%|▏ | 180/9770 [05:15<1:45:00, 1.52it/s]
2%|▏ | 181/9770 [05:16<1:45:35, 1.51it/s]
2%|▏ | 182/9770 [05:17<1:47:30, 1.49it/s]
2%|▏ | 183/9770 [05:18<1:47:09, 1.49it/s]
2%|▏ | 184/9770 [05:18<1:46:29, 1.50it/s]
2%|▏ | 185/9770 [05:19<1:45:22, 1.52it/s]
2%|▏ | 186/9770 [05:19<1:44:54, 1.52it/s]
2%|▏ | 187/9770 [05:20<1:45:28, 1.51it/s]
2%|▏ | 188/9770 [05:21<1:45:36, 1.51it/s]
2%|▏ | 189/9770 [05:21<1:44:47, 1.52it/s]
2%|▏ | 190/9770 [05:22<1:44:15, 1.53it/s]
2%|▏ | 190/9770 [05:22<1:44:15, 1.53it/s]
2%|▏ | 191/9770 [05:23<1:43:28, 1.54it/s]
2%|▏ | 192/9770 [05:23<1:43:37, 1.54
+0: {'loss': 0.7728, 'grad_norm': 0.7471740052827762, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.02}
+0: it/s]
2%|▏ | 193/9770 [05:24<1:44:20, 1.53it/s]
2%|▏ | 194/9770 [05:25<1:45:16, 1.52it/s]
2%|▏ | 195/9770 [05:25<1:44:34, 1.53it/s]
2%|▏ | 196/9770 [05:26<1:44:41, 1.52it/s]
2%|▏ | 197/9770 [05:27<1:45:15, 1.52it/s]
2%|▏ | 198/9770 [05:27<1:46:12, 1.50it/s]
2%|▏ | 199/9770 [05:28<1:46:14, 1.50it/s]
2%|▏ | 200/9770 [05:29<1:47:07, 1.49it/s]
2%|▏ | 200/9770 [05:29<1:47:07, 1.49it/s]
2%|▏ | 201/9770 [05:29<1:45:58, 1.51it/s]
2%|▏ | 202/9770 [05:30<1:45:16, 1.51it/s]
2%|▏ | 203/9770 [05:31<1:44:31, 1.53it/s]
2%|▏ | 204/9770 [05:31<1:44:06, 1.53it/s]
2%|▏ | 205/9770 [05:32<1:44:54, 1.52it/s]
2%|▏ | 206/9770 [05:33<1:44:03, 1.53it/s]
2%|▏ | 207/9770 [05:33<1:44:38, 1.52it/s]
2%|▏ | 208/9770 [05:34<1:44:41, 1.52it/s]
2%|▏ | 209/9770 [
+0: {'loss': 0.7572, 'grad_norm': 0.7374676617758451, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.02}
+0: {'loss': 0.7718, 'grad_norm': 0.7629597525187471, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.02}
+0: 05:35<1:44:24, 1.53it/s]
2%|▏ | 210/9770 [05:35<1:46:35, 1.49it/s]
2%|▏ | 210/9770 [05:35<1:46:35, 1.49it/s]
2%|▏ | 211/9770 [05:36<1:47:22, 1.48it/s]
2%|▏ | 212/9770 [05:37<1:46:53, 1.49it/s]
2%|▏ | 213/9770 [05:37<1:46:07, 1.50it/s]
2%|▏ | 214/9770 [05:38<1:46:15, 1.50it/s]
2%|▏ | 215/9770 [05:39<1:45:28, 1.51it/s]
2%|▏ | 216/9770 [05:39<1:45:13, 1.51it/s]
2%|▏ | 217/9770 [05:40<1:45:21, 1.51it/s]
2%|▏ | 218/9770 [05:41<1:45:20, 1.51it/s]
2%|▏ | 219/9770 [05:41<1:45:47, 1.50it/s]
2%|▏ | 220/9770 [05:42<1:45:29, 1.51it/s]
2%|▏ | 220/9770 [05:42<1:45:29, 1.51it/s]
2%|▏ | 221/9770 [05:43<1:45:35, 1.51it/s]
2%|▏ | 222/9770 [05:43<1:44:53, 1.52it/s]
2%|▏ | 223/9770 [05:44<1:44:01, 1.53it/s]
2%|▏
+0: {'loss': 0.7598, 'grad_norm': 0.7531879365781883, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.02}
+0: | 224/9770 [05:45<1:44:14, 1.53it/s]
2%|▏ | 225/9770 [05:45<1:44:57, 1.52it/s]
2%|▏ | 226/9770 [05:46<1:44:04, 1.53it/s]
2%|▏ | 227/9770 [05:47<1:44:43, 1.52it/s]
2%|▏ | 228/9770 [05:47<1:45:09, 1.51it/s]
2%|▏ | 229/9770 [05:48<1:45:40, 1.50it/s]
2%|▏ | 230/9770 [05:49<1:44:52, 1.52it/s]
2%|▏ | 230/9770 [05:49<1:44:52, 1.52it/s]
2%|▏ | 231/9770 [05:49<1:45:24, 1.51it/s]
2%|▏ | 232/9770 [05:50<1:45:46, 1.50it/s]
2%|▏ | 233/9770 [05:51<1:44:50, 1.52it/s]
2%|▏ | 234/9770 [05:51<1:46:55, 1.49it/s]
2%|▏ | 235/9770 [05:52<1:45:40, 1.50it/s]
2%|▏ | 236/9770 [05:52<1:44:14, 1.52it/s]
2%|▏ | 237/9770 [05:53<1:43:38, 1.53it/s]
2%|▏ | 238/9770 [05:54<1:44:38, 1.52it/s]
2%|▏ | 239/9770 [05:55<1:46:53, 1.49it/s]
2%|▏ | 240/9770 [05:55<1:46:19,
+0: {'loss': 0.7319, 'grad_norm': 0.6819478363087886, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.02}
+0: {'loss': 0.7609, 'grad_norm': 0.7426229652245786, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.03}
+0: 1.49it/s]
2%|▏ | 240/9770 [05:55<1:46:19, 1.49it/s]
2%|▏ | 241/9770 [05:56<1:45:44, 1.50it/s]
2%|▏ | 242/9770 [05:56<1:45:18, 1.51it/s]
2%|▏ | 243/9770 [05:57<1:44:44, 1.52it/s]
2%|▏ | 244/9770 [05:58<1:44:06, 1.53it/s]
3%|▎ | 245/9770 [05:58<1:45:15, 1.51it/s]
3%|▎ | 246/9770 [05:59<1:45:14, 1.51it/s]
3%|▎ | 247/9770 [06:00<1:44:48, 1.51it/s]
3%|▎ | 248/9770 [06:00<1:45:46, 1.50it/s]
3%|▎ | 249/9770 [06:01<1:46:07, 1.50it/s]
3%|▎ | 250/9770 [06:02<1:46:26, 1.49it/s]
3%|▎ | 250/9770 [06:02<1:46:26, 1.49it/s]
3%|▎ | 251/9770 [06:02<1:46:32, 1.49it/s]
3%|▎ | 252/9770 [06:03<1:45:56, 1.50it/s]
3%|▎ | 253/9770 [06:04<1:45:52, 1.50it/s]
3%|▎ | 254/9770 [06:04<1:44:43, 1.51it/s]
3%|▎ | 255/977
+0: {'loss': 0.7616, 'grad_norm': 0.751664873395375, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.03}
+0: {'loss': 0.7657, 'grad_norm': 0.7442777883826834, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.03}
+0: 0 [06:05<1:43:53, 1.53it/s]
3%|▎ | 256/9770 [06:06<1:44:15, 1.52it/s]
3%|▎ | 257/9770 [06:06<1:45:15, 1.51it/s]
3%|▎ | 258/9770 [06:07<1:45:19, 1.51it/s]
3%|▎ | 259/9770 [06:08<1:45:08, 1.51it/s]
3%|▎ | 260/9770 [06:08<1:45:02, 1.51it/s]
3%|▎ | 260/9770 [06:08<1:45:02, 1.51it/s]
3%|▎ | 261/9770 [06:09<1:44:13, 1.52it/s]
3%|▎ | 262/9770 [06:10<1:43:43, 1.53it/s]
3%|▎ | 263/9770 [06:10<1:46:36, 1.49it/s]
3%|▎ | 264/9770 [06:11<1:45:26, 1.50it/s]
3%|▎ | 265/9770 [06:12<1:45:30, 1.50it/s]
3%|▎ | 266/9770 [06:12<1:44:44, 1.51it/s]
3%|▎ | 267/9770 [06:13<1:46:21, 1.49it/s]
3%|▎ | 268/9770 [06:14<1:46:29, 1.49it/s]
3%|▎ | 269/9770 [06:14<1:45:59, 1.49it/s]
3%|▎ | 270/9770 [06:15<1:46:00, 1.49it/s]
3%|�
+0: {'loss': 0.7551, 'grad_norm': 0.7730411300291219, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.03}
+0: � | 270/9770 [06:15<1:46:00, 1.49it/s]
3%|▎ | 271/9770 [06:16<1:44:45, 1.51it/s]
3%|▎ | 272/9770 [06:16<1:45:06, 1.51it/s]
3%|▎ | 273/9770 [06:17<1:44:32, 1.51it/s]
3%|▎ | 274/9770 [06:18<1:44:28, 1.51it/s]
3%|▎ | 275/9770 [06:18<1:44:30, 1.51it/s]
3%|▎ | 276/9770 [06:19<1:44:25, 1.52it/s]
3%|▎ | 277/9770 [06:20<1:44:31, 1.51it/s]
3%|▎ | 278/9770 [06:20<1:44:56, 1.51it/s]
3%|▎ | 279/9770 [06:21<1:44:18, 1.52it/s]
3%|▎ | 280/9770 [06:22<1:46:12, 1.49it/s]
3%|▎ | 280/9770 [06:22<1:46:12, 1.49it/s]
3%|▎ | 281/9770 [06:22<1:46:05, 1.49it/s]
3%|▎ | 282/9770 [06:23<1:46:00, 1.49it/s]
3%|▎ | 283/9770 [06:24<1:45:47, 1.49it/s]
3%|▎ | 284/9770 [06:24<1:45:24, 1.50it/s]
3%|▎ | 285/9770 [06:25<1:45:29, 1.50it/s]
3%|▎ | 286/9770 [06:26<1:45:01
+0: {'loss': 0.7581, 'grad_norm': 0.6847058107679785, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.03}
+0: {'loss': 0.745, 'grad_norm': 0.6755495813728286, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.03}
+0: , 1.50it/s]
3%|▎ | 287/9770 [06:26<1:45:07, 1.50it/s]
3%|▎ | 288/9770 [06:27<1:44:41, 1.51it/s]
3%|▎ | 289/9770 [06:28<1:43:55, 1.52it/s]
3%|▎ | 290/9770 [06:28<1:44:02, 1.52it/s]
3%|▎ | 290/9770 [06:28<1:44:02, 1.52it/s]
3%|▎ | 291/9770 [06:29<1:44:04, 1.52it/s]
3%|▎ | 292/9770 [06:30<1:44:40, 1.51it/s]
3%|▎ | 293/9770 [06:30<1:44:14, 1.52it/s]
3%|▎ | 294/9770 [06:31<1:44:14, 1.52it/s]
3%|▎ | 295/9770 [06:32<1:44:52, 1.51it/s]
3%|▎ | 296/9770 [06:32<1:46:26, 1.48it/s]
3%|▎ | 297/9770 [06:33<1:45:22, 1.50it/s]
3%|▎ | 298/9770 [06:34<1:45:22, 1.50it/s]
3%|▎ | 299/9770 [06:34<1:46:29, 1.48it/s]
3%|▎ | 300/9770 [06:35<1:45:12, 1.50it/s]
3%|▎ | 300/9770 [06:35<1:45:12, 1.50it/s]
3%|▎ | 301/
+0: {'loss': 0.7519, 'grad_norm': 0.7121174201695833, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.03}
+0: 9770 [06:36<1:44:57, 1.50it/s]
3%|▎ | 302/9770 [06:36<1:45:29, 1.50it/s]
3%|▎ | 303/9770 [06:37<1:45:01, 1.50it/s]
3%|▎ | 304/9770 [06:38<1:43:34, 1.52it/s]
3%|▎ | 305/9770 [06:38<1:45:34, 1.49it/s]
3%|▎ | 306/9770 [06:39<1:45:29, 1.50it/s]
3%|▎ | 307/9770 [06:40<1:44:54, 1.50it/s]
3%|▎ | 308/9770 [06:40<1:44:15, 1.51it/s]
3%|▎ | 309/9770 [06:41<1:43:38, 1.52it/s]
3%|▎ | 310/9770 [06:42<1:43:31, 1.52it/s]
3%|▎ | 310/9770 [06:42<1:43:31, 1.52it/s]
3%|▎ | 311/9770 [06:42<1:44:11, 1.51it/s]
3%|▎ | 312/9770 [06:43<1:44:24, 1.51it/s]
3%|▎ | 313/9770 [06:44<1:44:22, 1.51it/s]
3%|▎ | 314/9770 [06:44<1:46:06, 1.49it/s]
3%|▎ | 315/9770 [06:45<1:45:12, 1.50it/s]
3%|▎ | 316/9770 [06:46<1:44:04, 1.51it/s]
3%|▎ | 317/9770 [06:46<1:43:53, 1.52it/s]
3
+0: {'loss': 0.7433, 'grad_norm': 0.653143837780577, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.03}
+0: {'loss': 0.7453, 'grad_norm': 0.7225255109223913, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.03}
+0: %|▎ | 318/9770 [06:47<1:42:38, 1.53it/s]
3%|▎ | 319/9770 [06:48<1:44:13, 1.51it/s]
3%|▎ | 320/9770 [06:48<1:43:26, 1.52it/s]
3%|▎ | 320/9770 [06:48<1:43:26, 1.52it/s]
3%|▎ | 321/9770 [06:49<1:44:16, 1.51it/s]
3%|▎ | 322/9770 [06:50<1:46:05, 1.48it/s]
3%|▎ | 323/9770 [06:50<1:44:39, 1.50it/s]
3%|▎ | 324/9770 [06:51<1:44:43, 1.50it/s]
3%|▎ | 325/9770 [06:52<1:44:32, 1.51it/s]
3%|▎ | 326/9770 [06:52<1:43:44, 1.52it/s]
3%|▎ | 327/9770 [06:53<1:43:23, 1.52it/s]
3%|▎ | 328/9770 [06:54<1:44:13, 1.51it/s]
3%|▎ | 329/9770 [06:54<1:46:19, 1.48it/s]
3%|▎ | 330/9770 [06:55<1:47:41, 1.46it/s]
3%|▎ | 330/9770 [06:55<1:47:41, 1.46it/s]
3%|▎ | 331/9770 [06:56<1:46:23, 1.48it/s]
3%|▎ | 332/9770 [06:56<1:44
+0: {'loss': 0.7677, 'grad_norm': 0.7502311827608458, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.03}
+0: :46, 1.50it/s]
3%|▎ | 333/9770 [06:57<1:44:12, 1.51it/s]
3%|▎ | 334/9770 [06:58<1:43:37, 1.52it/s]
3%|▎ | 335/9770 [06:58<1:43:33, 1.52it/s]
3%|▎ | 336/9770 [06:59<1:44:12, 1.51it/s]
3%|▎ | 337/9770 [07:00<1:43:03, 1.53it/s]
3%|▎ | 338/9770 [07:00<1:44:01, 1.51it/s]
3%|▎ | 339/9770 [07:01<1:43:31, 1.52it/s]
3%|▎ | 340/9770 [07:02<1:44:25, 1.51it/s]
3%|▎ | 340/9770 [07:02<1:44:25, 1.51it/s]
3%|▎ | 341/9770 [07:02<1:44:34, 1.50it/s]
4%|▎ | 342/9770 [07:03<1:45:47, 1.49it/s]
4%|▎ | 343/9770 [07:04<1:45:05, 1.50it/s]
4%|▎ | 344/9770 [07:04<1:44:42, 1.50it/s]
4%|▎ | 345/9770 [07:05<1:43:48, 1.51it/s]
4%|▎ | 346/9770 [07:06<1:43:52, 1.51it/s]
4%|▎ | 347/9770 [07:06<1:43:27, 1.52it/s]
4%|▎ | 348/9770 [07:07<1:43:46, 1.51it/s]
4%|▎ |
+0: {'loss': 0.761, 'grad_norm': 0.7389752596875864, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.04}
+0: {'loss': 0.7436, 'grad_norm': 0.7200512748129708, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.04}
+0: 349/9770 [07:08<1:44:29, 1.50it/s]
4%|▎ | 350/9770 [07:08<1:43:57, 1.51it/s]
4%|▎ | 350/9770 [07:08<1:43:57, 1.51it/s]
4%|▎ | 351/9770 [07:09<1:44:23, 1.50it/s]
4%|▎ | 352/9770 [07:10<1:44:31, 1.50it/s]
4%|▎ | 353/9770 [07:10<1:43:52, 1.51it/s]
4%|▎ | 354/9770 [07:11<1:41:59, 1.54it/s]
4%|▎ | 355/9770 [07:11<1:42:53, 1.53it/s]
4%|▎ | 356/9770 [07:12<1:43:48, 1.51it/s]
4%|▎ | 357/9770 [07:13<1:45:45, 1.48it/s]
4%|▎ | 358/9770 [07:14<1:44:24, 1.50it/s]
4%|▎ | 359/9770 [07:14<1:44:19, 1.50it/s]
4%|▎ | 360/9770 [07:15<1:43:09, 1.52it/s]
4%|▎ | 360/9770 [07:15<1:43:09, 1.52it/s]
4%|▎ | 361/9770 [07:15<1:43:47, 1.51it/s]
4%|▎ | 362/9770 [07:16<1:43:52, 1.51it/s]
4%|▎ | 363/9770 [07:17<1:42:15, 1.53it/s]
+0: {'loss': 0.7516, 'grad_norm': 0.6817309664315718, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.04}
+0: 4%|▎ | 364/9770 [07:17<1:43:19, 1.52it/s]
4%|▎ | 365/9770 [07:18<1:42:30, 1.53it/s]
4%|▎ | 366/9770 [07:19<1:41:39, 1.54it/s]
4%|▍ | 367/9770 [07:19<1:42:35, 1.53it/s]
4%|▍ | 368/9770 [07:20<1:42:08, 1.53it/s]
4%|▍ | 369/9770 [07:21<1:43:15, 1.52it/s]
4%|▍ | 370/9770 [07:21<1:42:24, 1.53it/s]
4%|▍ | 370/9770 [07:21<1:42:24, 1.53it/s]
4%|▍ | 371/9770 [07:22<1:42:32, 1.53it/s]
4%|▍ | 372/9770 [07:23<1:44:13, 1.50it/s]
4%|▍ | 373/9770 [07:23<1:42:52, 1.52it/s]
4%|▍ | 374/9770 [07:24<1:42:44, 1.52it/s]
4%|▍ | 375/9770 [07:25<1:42:24, 1.53it/s]
4%|▍ | 376/9770 [07:25<1:43:07, 1.52it/s]
4%|▍ | 377/9770 [07:26<1:43:18, 1.52it/s]
4%|▍ | 378/9770 [07:27<1:45:10, 1.49it/s]
4%|▍ | 379/9770 [07:27<1:45:14, 1.49it/s]
4%|▍ | 380/9770 [07:28<
+0: {'loss': 0.7586, 'grad_norm': 0.7021451517592754, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.04}
+0: {'loss': 0.7498, 'grad_norm': 0.7142724734303186, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.04}
+0: 1:44:10, 1.50it/s]
4%|▍ | 380/9770 [07:28<1:44:10, 1.50it/s]
4%|▍ | 381/9770 [07:29<1:43:38, 1.51it/s]
4%|▍ | 382/9770 [07:29<1:43:54, 1.51it/s]
4%|▍ | 383/9770 [07:30<1:43:00, 1.52it/s]
4%|▍ | 384/9770 [07:31<1:42:14, 1.53it/s]
4%|▍ | 385/9770 [07:31<1:41:55, 1.53it/s]
4%|▍ | 386/9770 [07:32<1:43:07, 1.52it/s]
4%|▍ | 387/9770 [07:33<1:43:39, 1.51it/s]
4%|▍ | 388/9770 [07:33<1:44:15, 1.50it/s]
4%|▍ | 389/9770 [07:34<1:44:06, 1.50it/s]
4%|▍ | 390/9770 [07:35<1:43:29, 1.51it/s]
4%|▍ | 390/9770 [07:35<1:43:29, 1.51it/s]
4%|▍ | 391/9770 [07:35<1:42:31, 1.52it/s]
4%|▍ | 392/9770 [07:36<1:42:00, 1.53it/s]
4%|▍ | 393/9770 [07:37<1:42:37, 1.52it/s]
4%|▍ | 394/9770 [07:37<1:42:41, 1.52it/s]
4%|▍
+0: {'loss': 0.7496, 'grad_norm': 0.6653368137802452, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.04}
+0: | 395/9770 [07:38<1:42:23, 1.53it/s]
4%|▍ | 396/9770 [07:39<1:42:41, 1.52it/s]
4%|▍ | 397/9770 [07:39<1:42:15, 1.53it/s]
4%|▍ | 398/9770 [07:40<1:40:55, 1.55it/s]
4%|▍ | 399/9770 [07:40<1:40:55, 1.55it/s]
4%|▍ | 400/9770 [07:41<1:40:41, 1.55it/s]
4%|▍ | 400/9770 [07:41<1:40:41, 1.55it/s]
4%|▍ | 401/9770 [07:42<1:41:42, 1.54it/s]
4%|▍ | 402/9770 [07:42<1:40:55, 1.55it/s]
4%|▍ | 403/9770 [07:43<1:41:58, 1.53it/s]
4%|▍ | 404/9770 [07:44<1:42:19, 1.53it/s]
4%|▍ | 405/9770 [07:44<1:42:25, 1.52it/s]
4%|▍ | 406/9770 [07:45<1:42:08, 1.53it/s]
4%|▍ | 407/9770 [07:46<1:42:29, 1.52it/s]
4%|▍ | 408/9770 [07:46<1:42:37, 1.52it/s]
4%|▍ | 409/9770 [07:47<1:42:34, 1.52it/s]
4%|▍ | 410/9770 [07:48<1:44:44, 1.49it/s]
+0: {'loss': 0.7622, 'grad_norm': 0.6958621507498385, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.04}
+0: {'loss': 0.7415, 'grad_norm': 0.6752777941808835, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.04}
+0:
4%|▍ | 410/9770 [07:48<1:44:44, 1.49it/s]
4%|▍ | 411/9770 [07:48<1:43:21, 1.51it/s]
4%|▍ | 412/9770 [07:49<1:43:33, 1.51it/s]
4%|▍ | 413/9770 [07:50<1:42:09, 1.53it/s]
4%|▍ | 414/9770 [07:50<1:42:31, 1.52it/s]
4%|▍ | 415/9770 [07:51<1:43:11, 1.51it/s]
4%|▍ | 416/9770 [07:52<1:43:12, 1.51it/s]
4%|▍ | 417/9770 [07:52<1:43:02, 1.51it/s]
4%|▍ | 418/9770 [07:53<1:42:45, 1.52it/s]
4%|▍ | 419/9770 [07:54<1:45:05, 1.48it/s]
4%|▍ | 420/9770 [07:54<1:43:43, 1.50it/s]
4%|▍ | 420/9770 [07:54<1:43:43, 1.50it/s]
4%|▍ | 421/9770 [07:55<1:44:07, 1.50it/s]
4%|▍ | 422/9770 [07:56<1:44:14, 1.49it/s]
4%|▍ | 423/9770 [07:56<1:44:06, 1.50it/s]
4%|▍ | 424/9770 [07:57<1:44:26, 1.49it/s]
4%|▍ | 425/9770 [07:58<1:44:00, 1.50it/s]
4%|▍ | 426/9770 [07:
+0: {'loss': 0.7389, 'grad_norm': 0.6633411929554945, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.04}
+0: {'loss': 0.7134, 'grad_norm': 0.7015557044207102, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.05}
+0: 58<1:43:08, 1.51it/s]
4%|▍ | 427/9770 [07:59<1:43:20, 1.51it/s]
4%|▍ | 428/9770 [08:00<1:42:44, 1.52it/s]
4%|▍ | 429/9770 [08:00<1:42:20, 1.52it/s]
4%|▍ | 430/9770 [08:01<1:42:05, 1.52it/s]
4%|▍ | 430/9770 [08:01<1:42:05, 1.52it/s]
4%|▍ | 431/9770 [08:02<1:41:06, 1.54it/s]
4%|▍ | 432/9770 [08:02<1:42:05, 1.52it/s]
4%|▍ | 433/9770 [08:03<1:42:11, 1.52it/s]
4%|▍ | 434/9770 [08:04<1:41:40, 1.53it/s]
4%|▍ | 435/9770 [08:04<1:42:36, 1.52it/s]
4%|▍ | 436/9770 [08:05<1:42:55, 1.51it/s]
4%|▍ | 437/9770 [08:06<1:42:39, 1.52it/s]
4%|▍ | 438/9770 [08:06<1:42:44, 1.51it/s]
4%|▍ | 439/9770 [08:07<1:42:59, 1.51it/s]
5%|▍ | 440/9770 [08:08<1:43:58, 1.50it/s]
5%|▍ | 440/9770 [08:08<1:43:58, 1.50it/s]
5%|▍
+0: {'loss': 0.7501, 'grad_norm': 0.6841982281051456, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.05}
+0: | 441/9770 [08:08<1:43:41, 1.50it/s]
5%|▍ | 442/9770 [08:09<1:44:31, 1.49it/s]
5%|▍ | 443/9770 [08:10<1:44:01, 1.49it/s]
5%|▍ | 444/9770 [08:10<1:42:07, 1.52it/s]
5%|▍ | 445/9770 [08:11<1:42:47, 1.51it/s]
5%|▍ | 446/9770 [08:12<1:42:16, 1.52it/s]
5%|▍ | 447/9770 [08:12<1:42:29, 1.52it/s]
5%|▍ | 448/9770 [08:13<1:42:29, 1.52it/s]
5%|▍ | 449/9770 [08:14<1:42:31, 1.52it/s]
5%|▍ | 450/9770 [08:14<1:41:59, 1.52it/s]
5%|▍ | 450/9770 [08:14<1:41:59, 1.52it/s]
5%|▍ | 451/9770 [08:15<1:41:54, 1.52it/s]
5%|▍ | 452/9770 [08:15<1:42:02, 1.52it/s]
5%|▍ | 453/9770 [08:16<1:42:22, 1.52it/s]
5%|▍ | 454/9770 [08:17<1:42:21, 1.52it/s]
5%|▍ | 455/9770 [08:17<1:42:34, 1.51it/s]
5%|▍ | 456/9770 [08:18<1:42:52, 1.51it/s]
5%|▍ | 457/9770 [08:19<1:42:21, 1.5
+0: {'loss': 0.7246, 'grad_norm': 0.743106387320585, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.05}
+0: {'loss': 0.7371, 'grad_norm': 0.6879097294479487, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.05}
+0: 2it/s]
5%|▍ | 458/9770 [08:19<1:42:23, 1.52it/s]
5%|▍ | 459/9770 [08:20<1:42:04, 1.52it/s]
5%|▍ | 460/9770 [08:21<1:42:02, 1.52it/s]
5%|▍ | 460/9770 [08:21<1:42:02, 1.52it/s]
5%|▍ | 461/9770 [08:21<1:42:33, 1.51it/s]
5%|▍ | 462/9770 [08:22<1:42:16, 1.52it/s]
5%|▍ | 463/9770 [08:23<1:42:12, 1.52it/s]
5%|▍ | 464/9770 [08:23<1:41:18, 1.53it/s]
5%|▍ | 465/9770 [08:24<1:41:16, 1.53it/s]
5%|▍ | 466/9770 [08:25<1:41:07, 1.53it/s]
5%|▍ | 467/9770 [08:25<1:40:59, 1.54it/s]
5%|▍ | 468/9770 [08:26<1:41:46, 1.52it/s]
5%|▍ | 469/9770 [08:27<1:40:47, 1.54it/s]
5%|▍ | 470/9770 [08:27<1:41:29, 1.53it/s]
5%|▍ | 470/9770 [08:27<1:41:29, 1.53it/s]
5%|▍ | 471/9770 [08:28<1:43:49, 1.49it/s]
5%|▍ | 472/9770 [
+0: {'loss': 0.7705, 'grad_norm': 0.7042277183837815, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.05}
+0: 08:29<1:44:00, 1.49it/s]
5%|▍ | 473/9770 [08:29<1:43:20, 1.50it/s]
5%|▍ | 474/9770 [08:30<1:42:13, 1.52it/s]
5%|▍ | 475/9770 [08:31<1:41:53, 1.52it/s]
5%|▍ | 476/9770 [08:31<1:41:24, 1.53it/s]
5%|▍ | 477/9770 [08:32<1:40:58, 1.53it/s]
5%|▍ | 478/9770 [08:33<1:41:36, 1.52it/s]
5%|▍ | 479/9770 [08:33<1:42:06, 1.52it/s]
5%|▍ | 480/9770 [08:34<1:42:07, 1.52it/s]
5%|▍ | 480/9770 [08:34<1:42:07, 1.52it/s]
5%|▍ | 481/9770 [08:35<1:41:52, 1.52it/s]
5%|▍ | 482/9770 [08:35<1:41:48, 1.52it/s]
5%|▍ | 483/9770 [08:36<1:42:02, 1.52it/s]
5%|▍ | 484/9770 [08:37<1:42:28, 1.51it/s]
5%|▍ | 485/9770 [08:37<1:41:35, 1.52it/s]
5%|▍ | 486/9770 [08:38<1:41:43, 1.52it/s]
5%|▍ | 487/9770 [08:39<1:41:05, 1.53it/s]
5%|▍ | 488/9770 [08:39<1:41:20, 1.53it/s]
5%|▌
+0: {'loss': 0.75, 'grad_norm': 0.6712405479719712, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.05}
+0: {'loss': 0.7455, 'grad_norm': 0.7019863088088841, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.05}
+0: | 489/9770 [08:40<1:41:49, 1.52it/s]
5%|▌ | 490/9770 [08:40<1:41:19, 1.53it/s]
5%|▌ | 490/9770 [08:40<1:41:19, 1.53it/s]
5%|▌ | 491/9770 [08:41<1:41:02, 1.53it/s]
5%|▌ | 492/9770 [08:42<1:41:37, 1.52it/s]
5%|▌ | 493/9770 [08:42<1:42:17, 1.51it/s]
5%|▌ | 494/9770 [08:43<1:41:55, 1.52it/s]
5%|▌ | 495/9770 [08:44<1:41:59, 1.52it/s]
5%|▌ | 496/9770 [08:44<1:42:19, 1.51it/s]
5%|▌ | 497/9770 [08:45<1:42:55, 1.50it/s]
5%|▌ | 498/9770 [08:46<1:42:41, 1.50it/s]
5%|▌ | 499/9770 [08:46<1:43:04, 1.50it/s]
5%|▌ | 500/9770 [08:47<1:43:01, 1.50it/s]
5%|▌ | 500/9770 [08:47<1:43:01, 1.50it/s]
5%|▌ | 501/9770 [08:48<1:43:17, 1.50it/s]
5%|▌ | 502/9770 [08:48<1:43:17, 1.50it/s]
5%|▌ | 503/9770 [08:49<1:42:16,
+0: {'loss': 0.7429, 'grad_norm': 0.6977512296171098, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.05}
+0: 1.51it/s]
5%|▌ | 504/9770 [08:50<1:42:45, 1.50it/s]
5%|▌ | 505/9770 [08:50<1:43:38, 1.49it/s]
5%|▌ | 506/9770 [08:51<1:42:40, 1.50it/s]
5%|▌ | 507/9770 [08:52<1:42:56, 1.50it/s]
5%|▌ | 508/9770 [08:52<1:42:11, 1.51it/s]
5%|▌ | 509/9770 [08:53<1:42:52, 1.50it/s]
5%|▌ | 510/9770 [08:54<1:43:22, 1.49it/s]
5%|▌ | 510/9770 [08:54<1:43:22, 1.49it/s]
5%|▌ | 511/9770 [08:54<1:41:47, 1.52it/s]
5%|▌ | 512/9770 [08:55<1:41:17, 1.52it/s]
5%|▌ | 513/9770 [08:56<1:43:57, 1.48it/s]
5%|▌ | 514/9770 [08:56<1:41:44, 1.52it/s]
5%|▌ | 515/9770 [08:57<1:41:26, 1.52it/s]
5%|▌ | 516/9770 [08:58<1:41:49, 1.51it/s]
5%|▌ | 517/9770 [08:58<1:41:50, 1.51it/s]
5%|▌ | 518/9770 [08:59<1:41:55, 1.51it/s]
5%|▌ | 519/9770 [09:00<1:42:15, 1.51it/s]
5%|▌ | 520/97
+0: {'loss': 0.7222, 'grad_norm': 1.2757794987077393, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.05}
+0: {'loss': 0.7534, 'grad_norm': 0.7285140922395443, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.05}
+0: 70 [09:00<1:41:56, 1.51it/s]
5%|▌ | 520/9770 [09:00<1:41:56, 1.51it/s]
5%|▌ | 521/9770 [09:01<1:41:45, 1.51it/s]
5%|▌ | 522/9770 [09:02<1:42:03, 1.51it/s]
5%|▌ | 523/9770 [09:02<1:43:11, 1.49it/s]
5%|▌ | 524/9770 [09:03<1:42:24, 1.50it/s]
5%|▌ | 525/9770 [09:04<1:41:35, 1.52it/s]
5%|▌ | 526/9770 [09:04<1:41:52, 1.51it/s]
5%|▌ | 527/9770 [09:05<1:41:45, 1.51it/s]
5%|▌ | 528/9770 [09:06<1:44:04, 1.48it/s]
5%|▌ | 529/9770 [09:06<1:42:52, 1.50it/s]
5%|▌ | 530/9770 [09:07<1:41:50, 1.51it/s]
5%|▌ | 530/9770 [09:07<1:41:50, 1.51it/s]
5%|▌ | 531/9770 [09:08<1:43:08, 1.49it/s]
5%|▌ | 532/9770 [09:08<1:42:43, 1.50it/s]
5%|▌ | 533/9770 [09:09<1:42:11, 1.51it/s]
5%|▌ | 534/9770 [09:10<1:42:27, 1.50it/s]
5%|�
+0: {'loss': 0.7517, 'grad_norm': 0.7201411269652276, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.06}
+0: �� | 535/9770 [09:10<1:41:02, 1.52it/s]
5%|▌ | 536/9770 [09:11<1:40:19, 1.53it/s]
5%|▌ | 537/9770 [09:12<1:40:25, 1.53it/s]
6%|▌ | 538/9770 [09:12<1:42:25, 1.50it/s]
6%|▌ | 539/9770 [09:13<1:40:50, 1.53it/s]
6%|▌ | 540/9770 [09:14<1:41:19, 1.52it/s]
6%|▌ | 540/9770 [09:14<1:41:19, 1.52it/s]
6%|▌ | 541/9770 [09:14<1:40:41, 1.53it/s]
6%|▌ | 542/9770 [09:15<1:41:55, 1.51it/s]
6%|▌ | 543/9770 [09:16<1:42:04, 1.51it/s]
6%|▌ | 544/9770 [09:16<1:41:07, 1.52it/s]
6%|▌ | 545/9770 [09:17<1:40:42, 1.53it/s]
6%|▌ | 546/9770 [09:18<1:41:17, 1.52it/s]
6%|▌ | 547/9770 [09:18<1:41:10, 1.52it/s]
6%|▌ | 548/9770 [09:19<1:41:36, 1.51it/s]
6%|▌ | 549/9770 [09:20<1:41:40, 1.51it/s]
6%|▌ | 550/9770 [09:20<1:40:48, 1.52it/s]
+0: {'loss': 0.7417, 'grad_norm': 0.7340283812534444, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.06}
+0: {'loss': 0.7547, 'grad_norm': 0.7217522787519168, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.06}
+0:
6%|▌ | 550/9770 [09:20<1:40:48, 1.52it/s]
6%|▌ | 551/9770 [09:21<1:39:50, 1.54it/s]
6%|▌ | 552/9770 [09:21<1:39:54, 1.54it/s]
6%|▌ | 553/9770 [09:22<1:40:01, 1.54it/s]
6%|▌ | 554/9770 [09:23<1:40:25, 1.53it/s]
6%|▌ | 555/9770 [09:23<1:40:41, 1.53it/s]
6%|▌ | 556/9770 [09:24<1:39:57, 1.54it/s]
6%|▌ | 557/9770 [09:25<1:40:39, 1.53it/s]
6%|▌ | 558/9770 [09:25<1:40:49, 1.52it/s]
6%|▌ | 559/9770 [09:26<1:41:05, 1.52it/s]
6%|▌ | 560/9770 [09:27<1:41:27, 1.51it/s]
6%|▌ | 560/9770 [09:27<1:41:27, 1.51it/s]
6%|▌ | 561/9770 [09:27<1:43:00, 1.49it/s]
6%|▌ | 562/9770 [09:28<1:42:23, 1.50it/s]
6%|▌ | 563/9770 [09:29<1:41:11, 1.52it/s]
6%|▌ | 564/9770 [09:29<1:42:11, 1.50it/s]
6%|▌ | 565/9770 [09:30<1:41:55, 1.51it/s]
6%|▌ | 566
+0: {'loss': 0.7541, 'grad_norm': 0.7104274644996863, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.06}
+0: {'loss': 0.7558, 'grad_norm': 0.6883659870653294, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.06}
+0: /9770 [09:31<1:44:00, 1.47it/s]
6%|▌ | 567/9770 [09:31<1:42:58, 1.49it/s]
6%|▌ | 568/9770 [09:32<1:44:34, 1.47it/s]
6%|▌ | 569/9770 [09:33<1:44:32, 1.47it/s]
6%|▌ | 570/9770 [09:33<1:42:01, 1.50it/s]
6%|▌ | 570/9770 [09:33<1:42:01, 1.50it/s]
6%|▌ | 571/9770 [09:34<1:42:49, 1.49it/s]
6%|▌ | 572/9770 [09:35<1:41:55, 1.50it/s]
6%|▌ | 573/9770 [09:36<1:43:47, 1.48it/s]
6%|▌ | 574/9770 [09:36<1:42:59, 1.49it/s]
6%|▌ | 575/9770 [09:37<1:43:10, 1.49it/s]
6%|▌ | 576/9770 [09:38<1:42:56, 1.49it/s]
6%|▌ | 577/9770 [09:38<1:41:56, 1.50it/s]
6%|▌ | 578/9770 [09:39<1:41:42, 1.51it/s]
6%|▌ | 579/9770 [09:40<1:41:39, 1.51it/s]
6%|▌ | 580/9770 [09:40<1:40:12, 1.53it/s]
6%|▌ | 580/9770 [09:40<1:40:12, 1.53it/s]
6
+0: {'loss': 0.7368, 'grad_norm': 0.7245055812247989, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.06}
+0: %|▌ | 581/9770 [09:41<1:40:27, 1.52it/s]
6%|▌ | 582/9770 [09:41<1:40:36, 1.52it/s]
6%|▌ | 583/9770 [09:42<1:40:59, 1.52it/s]
6%|▌ | 584/9770 [09:43<1:40:15, 1.53it/s]
6%|▌ | 585/9770 [09:43<1:39:23, 1.54it/s]
6%|▌ | 586/9770 [09:44<1:42:01, 1.50it/s]
6%|▌ | 587/9770 [09:45<1:40:46, 1.52it/s]
6%|▌ | 588/9770 [09:45<1:40:55, 1.52it/s]
6%|▌ | 589/9770 [09:46<1:41:07, 1.51it/s]
6%|▌ | 590/9770 [09:47<1:40:33, 1.52it/s]
6%|▌ | 590/9770 [09:47<1:40:33, 1.52it/s]
6%|▌ | 591/9770 [09:47<1:41:41, 1.50it/s]
6%|▌ | 592/9770 [09:48<1:41:04, 1.51it/s]
6%|▌ | 593/9770 [09:49<1:42:12, 1.50it/s]
6%|▌ | 594/9770 [09:49<1:42:11, 1.50it/s]
6%|▌ | 595/9770 [09:50<1:42:15, 1.50it/s]
6%|▌ | 596/9770 [09:51<1:42:33, 1.49it/s]
6%|▌ | 597/9770 [09:51<1:4
+0: {'loss': 0.7076, 'grad_norm': 0.7106477121472634, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.06}
+0: {'loss': 0.7788, 'grad_norm': 0.7733853626920865, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.06}
+0: 3:53, 1.47it/s]
6%|▌ | 598/9770 [09:52<1:43:09, 1.48it/s]
6%|▌ | 599/9770 [09:53<1:43:47, 1.47it/s]
6%|▌ | 600/9770 [09:53<1:42:56, 1.48it/s]
6%|▌ | 600/9770 [09:53<1:42:56, 1.48it/s]
6%|▌ | 601/9770 [09:54<1:42:49, 1.49it/s]
6%|▌ | 602/9770 [09:55<1:41:43, 1.50it/s]
6%|▌ | 603/9770 [09:55<1:41:26, 1.51it/s]
6%|▌ | 604/9770 [09:56<1:43:18, 1.48it/s]
6%|▌ | 605/9770 [09:57<1:42:18, 1.49it/s]
6%|▌ | 606/9770 [09:57<1:41:31, 1.50it/s]
6%|▌ | 607/9770 [09:58<1:42:01, 1.50it/s]
6%|▌ | 608/9770 [09:59<1:40:57, 1.51it/s]
6%|▌ | 609/9770 [09:59<1:40:49, 1.51it/s]
6%|▌ | 610/9770 [10:00<1:40:01, 1.53it/s]
6%|▌ | 610/9770 [10:00<1:40:01, 1.53it/s]
6%|▋ | 611/9770 [10:01<1:40:54, 1.51it/s]
6%|▋ |
+0: {'loss': 0.7402, 'grad_norm': 0.7477003820910803, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.06}
+0: 612/9770 [10:01<1:39:56, 1.53it/s]
6%|▋ | 613/9770 [10:02<1:42:30, 1.49it/s]
6%|▋ | 614/9770 [10:03<1:41:27, 1.50it/s]
6%|▋ | 615/9770 [10:03<1:40:58, 1.51it/s]
6%|▋ | 616/9770 [10:04<1:41:37, 1.50it/s]
6%|▋ | 617/9770 [10:05<1:41:14, 1.51it/s]
6%|▋ | 618/9770 [10:05<1:41:34, 1.50it/s]
6%|▋ | 619/9770 [10:06<1:41:30, 1.50it/s]
6%|▋ | 620/9770 [10:07<1:41:50, 1.50it/s]
6%|▋ | 620/9770 [10:07<1:41:50, 1.50it/s]
6%|▋ | 621/9770 [10:07<1:41:26, 1.50it/s]
6%|▋ | 622/9770 [10:08<1:41:16, 1.51it/s]
6%|▋ | 623/9770 [10:09<1:41:42, 1.50it/s]
6%|▋ | 624/9770 [10:09<1:41:39, 1.50it/s]
6%|▋ | 625/9770 [10:10<1:41:54, 1.50it/s]
6%|▋ | 626/9770 [10:11<1:41:47, 1.50it/s]
6%|▋ | 627/9770 [10:11<1:41:04, 1.51it/s]
6%|▋ | 628/9770 [10:12<1:41:42, 1.50it/s]
+0: {'loss': 0.7299, 'grad_norm': 0.7562869568357816, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.06}
+0: {'loss': 0.7478, 'grad_norm': 0.6714961138460858, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.07}
+0:
6%|▋ | 629/9770 [10:13<1:41:40, 1.50it/s]
6%|▋ | 630/9770 [10:13<1:40:38, 1.51it/s]
6%|▋ | 630/9770 [10:13<1:40:38, 1.51it/s]
6%|▋ | 631/9770 [10:14<1:41:37, 1.50it/s]
6%|▋ | 632/9770 [10:15<1:41:35, 1.50it/s]
6%|▋ | 633/9770 [10:15<1:41:31, 1.50it/s]
6%|▋ | 634/9770 [10:16<1:43:06, 1.48it/s]
6%|▋ | 635/9770 [10:17<1:44:13, 1.46it/s]
7%|▋ | 636/9770 [10:17<1:43:02, 1.48it/s]
7%|▋ | 637/9770 [10:18<1:41:31, 1.50it/s]
7%|▋ | 638/9770 [10:19<1:40:17, 1.52it/s]
7%|▋ | 639/9770 [10:19<1:39:55, 1.52it/s]
7%|▋ | 640/9770 [10:20<1:39:12, 1.53it/s]
7%|▋ | 640/9770 [10:20<1:39:12, 1.53it/s]
7%|▋ | 641/9770 [10:21<1:40:04, 1.52it/s]
7%|▋ | 642/9770 [10:21<1:39:57, 1.52it/s]
7%|▋ | 643/9770 [10:22<
+0: {'loss': 0.7528, 'grad_norm': 0.6820183688641436, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.07}
+0: 1:40:14, 1.52it/s]
7%|▋ | 644/9770 [10:23<1:41:19, 1.50it/s]
7%|▋ | 645/9770 [10:23<1:42:18, 1.49it/s]
7%|▋ | 646/9770 [10:24<1:41:38, 1.50it/s]
7%|▋ | 647/9770 [10:25<1:40:51, 1.51it/s]
7%|▋ | 648/9770 [10:25<1:42:01, 1.49it/s]
7%|▋ | 649/9770 [10:26<1:41:38, 1.50it/s]
7%|▋ | 650/9770 [10:27<1:39:55, 1.52it/s]
7%|▋ | 650/9770 [10:27<1:39:55, 1.52it/s]
7%|▋ | 651/9770 [10:27<1:40:25, 1.51it/s]
7%|▋ | 652/9770 [10:28<1:40:41, 1.51it/s]
7%|▋ | 653/9770 [10:29<1:40:39, 1.51it/s]
7%|▋ | 654/9770 [10:29<1:41:06, 1.50it/s]
7%|▋ | 655/9770 [10:30<1:41:27, 1.50it/s]
7%|▋ | 656/9770 [10:31<1:40:11, 1.52it/s]
7%|▋ | 657/9770 [10:31<1:40:39, 1.51it/s]
7%|▋ | 658/9770 [10:32<1:40:01, 1.52it/s]
7%|▋ | 659/9770 [10:33<1:40:07, 1.52it/s]
7%|▋
+0: {'loss': 0.7496, 'grad_norm': 0.732552118023361, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.07}
+0: {'loss': 0.7128, 'grad_norm': 0.666259934984153, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.07}
+0: | 660/9770 [10:33<1:40:19, 1.51it/s]
7%|▋ | 660/9770 [10:33<1:40:19, 1.51it/s]
7%|▋ | 661/9770 [10:34<1:39:51, 1.52it/s]
7%|▋ | 662/9770 [10:35<1:40:11, 1.52it/s]
7%|▋ | 663/9770 [10:35<1:39:50, 1.52it/s]
7%|▋ | 664/9770 [10:36<1:40:11, 1.51it/s]
7%|▋ | 665/9770 [10:37<1:39:25, 1.53it/s]
7%|▋ | 666/9770 [10:37<1:39:56, 1.52it/s]
7%|▋ | 667/9770 [10:38<1:38:38, 1.54it/s]
7%|▋ | 668/9770 [10:39<1:38:48, 1.54it/s]
7%|▋ | 669/9770 [10:39<1:39:32, 1.52it/s]
7%|▋ | 670/9770 [10:40<1:39:22, 1.53it/s]
7%|▋ | 670/9770 [10:40<1:39:22, 1.53it/s]
7%|▋ | 671/9770 [10:41<1:41:31, 1.49it/s]
7%|▋ | 672/9770 [10:41<1:41:42, 1.49it/s]
7%|▋ | 673/9770 [10:42<1:41:01, 1.50it/s]
7%|▋ | 674/9770 [10:43<1:40:03, 1.52it
+0: {'loss': 0.7361, 'grad_norm': 0.6819845853956553, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.07}
+0: /s]
7%|▋ | 675/9770 [10:43<1:40:10, 1.51it/s]
7%|▋ | 676/9770 [10:44<1:40:59, 1.50it/s]
7%|▋ | 677/9770 [10:45<1:41:02, 1.50it/s]
7%|▋ | 678/9770 [10:45<1:40:08, 1.51it/s]
7%|▋ | 679/9770 [10:46<1:39:19, 1.53it/s]
7%|▋ | 680/9770 [10:47<1:39:32, 1.52it/s]
7%|▋ | 680/9770 [10:47<1:39:32, 1.52it/s]
7%|▋ | 681/9770 [10:47<1:39:35, 1.52it/s]
7%|▋ | 682/9770 [10:48<1:40:01, 1.51it/s]
7%|▋ | 683/9770 [10:49<1:40:27, 1.51it/s]
7%|▋ | 684/9770 [10:49<1:41:12, 1.50it/s]
7%|▋ | 685/9770 [10:50<1:41:33, 1.49it/s]
7%|▋ | 686/9770 [10:51<1:40:46, 1.50it/s]
7%|▋ | 687/9770 [10:51<1:41:21, 1.49it/s]
7%|▋ | 688/9770 [10:52<1:40:29, 1.51it/s]
7%|▋ | 689/9770 [10:53<1:40:29, 1.51it/s]
7%|▋ | 690/9770 [10:53<1:40:51, 1.50it/s]
+0: {'loss': 0.7253, 'grad_norm': 0.7036677301551879, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.07}
+0: {'loss': 0.7188, 'grad_norm': 0.7260872605299662, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.07}
+0:
7%|▋ | 690/9770 [10:53<1:40:51, 1.50it/s]
7%|▋ | 691/9770 [10:54<1:41:34, 1.49it/s]
7%|▋ | 692/9770 [10:55<1:41:53, 1.48it/s]
7%|▋ | 693/9770 [10:55<1:41:26, 1.49it/s]
7%|▋ | 694/9770 [10:56<1:39:56, 1.51it/s]
7%|▋ | 695/9770 [10:57<1:40:22, 1.51it/s]
7%|▋ | 696/9770 [10:57<1:40:00, 1.51it/s]
7%|▋ | 697/9770 [10:58<1:40:13, 1.51it/s]
7%|▋ | 698/9770 [10:59<1:42:12, 1.48it/s]
7%|▋ | 699/9770 [10:59<1:40:09, 1.51it/s]
7%|▋ | 700/9770 [11:00<1:39:24, 1.52it/s]
7%|▋ | 700/9770 [11:00<1:39:24, 1.52it/s]
7%|▋ | 701/9770 [11:00<1:38:16, 1.54it/s]
7%|▋ | 702/9770 [11:01<1:39:32, 1.52it/s]
7%|▋ | 703/9770 [11:02<1:39:59, 1.51it/s]
7%|▋ | 704/9770 [11:03<1:42:05, 1.48it/s]
7%|▋ | 705/9770 [11:03<1:42:10, 1.48it/s]
7%|▋
+0: {'loss': 0.7465, 'grad_norm': 0.7770216966956396, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.07}
+0: {'loss': 0.7345, 'grad_norm': 0.7229523530021423, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.07}
+0: | 706/9770 [11:04<1:41:18, 1.49it/s]
7%|▋ | 707/9770 [11:04<1:40:06, 1.51it/s]
7%|▋ | 708/9770 [11:05<1:40:50, 1.50it/s]
7%|▋ | 709/9770 [11:06<1:40:09, 1.51it/s]
7%|▋ | 710/9770 [11:06<1:40:16, 1.51it/s]
7%|▋ | 710/9770 [11:07<1:40:16, 1.51it/s]
7%|▋ | 711/9770 [11:07<1:40:21, 1.50it/s]
7%|▋ | 712/9770 [11:08<1:40:44, 1.50it/s]
7%|▋ | 713/9770 [11:09<1:41:50, 1.48it/s]
7%|▋ | 714/9770 [11:09<1:41:32, 1.49it/s]
7%|▋ | 715/9770 [11:10<1:39:32, 1.52it/s]
7%|▋ | 716/9770 [11:10<1:39:57, 1.51it/s]
7%|▋ | 717/9770 [11:11<1:39:57, 1.51it/s]
7%|▋ | 718/9770 [11:12<1:38:56, 1.52it/s]
7%|▋ | 719/9770 [11:12<1:39:25, 1.52it/s]
7%|▋ | 720/9770 [11:13<1:39:20, 1.52it/s]
7%|▋ | 720/9770 [11:13<1:39:20, 1.5
+0: {'loss': 0.7417, 'grad_norm': 0.6779657465744413, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.07}
+0: 2it/s]
7%|▋ | 721/9770 [11:14<1:39:42, 1.51it/s]
7%|▋ | 722/9770 [11:14<1:39:12, 1.52it/s]
7%|▋ | 723/9770 [11:15<1:37:54, 1.54it/s]
7%|▋ | 724/9770 [11:16<1:38:50, 1.53it/s]
7%|▋ | 725/9770 [11:16<1:38:38, 1.53it/s]
7%|▋ | 726/9770 [11:17<1:38:25, 1.53it/s]
7%|▋ | 727/9770 [11:18<1:40:10, 1.50it/s]
7%|▋ | 728/9770 [11:18<1:41:34, 1.48it/s]
7%|▋ | 729/9770 [11:19<1:40:30, 1.50it/s]
7%|▋ | 730/9770 [11:20<1:40:48, 1.49it/s]
7%|▋ | 730/9770 [11:20<1:40:48, 1.49it/s]
7%|▋ | 731/9770 [11:20<1:41:03, 1.49it/s]
7%|▋ | 732/9770 [11:21<1:41:37, 1.48it/s]
8%|▊ | 733/9770 [11:22<1:40:42, 1.50it/s]
8%|▊ | 734/9770 [11:22<1:40:05, 1.50it/s]
8%|▊ | 735/9770 [11:23<1:39:58, 1.51it/s]
8%|▊ | 736/9770 [11:24<1:39:43, 1.51it/s]
8%|▊ | 737/9770
+0: {'loss': 0.7138, 'grad_norm': 0.7018408368309211, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.08}
+0: {'loss': 0.7146, 'grad_norm': 0.65869291590411, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.08}
+0: [11:24<1:40:54, 1.49it/s]
8%|▊ | 738/9770 [11:25<1:40:36, 1.50it/s]
8%|▊ | 739/9770 [11:26<1:41:03, 1.49it/s]
8%|▊ | 740/9770 [11:26<1:40:42, 1.49it/s]
8%|▊ | 740/9770 [11:26<1:40:42, 1.49it/s]
8%|▊ | 741/9770 [11:27<1:40:06, 1.50it/s]
8%|▊ | 742/9770 [11:28<1:39:33, 1.51it/s]
8%|▊ | 743/9770 [11:28<1:38:35, 1.53it/s]
8%|▊ | 744/9770 [11:29<1:39:01, 1.52it/s]
8%|▊ | 745/9770 [11:30<1:38:10, 1.53it/s]
8%|▊ | 746/9770 [11:30<1:36:59, 1.55it/s]
8%|▊ | 747/9770 [11:31<1:38:27, 1.53it/s]
8%|▊ | 748/9770 [11:32<1:37:52, 1.54it/s]
8%|▊ | 749/9770 [11:32<1:38:41, 1.52it/s]
8%|▊ | 750/9770 [11:33<1:39:42, 1.51it/s]
8%|▊ | 750/9770 [11:33<1:39:42, 1.51it/s]
8%|▊ | 751/9770 [11:34<1:39:58, 1.50it/s]
8%|▊
+0: {'loss': 0.7547, 'grad_norm': 0.7091292439205819, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.08}
+0: | 752/9770 [11:34<1:38:57, 1.52it/s]
8%|▊ | 753/9770 [11:35<1:38:37, 1.52it/s]
8%|▊ | 754/9770 [11:36<1:39:21, 1.51it/s]
8%|▊ | 755/9770 [11:36<1:39:53, 1.50it/s]
8%|▊ | 756/9770 [11:37<1:39:18, 1.51it/s]
8%|▊ | 757/9770 [11:38<1:38:44, 1.52it/s]
8%|▊ | 758/9770 [11:38<1:38:42, 1.52it/s]
8%|▊ | 759/9770 [11:39<1:38:18, 1.53it/s]
8%|▊ | 760/9770 [11:40<1:37:54, 1.53it/s]
8%|▊ | 760/9770 [11:40<1:37:54, 1.53it/s]
8%|▊ | 761/9770 [11:40<1:38:03, 1.53it/s]
8%|▊ | 762/9770 [11:41<1:38:07, 1.53it/s]
8%|▊ | 763/9770 [11:42<1:38:17, 1.53it/s]
8%|▊ | 764/9770 [11:42<1:37:04, 1.55it/s]
8%|▊ | 765/9770 [11:43<1:36:30, 1.56it/s]
8%|▊ | 766/9770 [11:43<1:36:23, 1.56it/s]
8%|▊ | 767/9770 [11:44<1:37:36, 1.54it/s]
8%|▊ | 768/9770 [11:45<1:37:14,
+0: {'loss': 0.723, 'grad_norm': 0.7466168764271491, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.08}
+0: {'loss': 0.7481, 'grad_norm': 0.6832693090356019, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.08}
+0: 1.54it/s]
8%|▊ | 769/9770 [11:45<1:38:44, 1.52it/s]
8%|▊ | 770/9770 [11:46<1:39:45, 1.50it/s]
8%|▊ | 770/9770 [11:46<1:39:45, 1.50it/s]
8%|▊ | 771/9770 [11:47<1:39:13, 1.51it/s]
8%|▊ | 772/9770 [11:47<1:41:28, 1.48it/s]
8%|▊ | 773/9770 [11:48<1:42:31, 1.46it/s]
8%|▊ | 774/9770 [11:49<1:42:52, 1.46it/s]
8%|▊ | 775/9770 [11:50<1:42:02, 1.47it/s]
8%|▊ | 776/9770 [11:50<1:41:22, 1.48it/s]
8%|▊ | 777/9770 [11:51<1:41:20, 1.48it/s]
8%|▊ | 778/9770 [11:52<1:41:43, 1.47it/s]
8%|▊ | 779/9770 [11:52<1:39:57, 1.50it/s]
8%|▊ | 780/9770 [11:53<1:38:35, 1.52it/s]
8%|▊ | 780/9770 [11:53<1:38:35, 1.52it/s]
8%|▊ | 781/9770 [11:53<1:38:24, 1.52it/s]
8%|▊ | 782/9770 [11:54<1:37:56, 1.53it/s]
8%|▊ | 783/97
+0: {'loss': 0.7379, 'grad_norm': 0.6880026205183939, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.08}
+0: 70 [11:55<1:38:03, 1.53it/s]
8%|▊ | 784/9770 [11:55<1:37:42, 1.53it/s]
8%|▊ | 785/9770 [11:56<1:37:38, 1.53it/s]
8%|▊ | 786/9770 [11:57<1:37:46, 1.53it/s]
8%|▊ | 787/9770 [11:57<1:37:48, 1.53it/s]
8%|▊ | 788/9770 [11:58<1:38:15, 1.52it/s]
8%|▊ | 789/9770 [11:59<1:37:31, 1.53it/s]
8%|▊ | 790/9770 [11:59<1:37:42, 1.53it/s]
8%|▊ | 790/9770 [11:59<1:37:42, 1.53it/s]
8%|▊ | 791/9770 [12:00<1:38:31, 1.52it/s]
8%|▊ | 792/9770 [12:01<1:38:39, 1.52it/s]
8%|▊ | 793/9770 [12:01<1:38:40, 1.52it/s]
8%|▊ | 794/9770 [12:02<1:39:23, 1.51it/s]
8%|▊ | 795/9770 [12:03<1:40:18, 1.49it/s]
8%|▊ | 796/9770 [12:03<1:38:46, 1.51it/s]
8%|▊ | 797/9770 [12:04<1:38:24, 1.52it/s]
8%|▊ | 798/9770 [12:05<1:39:50, 1.50it/s]
8%|▊ | 799/9770 [12:05<1:39:51, 1.50it/s]
8%|
+0: {'loss': 0.7226, 'grad_norm': 0.6495199535599289, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.08}
+0: {'loss': 0.743, 'grad_norm': 0.6885464404777386, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.08}
+0: ▊ | 800/9770 [12:06<1:39:52, 1.50it/s]
8%|▊ | 800/9770 [12:06<1:39:52, 1.50it/s]
8%|▊ | 801/9770 [12:07<1:40:08, 1.49it/s]
8%|▊ | 802/9770 [12:07<1:41:49, 1.47it/s]
8%|▊ | 803/9770 [12:08<1:40:14, 1.49it/s]
8%|▊ | 804/9770 [12:09<1:40:06, 1.49it/s]
8%|▊ | 805/9770 [12:09<1:38:54, 1.51it/s]
8%|▊ | 806/9770 [12:10<1:39:15, 1.51it/s]
8%|▊ | 807/9770 [12:11<1:38:58, 1.51it/s]
8%|▊ | 808/9770 [12:11<1:39:41, 1.50it/s]
8%|▊ | 809/9770 [12:12<1:39:45, 1.50it/s]
8%|▊ | 810/9770 [12:13<1:38:18, 1.52it/s]
8%|▊ | 810/9770 [12:13<1:38:18, 1.52it/s]
8%|▊ | 811/9770 [12:13<1:37:25, 1.53it/s]
8%|▊ | 812/9770 [12:14<1:37:35, 1.53it/s]
8%|▊ | 813/9770 [12:15<1:37:38, 1.53it/s]
8%|▊ | 814/9770 [12:15<1:38:4
+0: {'loss': 0.7356, 'grad_norm': 0.6882226690537028, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.08}
+0: 7, 1.51it/s]
8%|▊ | 815/9770 [12:16<1:39:42, 1.50it/s]
8%|▊ | 816/9770 [12:17<1:39:07, 1.51it/s]
8%|▊ | 817/9770 [12:17<1:39:03, 1.51it/s]
8%|▊ | 818/9770 [12:18<1:37:49, 1.53it/s]
8%|▊ | 819/9770 [12:19<1:37:48, 1.53it/s]
8%|▊ | 820/9770 [12:19<1:38:08, 1.52it/s]
8%|▊ | 820/9770 [12:19<1:38:08, 1.52it/s]
8%|▊ | 821/9770 [12:20<1:37:44, 1.53it/s]
8%|▊ | 822/9770 [12:21<1:37:30, 1.53it/s]
8%|▊ | 823/9770 [12:21<1:37:44, 1.53it/s]
8%|▊ | 824/9770 [12:22<1:37:41, 1.53it/s]
8%|▊ | 825/9770 [12:23<1:36:49, 1.54it/s]
8%|▊ | 826/9770 [12:23<1:37:28, 1.53it/s]
8%|▊ | 827/9770 [12:24<1:37:15, 1.53it/s]
8%|▊ | 828/9770 [12:24<1:37:32, 1.53it/s]
8%|▊ | 829/9770 [12:25<1:37:05, 1.53it/s]
8%|▊ | 830/9770 [12:26<1:36:59, 1.54it/s]
+0: {'loss': 0.7216, 'grad_norm': 0.6690167991754846, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.08}
+0: {'loss': 0.7211, 'grad_norm': 0.7265324626150281, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.09}
+0:
8%|▊ | 830/9770 [12:26<1:36:59, 1.54it/s]
9%|▊ | 831/9770 [12:26<1:37:18, 1.53it/s]
9%|▊ | 832/9770 [12:27<1:39:15, 1.50it/s]
9%|▊ | 833/9770 [12:28<1:39:04, 1.50it/s]
9%|▊ | 834/9770 [12:28<1:37:41, 1.52it/s]
9%|▊ | 835/9770 [12:29<1:37:11, 1.53it/s]
9%|▊ | 836/9770 [12:30<1:37:45, 1.52it/s]
9%|▊ | 837/9770 [12:30<1:39:53, 1.49it/s]
9%|▊ | 838/9770 [12:31<1:37:55, 1.52it/s]
9%|▊ | 839/9770 [12:32<1:39:41, 1.49it/s]
9%|▊ | 840/9770 [12:32<1:39:04, 1.50it/s]
9%|▊ | 840/9770 [12:32<1:39:04, 1.50it/s]
9%|▊ | 841/9770 [12:33<1:38:49, 1.51it/s]
9%|▊ | 842/9770 [12:34<1:38:06, 1.52it/s]
9%|▊ | 843/9770 [12:34<1:37:31, 1.53it/s]
9%|▊ | 844/9770 [12:35<1:37:19, 1.53it/s]
9%|▊ | 845/9770 [12:36<1:39:02, 1.50it/s]
+0: {'loss': 0.7319, 'grad_norm': 0.7130304396605052, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.09}
+0: {'loss': 0.7556, 'grad_norm': 0.6949832508567161, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.09}
+0: 9%|▊ | 846/9770 [12:36<1:38:24, 1.51it/s]
9%|▊ | 847/9770 [12:37<1:38:41, 1.51it/s]
9%|▊ | 848/9770 [12:38<1:39:36, 1.49it/s]
9%|▊ | 849/9770 [12:38<1:38:39, 1.51it/s]
9%|▊ | 850/9770 [12:39<1:37:31, 1.52it/s]
9%|▊ | 850/9770 [12:39<1:37:31, 1.52it/s]
9%|▊ | 851/9770 [12:40<1:37:07, 1.53it/s]
9%|▊ | 852/9770 [12:40<1:39:05, 1.50it/s]
9%|▊ | 853/9770 [12:41<1:40:20, 1.48it/s]
9%|▊ | 854/9770 [12:42<1:39:44, 1.49it/s]
9%|▉ | 855/9770 [12:42<1:38:50, 1.50it/s]
9%|▉ | 856/9770 [12:43<1:39:16, 1.50it/s]
9%|▉ | 857/9770 [12:44<1:38:50, 1.50it/s]
9%|▉ | 858/9770 [12:44<1:38:26, 1.51it/s]
9%|▉ | 859/9770 [12:45<1:38:52, 1.50it/s]
9%|▉ | 860/9770 [12:46<1:39:06, 1.50it/s]
9%|▉ | 860/9770 [12:46<1:3
+0: {'loss': 0.7171, 'grad_norm': 0.7461139430490219, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.09}
+0: 9:06, 1.50it/s]
9%|▉ | 861/9770 [12:46<1:38:19, 1.51it/s]
9%|▉ | 862/9770 [12:47<1:38:41, 1.50it/s]
9%|▉ | 863/9770 [12:48<1:38:55, 1.50it/s]
9%|▉ | 864/9770 [12:48<1:38:00, 1.51it/s]
9%|▉ | 865/9770 [12:49<1:38:11, 1.51it/s]
9%|▉ | 866/9770 [12:50<1:37:49, 1.52it/s]
9%|▉ | 867/9770 [12:50<1:38:51, 1.50it/s]
9%|▉ | 868/9770 [12:51<1:39:27, 1.49it/s]
9%|▉ | 869/9770 [12:52<1:38:02, 1.51it/s]
9%|▉ | 870/9770 [12:52<1:38:14, 1.51it/s]
9%|▉ | 870/9770 [12:52<1:38:14, 1.51it/s]
9%|▉ | 871/9770 [12:53<1:38:15, 1.51it/s]
9%|▉ | 872/9770 [12:54<1:40:13, 1.48it/s]
9%|▉ | 873/9770 [12:54<1:38:50, 1.50it/s]
9%|▉ | 874/9770 [12:55<1:39:23, 1.49it/s]
9%|▉ | 875/9770 [12:56<1:38:46, 1.50it/s]
9%|▉ | 876/9770 [12:56<1:38:34, 1.50it/s]
9%|▉ |
+0: {'loss': 0.7209, 'grad_norm': 0.7516643611434419, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.09}
+0: {'loss': 0.7057, 'grad_norm': 0.662765127238728, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.09}
+0: 877/9770 [12:57<1:38:33, 1.50it/s]
9%|▉ | 878/9770 [12:58<1:38:28, 1.51it/s]
9%|▉ | 879/9770 [12:58<1:37:13, 1.52it/s]
9%|▉ | 880/9770 [12:59<1:38:05, 1.51it/s]
9%|▉ | 880/9770 [12:59<1:38:05, 1.51it/s]
9%|▉ | 881/9770 [13:00<1:36:56, 1.53it/s]
9%|▉ | 882/9770 [13:00<1:36:24, 1.54it/s]
9%|▉ | 883/9770 [13:01<1:36:16, 1.54it/s]
9%|▉ | 884/9770 [13:02<1:37:06, 1.53it/s]
9%|▉ | 885/9770 [13:02<1:37:31, 1.52it/s]
9%|▉ | 886/9770 [13:03<1:37:59, 1.51it/s]
9%|▉ | 887/9770 [13:04<1:37:35, 1.52it/s]
9%|▉ | 888/9770 [13:04<1:37:44, 1.51it/s]
9%|▉ | 889/9770 [13:05<1:39:24, 1.49it/s]
9%|▉ | 890/9770 [13:06<1:40:21, 1.47it/s]
9%|▉ | 890/9770 [13:06<1:40:21, 1.47it/s]
9%|▉ | 891/9770 [13:06<1:39:56, 1.48it/s]
+0: {'loss': 0.7489, 'grad_norm': 0.740736768151442, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.09}
+0:
9%|▉ | 892/9770 [13:07<1:39:30, 1.49it/s]
9%|▉ | 893/9770 [13:08<1:37:38, 1.52it/s]
9%|▉ | 894/9770 [13:08<1:37:28, 1.52it/s]
9%|▉ | 895/9770 [13:09<1:37:33, 1.52it/s]
9%|▉ | 896/9770 [13:10<1:37:50, 1.51it/s]
9%|▉ | 897/9770 [13:10<1:36:32, 1.53it/s]
9%|▉ | 898/9770 [13:11<1:38:20, 1.50it/s]
9%|▉ | 899/9770 [13:12<1:38:10, 1.51it/s]
9%|▉ | 900/9770 [13:12<1:37:04, 1.52it/s]
9%|▉ | 900/9770 [13:12<1:37:04, 1.52it/s]
9%|▉ | 901/9770 [13:13<1:37:48, 1.51it/s]
9%|▉ | 902/9770 [13:14<1:37:09, 1.52it/s]
9%|▉ | 903/9770 [13:14<1:35:57, 1.54it/s]
9%|▉ | 904/9770 [13:15<1:38:54, 1.49it/s]
9%|▉ | 905/9770 [13:16<1:38:12, 1.50it/s]
9%|▉ | 906/9770 [13:16<1:37:49, 1.51it/s]
9%|▉ | 907/9770 [13:17<1:38:03, 1.51it/s]
9%|▉ | 908/9770 [13:18
+0: {'loss': 0.7295, 'grad_norm': 0.7544407365175977, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.09}
+0: {'loss': 0.7226, 'grad_norm': 0.6464521728241293, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.09}
+0: <1:38:37, 1.50it/s]
9%|▉ | 909/9770 [13:18<1:38:28, 1.50it/s]
9%|▉ | 910/9770 [13:19<1:38:28, 1.50it/s]
9%|▉ | 910/9770 [13:19<1:38:28, 1.50it/s]
9%|▉ | 911/9770 [13:19<1:37:52, 1.51it/s]
9%|▉ | 912/9770 [13:20<1:37:58, 1.51it/s]
9%|▉ | 913/9770 [13:21<1:37:50, 1.51it/s]
9%|▉ | 914/9770 [13:21<1:36:50, 1.52it/s]
9%|▉ | 915/9770 [13:22<1:36:39, 1.53it/s]
9%|▉ | 916/9770 [13:23<1:38:02, 1.51it/s]
9%|▉ | 917/9770 [13:23<1:37:36, 1.51it/s]
9%|▉ | 918/9770 [13:24<1:37:58, 1.51it/s]
9%|▉ | 919/9770 [13:25<1:37:20, 1.52it/s]
9%|▉ | 920/9770 [13:25<1:37:57, 1.51it/s]
9%|▉ | 920/9770 [13:25<1:37:57, 1.51it/s]
9%|▉ | 921/9770 [13:26<1:37:46, 1.51it/s]
9%|▉ | 922/9770 [13:27<1:37:48, 1.51it/s]
9%|▉
+0: {'loss': 0.7197, 'grad_norm': 0.6797085168266879, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.1}
+0: | 923/9770 [13:27<1:37:18, 1.52it/s]
9%|▉ | 924/9770 [13:28<1:36:44, 1.52it/s]
9%|▉ | 925/9770 [13:29<1:36:14, 1.53it/s]
9%|▉ | 926/9770 [13:29<1:36:23, 1.53it/s]
9%|▉ | 927/9770 [13:30<1:37:16, 1.52it/s]
9%|▉ | 928/9770 [13:31<1:36:43, 1.52it/s]
10%|▉ | 929/9770 [13:31<1:36:15, 1.53it/s]
10%|▉ | 930/9770 [13:32<1:35:46, 1.54it/s]
10%|▉ | 930/9770 [13:32<1:35:46, 1.54it/s]
10%|▉ | 931/9770 [13:33<1:36:06, 1.53it/s]
10%|▉ | 932/9770 [13:33<1:36:59, 1.52it/s]
10%|▉ | 933/9770 [13:34<1:39:07, 1.49it/s]
10%|▉ | 934/9770 [13:35<1:38:42, 1.49it/s]
10%|▉ | 935/9770 [13:35<1:38:59, 1.49it/s]
10%|▉ | 936/9770 [13:36<1:39:32, 1.48it/s]
10%|▉ | 937/9770 [13:37<1:40:46, 1.46it/s]
10%|▉ | 938/9770 [13:37<1:39:46, 1.48it/s]
10%|▉ | 939/9770 [13:38<1:39:02, 1.49i
+0: {'loss': 0.7279, 'grad_norm': 0.6516631120912061, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.1}
+0: {'loss': 0.7177, 'grad_norm': 0.6500084335367152, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.1}
+0: t/s]
10%|▉ | 940/9770 [13:39<1:38:40, 1.49it/s]
10%|▉ | 940/9770 [13:39<1:38:40, 1.49it/s]
10%|▉ | 941/9770 [13:39<1:38:18, 1.50it/s]
10%|▉ | 942/9770 [13:40<1:37:07, 1.51it/s]
10%|▉ | 943/9770 [13:41<1:36:23, 1.53it/s]
10%|▉ | 944/9770 [13:41<1:35:37, 1.54it/s]
10%|▉ | 945/9770 [13:42<1:35:32, 1.54it/s]
10%|▉ | 946/9770 [13:43<1:37:52, 1.50it/s]
10%|▉ | 947/9770 [13:43<1:38:20, 1.50it/s]
10%|▉ | 948/9770 [13:44<1:38:22, 1.49it/s]
10%|▉ | 949/9770 [13:45<1:37:54, 1.50it/s]
10%|▉ | 950/9770 [13:45<1:37:24, 1.51it/s]
10%|▉ | 950/9770 [13:45<1:37:24, 1.51it/s]
10%|▉ | 951/9770 [13:46<1:38:00, 1.50it/s]
10%|▉ | 952/9770 [13:47<1:38:05, 1.50it/s]
10%|▉ | 953/9770 [13:47<1:36:38, 1.52it/s]
10%|▉ | 954/9770 [13
+0: {'loss': 0.7277, 'grad_norm': 0.6893143454628127, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.1}
+0: :48<1:35:26, 1.54it/s]
10%|▉ | 955/9770 [13:49<1:36:15, 1.53it/s]
10%|▉ | 956/9770 [13:49<1:35:37, 1.54it/s]
10%|▉ | 957/9770 [13:50<1:36:01, 1.53it/s]
10%|▉ | 958/9770 [13:51<1:36:03, 1.53it/s]
10%|▉ | 959/9770 [13:51<1:36:01, 1.53it/s]
10%|▉ | 960/9770 [13:52<1:35:21, 1.54it/s]
10%|▉ | 960/9770 [13:52<1:35:21, 1.54it/s]
10%|▉ | 961/9770 [13:53<1:35:43, 1.53it/s]
10%|▉ | 962/9770 [13:53<1:35:23, 1.54it/s]
10%|▉ | 963/9770 [13:54<1:35:12, 1.54it/s]
10%|▉ | 964/9770 [13:54<1:36:34, 1.52it/s]
10%|▉ | 965/9770 [13:55<1:36:45, 1.52it/s]
10%|▉ | 966/9770 [13:56<1:36:45, 1.52it/s]
10%|▉ | 967/9770 [13:56<1:36:51, 1.51it/s]
10%|▉ | 968/9770 [13:57<1:36:03, 1.53it/s]
10%|▉ | 969/9770 [13:58<1:36:48, 1.52it/s]
10%|▉ | 970/9770 [13:58<1:36:35, 1.52it/s]
+0: {'loss': 0.7343, 'grad_norm': 0.6982460126696003, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.1}
+0: {'loss': 0.7313, 'grad_norm': 0.6661299414245518, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.1}
+0:
10%|▉ | 970/9770 [13:58<1:36:35, 1.52it/s]
10%|▉ | 971/9770 [13:59<1:36:47, 1.52it/s]
10%|▉ | 972/9770 [14:00<1:37:07, 1.51it/s]
10%|▉ | 973/9770 [14:00<1:37:29, 1.50it/s]
10%|▉ | 974/9770 [14:01<1:37:01, 1.51it/s]
10%|▉ | 975/9770 [14:02<1:36:39, 1.52it/s]
10%|▉ | 976/9770 [14:02<1:36:29, 1.52it/s]
10%|█ | 977/9770 [14:03<1:37:35, 1.50it/s]
10%|█ | 978/9770 [14:04<1:35:45, 1.53it/s]
10%|█ | 979/9770 [14:04<1:36:22, 1.52it/s]
10%|█ | 980/9770 [14:05<1:35:48, 1.53it/s]
10%|█ | 980/9770 [14:05<1:35:48, 1.53it/s]
10%|█ | 981/9770 [14:06<1:36:14, 1.52it/s]
10%|█ | 982/9770 [14:06<1:35:53, 1.53it/s]
10%|█ | 983/9770 [14:07<1:35:49, 1.53it/s]
10%|█ | 984/9770 [14:08<1:36:45, 1.51it/s]
10%|█ | 985/9770 [14:08<1:37:44, 1.
+0: {'loss': 0.7384, 'grad_norm': 0.6903639148703681, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.1}
+0: {'loss': 0.7234, 'grad_norm': 0.7065131396712995, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.1}
+0: 50it/s]
10%|█ | 986/9770 [14:09<1:36:54, 1.51it/s]
10%|█ | 987/9770 [14:10<1:37:10, 1.51it/s]
10%|█ | 988/9770 [14:10<1:39:23, 1.47it/s]
10%|█ | 989/9770 [14:11<1:37:32, 1.50it/s]
10%|█ | 990/9770 [14:12<1:39:20, 1.47it/s]
10%|█ | 990/9770 [14:12<1:39:20, 1.47it/s]
10%|█ | 991/9770 [14:12<1:37:44, 1.50it/s]
10%|█ | 992/9770 [14:13<1:36:50, 1.51it/s]
10%|█ | 993/9770 [14:14<1:37:01, 1.51it/s]
10%|█ | 994/9770 [14:14<1:38:45, 1.48it/s]
10%|█ | 995/9770 [14:15<1:37:30, 1.50it/s]
10%|█ | 996/9770 [14:16<1:37:48, 1.50it/s]
10%|█ | 997/9770 [14:16<1:38:31, 1.48it/s]
10%|█ | 998/9770 [14:17<1:37:27, 1.50it/s]
10%|█ | 999/9770 [14:18<1:35:31, 1.53it/s]
10%|█ | 1000/9770 [14:18<1:35:20, 1.53it/s]
10%|█ | 1000/97
+0: {'loss': 0.7326, 'grad_norm': 0.7039545270723172, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.1}
+0: 70 [14:18<1:35:20, 1.53it/s]
10%|█ | 1001/9770 [14:19<1:36:03, 1.52it/s]
10%|█ | 1002/9770 [14:20<1:35:51, 1.52it/s]
10%|█ | 1003/9770 [14:20<1:35:42, 1.53it/s]
10%|█ | 1004/9770 [14:21<1:34:22, 1.55it/s]
10%|█ | 1005/9770 [14:22<1:35:28, 1.53it/s]
10%|█ | 1006/9770 [14:22<1:35:24, 1.53it/s]
10%|█ | 1007/9770 [14:23<1:36:13, 1.52it/s]
10%|█ | 1008/9770 [14:24<1:36:12, 1.52it/s]
10%|█ | 1009/9770 [14:24<1:36:35, 1.51it/s]
10%|█ | 1010/9770 [14:25<1:36:07, 1.52it/s]
10%|█ | 1010/9770 [14:25<1:36:07, 1.52it/s]
10%|█ | 1011/9770 [14:26<1:35:47, 1.52it/s]
10%|█ | 1012/9770 [14:26<1:37:16, 1.50it/s]
10%|█ | 1013/9770 [14:27<1:36:49, 1.51it/s]
10%|█ | 1014/9770 [14:28<1:36:19, 1.51it/s]
10%|█ | 1015/9770 [14:28<1:36:03, 1.52it/s]
10%|█ | 1016/9770 [14:29<1:38:33
+0: {'loss': 0.7301, 'grad_norm': 0.6962783177309145, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.1}
+0: {'loss': 0.7358, 'grad_norm': 0.6653584314961221, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.11}
+0: , 1.48it/s]
10%|█ | 1017/9770 [14:30<1:37:19, 1.50it/s]
10%|█ | 1018/9770 [14:30<1:37:02, 1.50it/s]
10%|█ | 1019/9770 [14:31<1:36:43, 1.51it/s]
10%|█ | 1020/9770 [14:32<1:37:20, 1.50it/s]
10%|█ | 1020/9770 [14:32<1:37:20, 1.50it/s]
10%|█ | 1021/9770 [14:32<1:35:57, 1.52it/s]
10%|█ | 1022/9770 [14:33<1:35:21, 1.53it/s]
10%|█ | 1023/9770 [14:33<1:35:08, 1.53it/s]
10%|█ | 1024/9770 [14:34<1:37:37, 1.49it/s]
10%|█ | 1025/9770 [14:35<1:37:07, 1.50it/s]
11%|█ | 1026/9770 [14:36<1:36:34, 1.51it/s]
11%|█ | 1027/9770 [14:36<1:36:54, 1.50it/s]
11%|█ | 1028/9770 [14:37<1:37:29, 1.49it/s]
11%|█ | 1029/9770 [14:38<1:36:32, 1.51it/s]
11%|█ | 1030/9770 [14:38<1:36:39, 1.51it/s]
11%|█ | 1030/9770 [14:38<1:36:39, 1.51it/s]
11%|
+0: {'loss': 0.7101, 'grad_norm': 0.6313113690595087, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.11}
+0: █ | 1031/9770 [14:39<1:35:51, 1.52it/s]
11%|█ | 1032/9770 [14:39<1:34:55, 1.53it/s]
11%|█ | 1033/9770 [14:40<1:34:39, 1.54it/s]
11%|█ | 1034/9770 [14:41<1:35:40, 1.52it/s]
11%|█ | 1035/9770 [14:41<1:35:47, 1.52it/s]
11%|█ | 1036/9770 [14:42<1:35:16, 1.53it/s]
11%|█ | 1037/9770 [14:43<1:35:15, 1.53it/s]
11%|█ | 1038/9770 [14:43<1:34:17, 1.54it/s]
11%|█ | 1039/9770 [14:44<1:33:08, 1.56it/s]
11%|█ | 1040/9770 [14:45<1:35:31, 1.52it/s]
11%|█ | 1040/9770 [14:45<1:35:31, 1.52it/s]
11%|█ | 1041/9770 [14:45<1:34:51, 1.53it/s]
11%|█ | 1042/9770 [14:46<1:37:01, 1.50it/s]
11%|█ | 1043/9770 [14:47<1:37:08, 1.50it/s]
11%|█ | 1044/9770 [14:47<1:37:20, 1.49it/s]
11%|█ | 1045/9770 [14:48<1:36:52, 1.50it/s]
11%|█ | 1046/9770 [14:49<1:35:39, 1.52it/s]
11%|█ | 104
+0: {'loss': 0.7098, 'grad_norm': 0.6302999246927546, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.11}
+0: {'loss': 0.7113, 'grad_norm': 0.6585400529639046, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.11}
+0: 7/9770 [14:49<1:35:12, 1.53it/s]
11%|█ | 1048/9770 [14:50<1:35:00, 1.53it/s]
11%|█ | 1049/9770 [14:51<1:34:55, 1.53it/s]
11%|█ | 1050/9770 [14:51<1:34:29, 1.54it/s]
11%|█ | 1050/9770 [14:51<1:34:29, 1.54it/s]
11%|█ | 1051/9770 [14:52<1:34:41, 1.53it/s]
11%|█ | 1052/9770 [14:53<1:33:51, 1.55it/s]
11%|█ | 1053/9770 [14:53<1:34:37, 1.54it/s]
11%|█ | 1054/9770 [14:54<1:35:12, 1.53it/s]
11%|█ | 1055/9770 [14:55<1:35:01, 1.53it/s]
11%|█ | 1056/9770 [14:55<1:35:10, 1.53it/s]
11%|█ | 1057/9770 [14:56<1:36:02, 1.51it/s]
11%|█ | 1058/9770 [14:57<1:35:31, 1.52it/s]
11%|█ | 1059/9770 [14:57<1:35:13, 1.52it/s]
11%|█ | 1060/9770 [14:58<1:35:44, 1.52it/s]
11%|█ | 1060/9770 [14:58<1:35:44, 1.52it/s]
11%|█ | 1061/9770 [14:58<1:35
+0: {'loss': 0.7311, 'grad_norm': 0.7449615344148319, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.11}
+0: :18, 1.52it/s]
11%|█ | 1062/9770 [14:59<1:35:10, 1.52it/s]
11%|█ | 1063/9770 [15:00<1:36:01, 1.51it/s]
11%|█ | 1064/9770 [15:00<1:36:47, 1.50it/s]
11%|█ | 1065/9770 [15:01<1:35:52, 1.51it/s]
11%|█ | 1066/9770 [15:02<1:35:50, 1.51it/s]
11%|█ | 1067/9770 [15:02<1:34:58, 1.53it/s]
11%|█ | 1068/9770 [15:03<1:34:10, 1.54it/s]
11%|█ | 1069/9770 [15:04<1:34:34, 1.53it/s]
11%|█ | 1070/9770 [15:04<1:34:46, 1.53it/s]
11%|█ | 1070/9770 [15:04<1:34:46, 1.53it/s]
11%|█ | 1071/9770 [15:05<1:37:09, 1.49it/s]
11%|█ | 1072/9770 [15:06<1:38:20, 1.47it/s]
11%|█ | 1073/9770 [15:06<1:37:03, 1.49it/s]
11%|█ | 1074/9770 [15:07<1:35:45, 1.51it/s]
11%|█ | 1075/9770 [15:08<1:36:17, 1.51it/s]
11%|█ | 1076/9770 [15:08<1:36:27, 1.50it/s]
11%|█ | 1077/9770 [15:09<1:35:54, 1.51it/s]
+0: {'loss': 0.7135, 'grad_norm': 0.7170818401140561, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.11}
+0: {'loss': 0.7215, 'grad_norm': 0.7236218911060883, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.11}
+0: 11%|█ | 1078/9770 [15:10<1:35:01, 1.52it/s]
11%|█ | 1079/9770 [15:10<1:34:41, 1.53it/s]
11%|█ | 1080/9770 [15:11<1:34:04, 1.54it/s]
11%|█ | 1080/9770 [15:11<1:34:04, 1.54it/s]
11%|█ | 1081/9770 [15:12<1:33:46, 1.54it/s]
11%|█ | 1082/9770 [15:12<1:34:38, 1.53it/s]
11%|█ | 1083/9770 [15:13<1:34:59, 1.52it/s]
11%|█ | 1084/9770 [15:14<1:34:07, 1.54it/s]
11%|█ | 1085/9770 [15:14<1:35:10, 1.52it/s]
11%|█ | 1086/9770 [15:15<1:35:14, 1.52it/s]
11%|█ | 1087/9770 [15:16<1:36:57, 1.49it/s]
11%|█ | 1088/9770 [15:16<1:36:23, 1.50it/s]
11%|█ | 1089/9770 [15:17<1:36:35, 1.50it/s]
11%|█ | 1090/9770 [15:18<1:36:11, 1.50it/s]
11%|█ | 1090/9770 [15:18<1:36:11, 1.50it/s]
11%|█ | 1091/9770 [15:18<1:36:08, 1.50it/s]
11%|█ |
+0: {'loss': 0.7336, 'grad_norm': 0.7232609581471786, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.11}
+0: 1092/9770 [15:19<1:37:39, 1.48it/s]
11%|█ | 1093/9770 [15:20<1:38:32, 1.47it/s]
11%|█ | 1094/9770 [15:20<1:37:39, 1.48it/s]
11%|█ | 1095/9770 [15:21<1:37:18, 1.49it/s]
11%|█ | 1096/9770 [15:22<1:37:02, 1.49it/s]
11%|█ | 1097/9770 [15:22<1:35:25, 1.51it/s]
11%|█ | 1098/9770 [15:23<1:35:50, 1.51it/s]
11%|█ | 1099/9770 [15:24<1:35:32, 1.51it/s]
11%|█▏ | 1100/9770 [15:24<1:36:27, 1.50it/s]
11%|█▏ | 1100/9770 [15:24<1:36:27, 1.50it/s]
11%|█▏ | 1101/9770 [15:25<1:36:43, 1.49it/s]
11%|█▏ | 1102/9770 [15:26<1:36:05, 1.50it/s]
11%|█▏ | 1103/9770 [15:26<1:36:04, 1.50it/s]
11%|█▏ | 1104/9770 [15:27<1:35:47, 1.51it/s]
11%|█▏ | 1105/9770 [15:28<1:34:50, 1.52it/s]
11%|█▏ | 1106/9770 [15:28<1:35:08, 1.52it/s]
11%|█▏ | 1107/9770 [15:29<1:34:08, 1.53it/s]
11%|█▏
+0: {'loss': 0.7373, 'grad_norm': 0.7452568041300582, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.11}
+0: {'loss': 0.708, 'grad_norm': 0.6841435218462218, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.11}
+0: | 1108/9770 [15:30<1:34:28, 1.53it/s]
11%|█▏ | 1109/9770 [15:30<1:33:55, 1.54it/s]
11%|█▏ | 1110/9770 [15:31<1:34:39, 1.52it/s]
11%|█▏ | 1110/9770 [15:31<1:34:39, 1.52it/s]
11%|█▏ | 1111/9770 [15:32<1:34:13, 1.53it/s]
11%|█▏ | 1112/9770 [15:32<1:34:35, 1.53it/s]
11%|█▏ | 1113/9770 [15:33<1:35:01, 1.52it/s]
11%|█▏ | 1114/9770 [15:34<1:35:27, 1.51it/s]
11%|█▏ | 1115/9770 [15:34<1:35:25, 1.51it/s]
11%|█▏ | 1116/9770 [15:35<1:36:18, 1.50it/s]
11%|█▏ | 1117/9770 [15:36<1:37:09, 1.48it/s]
11%|█▏ | 1118/9770 [15:36<1:36:44, 1.49it/s]
11%|█▏ | 1119/9770 [15:37<1:36:11, 1.50it/s]
11%|█▏ | 1120/9770 [15:38<1:35:13, 1.51it/s]
11%|█▏ | 1120/9770 [15:38<1:35:13, 1.51it/s]
11%|█▏ | 1121/9770 [15:38<1:34:36, 1.52it/s]
11%
+0: {'loss': 0.7046, 'grad_norm': 0.6645545244306722, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.12}
+0: |█▏ | 1122/9770 [15:39<1:36:52, 1.49it/s]
11%|█▏ | 1123/9770 [15:40<1:36:20, 1.50it/s]
12%|█▏ | 1124/9770 [15:40<1:35:46, 1.50it/s]
12%|█▏ | 1125/9770 [15:41<1:34:49, 1.52it/s]
12%|█▏ | 1126/9770 [15:42<1:33:59, 1.53it/s]
12%|█▏ | 1127/9770 [15:42<1:33:50, 1.54it/s]
12%|█▏ | 1128/9770 [15:43<1:34:01, 1.53it/s]
12%|█▏ | 1129/9770 [15:43<1:33:32, 1.54it/s]
12%|█▏ | 1130/9770 [15:44<1:33:51, 1.53it/s]
12%|█▏ | 1130/9770 [15:44<1:33:51, 1.53it/s]
12%|█▏ | 1131/9770 [15:45<1:34:49, 1.52it/s]
12%|█▏ | 1132/9770 [15:45<1:34:41, 1.52it/s]
12%|█▏ | 1133/9770 [15:46<1:35:13, 1.51it/s]
12%|█▏ | 1134/9770 [15:47<1:35:24, 1.51it/s]
12%|█▏ | 1135/9770 [15:47<1:34:46, 1.52it/s]
12%|█▏ | 1136/9770 [15:48<1:33:48, 1.53it/s]
12%|█▏ | 1137/9770 [15:49<1:33:45
+0: {'loss': 0.7214, 'grad_norm': 0.6926276834784485, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.12}
+0: {'loss': 0.7177, 'grad_norm': 0.6652073621218122, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.12}
+0: , 1.53it/s]
12%|█▏ | 1138/9770 [15:49<1:33:39, 1.54it/s]
12%|█▏ | 1139/9770 [15:50<1:33:52, 1.53it/s]
12%|█▏ | 1140/9770 [15:51<1:34:48, 1.52it/s]
12%|█▏ | 1140/9770 [15:51<1:34:48, 1.52it/s]
12%|█▏ | 1141/9770 [15:51<1:34:54, 1.52it/s]
12%|█▏ | 1142/9770 [15:52<1:35:03, 1.51it/s]
12%|█▏ | 1143/9770 [15:53<1:37:18, 1.48it/s]
12%|█▏ | 1144/9770 [15:53<1:37:06, 1.48it/s]
12%|█▏ | 1145/9770 [15:54<1:36:30, 1.49it/s]
12%|█▏ | 1146/9770 [15:55<1:35:15, 1.51it/s]
12%|█▏ | 1147/9770 [15:55<1:34:44, 1.52it/s]
12%|█▏ | 1148/9770 [15:56<1:34:25, 1.52it/s]
12%|█▏ | 1149/9770 [15:57<1:34:30, 1.52it/s]
12%|█▏ | 1150/9770 [15:57<1:34:32, 1.52it/s]
12%|█▏ | 1150/9770 [15:57<1:34:32, 1.52it/s]
12%|█▏ | 1151/9770
+0: {'loss': 0.7209, 'grad_norm': 0.6823674591385668, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.12}
+0: [15:58<1:36:24, 1.49it/s]
12%|█▏ | 1152/9770 [15:59<1:36:04, 1.50it/s]
12%|█▏ | 1153/9770 [15:59<1:34:58, 1.51it/s]
12%|█▏ | 1154/9770 [16:00<1:35:12, 1.51it/s]
12%|█▏ | 1155/9770 [16:01<1:34:47, 1.51it/s]
12%|█▏ | 1156/9770 [16:01<1:35:31, 1.50it/s]
12%|█▏ | 1157/9770 [16:02<1:34:56, 1.51it/s]
12%|█▏ | 1158/9770 [16:03<1:34:15, 1.52it/s]
12%|█▏ | 1159/9770 [16:03<1:33:35, 1.53it/s]
12%|█▏ | 1160/9770 [16:04<1:32:46, 1.55it/s]
12%|█▏ | 1160/9770 [16:04<1:32:46, 1.55it/s]
12%|█▏ | 1161/9770 [16:05<1:33:50, 1.53it/s]
12%|█▏ | 1162/9770 [16:05<1:33:35, 1.53it/s]
12%|█▏ | 1163/9770 [16:06<1:33:57, 1.53it/s]
12%|█▏ | 1164/9770 [16:07<1:36:03, 1.49it/s]
12%|█▏ | 1165/9770 [16:07<1:35:25, 1.50it/s]
12%|█▏ | 1166/9770 [16:08<1:34:50, 1.51it/s]
12%|█▏
+0: {'loss': 0.7028, 'grad_norm': 0.701210417059177, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.12}
+0: {'loss': 0.7209, 'grad_norm': 0.6743273050584929, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.12}
+0: | 1167/9770 [16:09<1:33:41, 1.53it/s]
12%|█▏ | 1168/9770 [16:09<1:33:22, 1.54it/s]
12%|█▏ | 1169/9770 [16:10<1:33:56, 1.53it/s]
12%|█▏ | 1170/9770 [16:11<1:34:02, 1.52it/s]
12%|█▏ | 1170/9770 [16:11<1:34:02, 1.52it/s]
12%|█▏ | 1171/9770 [16:11<1:34:13, 1.52it/s]
12%|█▏ | 1172/9770 [16:12<1:34:14, 1.52it/s]
12%|█▏ | 1173/9770 [16:12<1:34:37, 1.51it/s]
12%|█▏ | 1174/9770 [16:13<1:34:33, 1.52it/s]
12%|█▏ | 1175/9770 [16:14<1:34:48, 1.51it/s]
12%|█▏ | 1176/9770 [16:14<1:35:15, 1.50it/s]
12%|█▏ | 1177/9770 [16:15<1:34:50, 1.51it/s]
12%|█▏ | 1178/9770 [16:16<1:34:33, 1.51it/s]
12%|█▏ | 1179/9770 [16:16<1:33:30, 1.53it/s]
12%|█▏ | 1180/9770 [16:17<1:33:28, 1.53it/s]
12%|█▏ | 1180/9770 [16:17<1:33:28, 1.53it/s]
+0: {'loss': 0.6927, 'grad_norm': 0.6365117360915615, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.12}
+0: 12%|█▏ | 1181/9770 [16:18<1:34:04, 1.52it/s]
12%|█▏ | 1182/9770 [16:18<1:32:28, 1.55it/s]
12%|█▏ | 1183/9770 [16:19<1:32:05, 1.55it/s]
12%|█▏ | 1184/9770 [16:20<1:32:18, 1.55it/s]
12%|█▏ | 1185/9770 [16:20<1:32:45, 1.54it/s]
12%|█▏ | 1186/9770 [16:21<1:33:22, 1.53it/s]
12%|█▏ | 1187/9770 [16:22<1:34:16, 1.52it/s]
12%|█▏ | 1188/9770 [16:22<1:34:29, 1.51it/s]
12%|█▏ | 1189/9770 [16:23<1:33:59, 1.52it/s]
12%|█▏ | 1190/9770 [16:24<1:33:38, 1.53it/s]
12%|█▏ | 1190/9770 [16:24<1:33:38, 1.53it/s]
12%|█▏ | 1191/9770 [16:24<1:33:18, 1.53it/s]
12%|█▏ | 1192/9770 [16:25<1:34:05, 1.52it/s]
12%|█▏ | 1193/9770 [16:26<1:35:00, 1.50it/s]
12%|█▏ | 1194/9770 [16:26<1:35:04, 1.50it/s]
12%|█▏ | 1195/9770 [16:27<1:35:19, 1.50it/s]
12%|█▏ | 1196/9770 [16:28<1:3
+0: {'loss': 0.7094, 'grad_norm': 0.7216981541650839, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.12}
+0: {'loss': 0.722, 'grad_norm': 0.7296991087611251, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.12}
+0: 4:21, 1.51it/s]
12%|█▏ | 1197/9770 [16:28<1:33:56, 1.52it/s]
12%|█▏ | 1198/9770 [16:29<1:33:27, 1.53it/s]
12%|█▏ | 1199/9770 [16:30<1:33:28, 1.53it/s]
12%|█▏ | 1200/9770 [16:30<1:34:09, 1.52it/s]
12%|█▏ | 1200/9770 [16:30<1:34:09, 1.52it/s]
12%|█▏ | 1201/9770 [16:31<1:35:16, 1.50it/s]
12%|█▏ | 1202/9770 [16:32<1:35:05, 1.50it/s]
12%|█▏ | 1203/9770 [16:32<1:35:12, 1.50it/s]
12%|█▏ | 1204/9770 [16:33<1:36:21, 1.48it/s]
12%|█▏ | 1205/9770 [16:34<1:36:05, 1.49it/s]
12%|█▏ | 1206/9770 [16:34<1:35:39, 1.49it/s]
12%|█▏ | 1207/9770 [16:35<1:36:02, 1.49it/s]
12%|█▏ | 1208/9770 [16:36<1:35:29, 1.49it/s]
12%|█▏ | 1209/9770 [16:36<1:35:44, 1.49it/s]
12%|█▏ | 1210/9770 [16:37<1:34:36, 1.51it/s]
12%|█▏ | 1210/9
+0: {'loss': 0.7138, 'grad_norm': 0.6649576830874662, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.12}
+0: 770 [16:37<1:34:36, 1.51it/s]
12%|█▏ | 1211/9770 [16:38<1:33:52, 1.52it/s]
12%|█▏ | 1212/9770 [16:38<1:33:46, 1.52it/s]
12%|█▏ | 1213/9770 [16:39<1:33:18, 1.53it/s]
12%|█▏ | 1214/9770 [16:40<1:33:05, 1.53it/s]
12%|█▏ | 1215/9770 [16:40<1:32:31, 1.54it/s]
12%|█▏ | 1216/9770 [16:41<1:33:19, 1.53it/s]
12%|█▏ | 1217/9770 [16:41<1:32:54, 1.53it/s]
12%|█▏ | 1218/9770 [16:42<1:33:11, 1.53it/s]
12%|█▏ | 1219/9770 [16:43<1:33:34, 1.52it/s]
12%|█▏ | 1220/9770 [16:43<1:33:28, 1.52it/s]
12%|█▏ | 1220/9770 [16:43<1:33:28, 1.52it/s]
12%|█▏ | 1221/9770 [16:44<1:34:48, 1.50it/s]
13%|█▎ | 1222/9770 [16:45<1:36:23, 1.48it/s]
13%|█▎ | 1223/9770 [16:45<1:35:04, 1.50it/s]
13%|█▎ | 1224/9770 [16:46<1:34:24, 1.51it/s]
13%|█▎ | 1225/9770 [16:47<1:33:45, 1.52it/s]
13%|█�
+0: {'loss': 0.697, 'grad_norm': 0.6469093295533984, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.13}
+0: � | 1226/9770 [16:47<1:34:33, 1.51it/s]
13%|█▎ | 1227/9770 [16:48<1:34:26, 1.51it/s]
13%|█▎ | 1228/9770 [16:49<1:34:30, 1.51it/s]
13%|█▎ | 1229/9770 [16:49<1:34:36, 1.50it/s]
13%|█▎ | 1230/9770 [16:50<1:36:15, 1.48it/s]
13%|█▎ | 1230/9770 [16:50<1:36:15, 1.48it/s]
13%|█▎ | 1231/9770 [16:51<1:35:48, 1.49it/s]
13%|█▎ | 1232/9770 [16:51<1:35:06, 1.50it/s]
13%|█▎ | 1233/9770 [16:52<1:34:32, 1.50it/s]
13%|█▎ | 1234/9770 [16:53<1:34:23, 1.51it/s]
13%|█▎ | 1235/9770 [16:53<1:33:20, 1.52it/s]
13%|█▎ | 1236/9770 [16:54<1:34:23, 1.51it/s]
13%|█▎ | 1237/9770 [16:55<1:34:10, 1.51it/s]
13%|█▎ | 1238/9770 [16:55<1:34:25, 1.51it/s]
13%|█▎ | 1239/9770 [16:56<1:35:33, 1.49it/s]
13%|█▎ | 1240/9770 [16:57<1:35:42, 1.49it/s]
+0: {'loss': 0.7068, 'grad_norm': 0.692761726268952, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.13}
+0: {'loss': 0.7376, 'grad_norm': 0.7248731651851534, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.13}
+0:
13%|█▎ | 1240/9770 [16:57<1:35:42, 1.49it/s]
13%|█▎ | 1241/9770 [16:57<1:34:33, 1.50it/s]
13%|█▎ | 1242/9770 [16:58<1:34:06, 1.51it/s]
13%|█▎ | 1243/9770 [16:59<1:33:54, 1.51it/s]
13%|█▎ | 1244/9770 [16:59<1:33:50, 1.51it/s]
13%|█▎ | 1245/9770 [17:00<1:33:30, 1.52it/s]
13%|█▎ | 1246/9770 [17:01<1:33:27, 1.52it/s]
13%|█▎ | 1247/9770 [17:01<1:33:16, 1.52it/s]
13%|█▎ | 1248/9770 [17:02<1:32:37, 1.53it/s]
13%|█▎ | 1249/9770 [17:03<1:32:38, 1.53it/s]
13%|█▎ | 1250/9770 [17:03<1:32:35, 1.53it/s]
13%|█▎ | 1250/9770 [17:03<1:32:35, 1.53it/s]
13%|█▎ | 1251/9770 [17:04<1:33:42, 1.52it/s]
13%|█▎ | 1252/9770 [17:05<1:34:04, 1.51it/s]
13%|█▎ | 1253/9770 [17:05<1:33:47, 1.51it/s]
13%|█▎ | 1254/9770 [17:06<1:34:03, 1.51it/s]
13%|█▎ | 1255/9770 [17:07
+0: {'loss': 0.7383, 'grad_norm': 0.6334478000061585, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.13}
+0: <1:34:01, 1.51it/s]
13%|█▎ | 1256/9770 [17:07<1:34:06, 1.51it/s]
13%|█▎ | 1257/9770 [17:08<1:33:52, 1.51it/s]
13%|█▎ | 1258/9770 [17:09<1:33:28, 1.52it/s]
13%|█▎ | 1259/9770 [17:09<1:32:40, 1.53it/s]
13%|█▎ | 1260/9770 [17:10<1:32:54, 1.53it/s]
13%|█▎ | 1260/9770 [17:10<1:32:54, 1.53it/s]
13%|█▎ | 1261/9770 [17:11<1:32:55, 1.53it/s]
13%|█▎ | 1262/9770 [17:11<1:32:38, 1.53it/s]
13%|█▎ | 1263/9770 [17:12<1:33:02, 1.52it/s]
13%|█▎ | 1264/9770 [17:13<1:34:12, 1.50it/s]
13%|█▎ | 1265/9770 [17:13<1:32:54, 1.53it/s]
13%|█▎ | 1266/9770 [17:14<1:33:07, 1.52it/s]
13%|█▎ | 1267/9770 [17:15<1:32:21, 1.53it/s]
13%|█▎ | 1268/9770 [17:15<1:33:08, 1.52it/s]
13%|█▎ | 1269/9770 [17:16<1:33:02, 1.52it/s]
13%|█▎ | 1270/9770 [17:17<1:34:39, 1.50it/s]
+0: {'loss': 0.708, 'grad_norm': 0.6728790581264449, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.13}
+0: {'loss': 0.7102, 'grad_norm': 0.7111734228950166, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.13}
+0:
13%|█▎ | 1270/9770 [17:17<1:34:39, 1.50it/s]
13%|█▎ | 1271/9770 [17:17<1:34:50, 1.49it/s]
13%|█▎ | 1272/9770 [17:18<1:33:47, 1.51it/s]
13%|█▎ | 1273/9770 [17:19<1:33:26, 1.52it/s]
13%|█▎ | 1274/9770 [17:19<1:33:42, 1.51it/s]
13%|█▎ | 1275/9770 [17:20<1:33:58, 1.51it/s]
13%|█▎ | 1276/9770 [17:21<1:33:53, 1.51it/s]
13%|█▎ | 1277/9770 [17:21<1:33:32, 1.51it/s]
13%|█▎ | 1278/9770 [17:22<1:33:10, 1.52it/s]
13%|█▎ | 1279/9770 [17:23<1:34:49, 1.49it/s]
13%|█▎ | 1280/9770 [17:23<1:35:47, 1.48it/s]
13%|█▎ | 1280/9770 [17:23<1:35:47, 1.48it/s]
13%|█▎ | 1281/9770 [17:24<1:33:50, 1.51it/s]
13%|█▎ | 1282/9770 [17:25<1:34:00, 1.50it/s]
13%|█▎ | 1283/9770 [17:25<1:33:55, 1.51it/s]
13%|█▎ | 1284/9770 [17:26<1:34:18, 1.50it/s]
13%|�
+0: {'loss': 0.7149, 'grad_norm': 0.6980786726034038, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.13}
+0: ��▎ | 1285/9770 [17:27<1:34:40, 1.49it/s]
13%|█▎ | 1286/9770 [17:27<1:33:46, 1.51it/s]
13%|█▎ | 1287/9770 [17:28<1:33:20, 1.51it/s]
13%|█▎ | 1288/9770 [17:29<1:33:39, 1.51it/s]
13%|█▎ | 1289/9770 [17:29<1:32:31, 1.53it/s]
13%|█▎ | 1290/9770 [17:30<1:32:59, 1.52it/s]
13%|█▎ | 1290/9770 [17:30<1:32:59, 1.52it/s]
13%|█▎ | 1291/9770 [17:30<1:32:23, 1.53it/s]
13%|█▎ | 1292/9770 [17:31<1:33:53, 1.50it/s]
13%|█▎ | 1293/9770 [17:32<1:33:25, 1.51it/s]
13%|█▎ | 1294/9770 [17:32<1:32:22, 1.53it/s]
13%|█▎ | 1295/9770 [17:33<1:32:23, 1.53it/s]
13%|█▎ | 1296/9770 [17:34<1:32:45, 1.52it/s]
13%|█▎ | 1297/9770 [17:34<1:32:53, 1.52it/s]
13%|█▎ | 1298/9770 [17:35<1:32:36, 1.52it/s]
13%|█▎ | 1299/9770 [17:36<1:32:45, 1.52it/s]
13%|█▎ | 1300/9770 [17:36<1:32:18,
+0: {'loss': 0.7202, 'grad_norm': 0.6806181084582316, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.13}
+0: {'loss': 0.7112, 'grad_norm': 0.6927153286902209, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.13}
+0: 1.53it/s]
13%|█▎ | 1300/9770 [17:36<1:32:18, 1.53it/s]
13%|█▎ | 1301/9770 [17:37<1:33:56, 1.50it/s]
13%|█▎ | 1302/9770 [17:38<1:33:22, 1.51it/s]
13%|█▎ | 1303/9770 [17:38<1:33:37, 1.51it/s]
13%|█▎ | 1304/9770 [17:39<1:32:33, 1.52it/s]
13%|█▎ | 1305/9770 [17:40<1:32:40, 1.52it/s]
13%|█▎ | 1306/9770 [17:40<1:32:44, 1.52it/s]
13%|█▎ | 1307/9770 [17:41<1:34:16, 1.50it/s]
13%|█▎ | 1308/9770 [17:42<1:34:08, 1.50it/s]
13%|█▎ | 1309/9770 [17:42<1:34:07, 1.50it/s]
13%|█▎ | 1310/9770 [17:43<1:32:50, 1.52it/s]
13%|█▎ | 1310/9770 [17:43<1:32:50, 1.52it/s]
13%|█▎ | 1311/9770 [17:44<1:32:49, 1.52it/s]
13%|█▎ | 1312/9770 [17:44<1:32:36, 1.52it/s]
13%|█▎ | 1313/9770 [17:45<1:32:07, 1.53it/s]
13%|█▎ | 1314/9770 [1
+0: {'loss': 0.7097, 'grad_norm': 0.6596149039183153, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.14}
+0: 7:46<1:32:51, 1.52it/s]
13%|█▎ | 1315/9770 [17:46<1:34:27, 1.49it/s]
13%|█▎ | 1316/9770 [17:47<1:35:02, 1.48it/s]
13%|█▎ | 1317/9770 [17:48<1:34:46, 1.49it/s]
13%|█▎ | 1318/9770 [17:48<1:33:55, 1.50it/s]
14%|█▎ | 1319/9770 [17:49<1:32:56, 1.52it/s]
14%|█▎ | 1320/9770 [17:50<1:33:11, 1.51it/s]
14%|█▎ | 1320/9770 [17:50<1:33:11, 1.51it/s]
14%|█▎ | 1321/9770 [17:50<1:33:20, 1.51it/s]
14%|█▎ | 1322/9770 [17:51<1:32:42, 1.52it/s]
14%|█▎ | 1323/9770 [17:52<1:32:11, 1.53it/s]
14%|█▎ | 1324/9770 [17:52<1:31:51, 1.53it/s]
14%|█▎ | 1325/9770 [17:53<1:33:35, 1.50it/s]
14%|█▎ | 1326/9770 [17:54<1:35:08, 1.48it/s]
14%|█▎ | 1327/9770 [17:54<1:33:39, 1.50it/s]
14%|█▎ | 1328/9770 [17:55<1:33:56, 1.50it/s]
14%|█▎ | 1329/9770 [17:56<1:34:18, 1.49it/s]
14%|█▎
+0: {'loss': 0.7219, 'grad_norm': 0.6893886057005171, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.14}
+0: {'loss': 0.7123, 'grad_norm': 0.6657075197714051, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.14}
+0: | 1330/9770 [17:56<1:34:00, 1.50it/s]
14%|█▎ | 1330/9770 [17:56<1:34:00, 1.50it/s]
14%|█▎ | 1331/9770 [17:57<1:33:57, 1.50it/s]
14%|█▎ | 1332/9770 [17:58<1:32:36, 1.52it/s]
14%|█▎ | 1333/9770 [17:58<1:33:10, 1.51it/s]
14%|█▎ | 1334/9770 [17:59<1:32:47, 1.52it/s]
14%|█▎ | 1335/9770 [18:00<1:31:40, 1.53it/s]
14%|█▎ | 1336/9770 [18:00<1:31:53, 1.53it/s]
14%|█▎ | 1337/9770 [18:01<1:34:07, 1.49it/s]
14%|█▎ | 1338/9770 [18:02<1:32:58, 1.51it/s]
14%|█▎ | 1339/9770 [18:02<1:32:45, 1.51it/s]
14%|█▎ | 1340/9770 [18:03<1:32:06, 1.53it/s]
14%|█▎ | 1340/9770 [18:03<1:32:06, 1.53it/s]
14%|█▎ | 1341/9770 [18:04<1:31:57, 1.53it/s]
14%|█▎ | 1342/9770 [18:04<1:31:52, 1.53it/s]
14%|█▎ | 1343/9770 [18:05<1:32:05, 1.53it/s]
1
+0: {'loss': 0.7206, 'grad_norm': 0.6558014041627545, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.14}
+0: 4%|█▍ | 1344/9770 [18:05<1:31:37, 1.53it/s]
14%|█▍ | 1345/9770 [18:06<1:32:06, 1.52it/s]
14%|█▍ | 1346/9770 [18:07<1:32:02, 1.53it/s]
14%|█▍ | 1347/9770 [18:07<1:32:47, 1.51it/s]
14%|█▍ | 1348/9770 [18:08<1:33:16, 1.50it/s]
14%|█▍ | 1349/9770 [18:09<1:33:25, 1.50it/s]
14%|█▍ | 1350/9770 [18:09<1:34:01, 1.49it/s]
14%|█▍ | 1350/9770 [18:09<1:34:01, 1.49it/s]
14%|█▍ | 1351/9770 [18:10<1:33:19, 1.50it/s]
14%|█▍ | 1352/9770 [18:11<1:33:19, 1.50it/s]
14%|█▍ | 1353/9770 [18:11<1:32:23, 1.52it/s]
14%|█▍ | 1354/9770 [18:12<1:32:31, 1.52it/s]
14%|█▍ | 1355/9770 [18:13<1:31:39, 1.53it/s]
14%|█▍ | 1356/9770 [18:13<1:30:58, 1.54it/s]
14%|█▍ | 1357/9770 [18:14<1:30:41, 1.55it/s]
14%|█▍ | 1358/9770 [18:15<1:31:25, 1.53it/s]
14%|█▍ | 1359/9770 [18:15<1:31:
+0: {'loss': 0.7109, 'grad_norm': 0.6567457848634286, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.14}
+0: {'loss': 0.6993, 'grad_norm': 0.6524021123529937, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.14}
+0: 34, 1.53it/s]
14%|█▍ | 1360/9770 [18:16<1:31:17, 1.54it/s]
14%|█▍ | 1360/9770 [18:16<1:31:17, 1.54it/s]
14%|█▍ | 1361/9770 [18:17<1:31:02, 1.54it/s]
14%|█▍ | 1362/9770 [18:17<1:31:51, 1.53it/s]
14%|█▍ | 1363/9770 [18:18<1:34:12, 1.49it/s]
14%|█▍ | 1364/9770 [18:19<1:33:10, 1.50it/s]
14%|█▍ | 1365/9770 [18:19<1:32:52, 1.51it/s]
14%|█▍ | 1366/9770 [18:20<1:32:40, 1.51it/s]
14%|█▍ | 1367/9770 [18:21<1:32:10, 1.52it/s]
14%|█▍ | 1368/9770 [18:21<1:31:31, 1.53it/s]
14%|█▍ | 1369/9770 [18:22<1:31:17, 1.53it/s]
14%|█▍ | 1370/9770 [18:23<1:31:49, 1.52it/s]
14%|█▍ | 1370/9770 [18:23<1:31:49, 1.52it/s]
14%|█▍ | 1371/9770 [18:23<1:32:06, 1.52it/s]
14%|█▍ | 1372/9770 [18:24<1:31:14, 1.53it/s]
14%|█▍ | 1373/977
+0: {'loss': 0.7195, 'grad_norm': 0.6760434408063445, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.14}
+0: 0 [18:25<1:31:49, 1.52it/s]
14%|█▍ | 1374/9770 [18:25<1:32:32, 1.51it/s]
14%|█▍ | 1375/9770 [18:26<1:32:49, 1.51it/s]
14%|█▍ | 1376/9770 [18:27<1:34:05, 1.49it/s]
14%|█▍ | 1377/9770 [18:27<1:34:02, 1.49it/s]
14%|█▍ | 1378/9770 [18:28<1:33:31, 1.50it/s]
14%|█▍ | 1379/9770 [18:29<1:32:58, 1.50it/s]
14%|█▍ | 1380/9770 [18:29<1:33:13, 1.50it/s]
14%|█▍ | 1380/9770 [18:29<1:33:13, 1.50it/s]
14%|█▍ | 1381/9770 [18:30<1:32:00, 1.52it/s]
14%|█▍ | 1382/9770 [18:31<1:32:12, 1.52it/s]
14%|█▍ | 1383/9770 [18:31<1:31:34, 1.53it/s]
14%|█▍ | 1384/9770 [18:32<1:32:31, 1.51it/s]
14%|█▍ | 1385/9770 [18:33<1:32:04, 1.52it/s]
14%|█▍ | 1386/9770 [18:33<1:32:01, 1.52it/s]
14%|█▍ | 1387/9770 [18:34<1:32:52, 1.50it/s]
14%|█▍ | 1388/9770 [18:35<1:34:11, 1.48it/s]
14%|█▍
+0: {'loss': 0.7212, 'grad_norm': 0.64911132047645, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.14}
+0: {'loss': 0.716, 'grad_norm': 0.6545453379441815, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.14}
+0: | 1389/9770 [18:35<1:33:13, 1.50it/s]
14%|█▍ | 1390/9770 [18:36<1:31:43, 1.52it/s]
14%|█▍ | 1390/9770 [18:36<1:31:43, 1.52it/s]
14%|█▍ | 1391/9770 [18:37<1:32:24, 1.51it/s]
14%|█▍ | 1392/9770 [18:37<1:32:48, 1.50it/s]
14%|█▍ | 1393/9770 [18:38<1:32:54, 1.50it/s]
14%|█▍ | 1394/9770 [18:39<1:33:20, 1.50it/s]
14%|█▍ | 1395/9770 [18:39<1:31:52, 1.52it/s]
14%|█▍ | 1396/9770 [18:40<1:31:38, 1.52it/s]
14%|█▍ | 1397/9770 [18:40<1:31:11, 1.53it/s]
14%|█▍ | 1398/9770 [18:41<1:31:58, 1.52it/s]
14%|█▍ | 1399/9770 [18:42<1:32:29, 1.51it/s]
14%|█▍ | 1400/9770 [18:42<1:32:41, 1.50it/s]
14%|█▍ | 1400/9770 [18:42<1:32:41, 1.50it/s]
14%|█▍ | 1401/9770 [18:43<1:32:24, 1.51it/s]
14%|█▍ | 1402/9770 [18:44<1:32:09, 1.51it/s
+0: {'loss': 0.7177, 'grad_norm': 0.7089443403867572, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.14}
+0: ]
14%|█▍ | 1403/9770 [18:44<1:31:53, 1.52it/s]
14%|█▍ | 1404/9770 [18:45<1:31:32, 1.52it/s]
14%|█▍ | 1405/9770 [18:46<1:33:30, 1.49it/s]
14%|█▍ | 1406/9770 [18:46<1:32:28, 1.51it/s]
14%|█▍ | 1407/9770 [18:47<1:31:34, 1.52it/s]
14%|█▍ | 1408/9770 [18:48<1:31:47, 1.52it/s]
14%|█▍ | 1409/9770 [18:48<1:32:57, 1.50it/s]
14%|█▍ | 1410/9770 [18:49<1:31:35, 1.52it/s]
14%|█▍ | 1410/9770 [18:49<1:31:35, 1.52it/s]
14%|█▍ | 1411/9770 [18:50<1:31:56, 1.52it/s]
14%|█▍ | 1412/9770 [18:50<1:32:06, 1.51it/s]
14%|█▍ | 1413/9770 [18:51<1:30:16, 1.54it/s]
14%|█▍ | 1414/9770 [18:52<1:30:49, 1.53it/s]
14%|█▍ | 1415/9770 [18:52<1:31:06, 1.53it/s]
14%|█▍ | 1416/9770 [18:53<1:33:36, 1.49it/s]
15%|█▍ | 1417/9770 [18:54<1:32:44, 1.50it/s]
15%|█▍ | 1418/9770 [18:54<1
+0: {'loss': 0.7313, 'grad_norm': 0.6621066224178461, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.15}
+0: {'loss': 0.7168, 'grad_norm': 0.7299994643919728, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.15}
+0: :32:50, 1.50it/s]
15%|█▍ | 1419/9770 [18:55<1:31:44, 1.52it/s]
15%|█▍ | 1420/9770 [18:56<1:31:47, 1.52it/s]
15%|█▍ | 1420/9770 [18:56<1:31:47, 1.52it/s]
15%|█▍ | 1421/9770 [18:56<1:31:57, 1.51it/s]
15%|█▍ | 1422/9770 [18:57<1:31:52, 1.51it/s]
15%|█▍ | 1423/9770 [18:58<1:31:18, 1.52it/s]
15%|█▍ | 1424/9770 [18:58<1:33:10, 1.49it/s]
15%|█▍ | 1425/9770 [18:59<1:31:27, 1.52it/s]
15%|█▍ | 1426/9770 [19:00<1:31:21, 1.52it/s]
15%|█▍ | 1427/9770 [19:00<1:31:10, 1.53it/s]
15%|█▍ | 1428/9770 [19:01<1:32:42, 1.50it/s]
15%|█▍ | 1429/9770 [19:02<1:31:42, 1.52it/s]
15%|█▍ | 1430/9770 [19:02<1:31:06, 1.53it/s]
15%|█▍ | 1430/9770 [19:02<1:31:06, 1.53it/s]
15%|█▍ | 1431/9770 [19:03<1:33:09, 1.49it/s]
15%|█▍ | 1432
+0: {'loss': 0.6951, 'grad_norm': 0.7068084735819559, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.15}
+0: /9770 [19:04<1:34:13, 1.47it/s]
15%|█▍ | 1433/9770 [19:04<1:33:39, 1.48it/s]
15%|█▍ | 1434/9770 [19:05<1:32:32, 1.50it/s]
15%|█▍ | 1435/9770 [19:06<1:32:04, 1.51it/s]
15%|█▍ | 1436/9770 [19:06<1:32:13, 1.51it/s]
15%|█▍ | 1437/9770 [19:07<1:31:30, 1.52it/s]
15%|█▍ | 1438/9770 [19:08<1:31:36, 1.52it/s]
15%|█▍ | 1439/9770 [19:08<1:31:42, 1.51it/s]
15%|█▍ | 1440/9770 [19:09<1:31:54, 1.51it/s]
15%|█▍ | 1440/9770 [19:09<1:31:54, 1.51it/s]
15%|█▍ | 1441/9770 [19:10<1:31:05, 1.52it/s]
15%|█▍ | 1442/9770 [19:10<1:31:00, 1.53it/s]
15%|█▍ | 1443/9770 [19:11<1:33:15, 1.49it/s]
15%|█▍ | 1444/9770 [19:12<1:32:38, 1.50it/s]
15%|█▍ | 1445/9770 [19:12<1:32:17, 1.50it/s]
15%|█▍ | 1446/9770 [19:13<1:32:18, 1.50it/s]
15%|█▍ | 1447/9770 [19:14<1:31:31, 1.52it/s]
15%|█
+0: {'loss': 0.7068, 'grad_norm': 0.7134032635173736, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.15}
+0: {'loss': 0.7173, 'grad_norm': 0.6712567580091471, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.15}
+0: ▍ | 1448/9770 [19:14<1:30:36, 1.53it/s]
15%|█▍ | 1449/9770 [19:15<1:32:10, 1.50it/s]
15%|█▍ | 1450/9770 [19:16<1:32:13, 1.50it/s]
15%|█▍ | 1450/9770 [19:16<1:32:13, 1.50it/s]
15%|█▍ | 1451/9770 [19:16<1:33:41, 1.48it/s]
15%|█▍ | 1452/9770 [19:17<1:34:48, 1.46it/s]
15%|█▍ | 1453/9770 [19:18<1:34:03, 1.47it/s]
15%|█▍ | 1454/9770 [19:18<1:34:14, 1.47it/s]
15%|█▍ | 1455/9770 [19:19<1:33:07, 1.49it/s]
15%|█▍ | 1456/9770 [19:20<1:34:13, 1.47it/s]
15%|█▍ | 1457/9770 [19:20<1:32:58, 1.49it/s]
15%|█▍ | 1458/9770 [19:21<1:32:45, 1.49it/s]
15%|█▍ | 1459/9770 [19:22<1:31:26, 1.51it/s]
15%|█▍ | 1460/9770 [19:22<1:31:29, 1.51it/s]
15%|█▍ | 1460/9770 [19:22<1:31:29, 1.51it/s]
15%|█▍ | 1461/9770 [19:23<1:31:43, 1.51
+0: {'loss': 0.7112, 'grad_norm': 0.6631079418985482, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.15}
+0: it/s]
15%|█▍ | 1462/9770 [19:24<1:31:53, 1.51it/s]
15%|█▍ | 1463/9770 [19:24<1:31:37, 1.51it/s]
15%|█▍ | 1464/9770 [19:25<1:31:24, 1.51it/s]
15%|█▍ | 1465/9770 [19:26<1:31:32, 1.51it/s]
15%|█▌ | 1466/9770 [19:26<1:31:30, 1.51it/s]
15%|█▌ | 1467/9770 [19:27<1:31:37, 1.51it/s]
15%|█▌ | 1468/9770 [19:28<1:31:47, 1.51it/s]
15%|█▌ | 1469/9770 [19:28<1:31:39, 1.51it/s]
15%|█▌ | 1470/9770 [19:29<1:31:49, 1.51it/s]
15%|█▌ | 1470/9770 [19:29<1:31:49, 1.51it/s]
15%|█▌ | 1471/9770 [19:30<1:31:38, 1.51it/s]
15%|█▌ | 1472/9770 [19:30<1:31:05, 1.52it/s]
15%|█▌ | 1473/9770 [19:31<1:29:53, 1.54it/s]
15%|█▌ | 1474/9770 [19:32<1:30:46, 1.52it/s]
15%|█▌ | 1475/9770 [19:32<1:30:53, 1.52it/s]
15%|█▌ | 1476/9770 [19:33<1:31:03, 1.52it/s]
15%|█▌ | 1477/9770 [19:
+0: {'loss': 0.7049, 'grad_norm': 0.6661452313635602, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.15}
+0: {'loss': 0.7067, 'grad_norm': 0.7100267139981215, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.15}
+0: 34<1:31:06, 1.52it/s]
15%|█▌ | 1478/9770 [19:34<1:29:56, 1.54it/s]
15%|█▌ | 1479/9770 [19:35<1:30:27, 1.53it/s]
15%|█▌ | 1480/9770 [19:35<1:31:09, 1.52it/s]
15%|█▌ | 1480/9770 [19:35<1:31:09, 1.52it/s]
15%|█▌ | 1481/9770 [19:36<1:31:51, 1.50it/s]
15%|█▌ | 1482/9770 [19:37<1:32:06, 1.50it/s]
15%|█▌ | 1483/9770 [19:37<1:31:17, 1.51it/s]
15%|█▌ | 1484/9770 [19:38<1:30:35, 1.52it/s]
15%|█▌ | 1485/9770 [19:39<1:31:02, 1.52it/s]
15%|█▌ | 1486/9770 [19:39<1:30:34, 1.52it/s]
15%|█▌ | 1487/9770 [19:40<1:30:50, 1.52it/s]
15%|█▌ | 1488/9770 [19:41<1:32:35, 1.49it/s]
15%|█▌ | 1489/9770 [19:41<1:32:07, 1.50it/s]
15%|█▌ | 1490/9770 [19:42<1:33:37, 1.47it/s]
15%|█▌ | 1490/9770 [19:42<1:33:37, 1.47it/s]
15%|█▌ |
+0: {'loss': 0.7179, 'grad_norm': 0.6639321669934656, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.15}
+0: 1491/9770 [19:43<1:33:20, 1.48it/s]
15%|█▌ | 1492/9770 [19:44<1:32:59, 1.48it/s]
15%|█▌ | 1493/9770 [19:44<1:32:38, 1.49it/s]
15%|█▌ | 1494/9770 [19:45<1:32:18, 1.49it/s]
15%|█▌ | 1495/9770 [19:45<1:31:10, 1.51it/s]
15%|█▌ | 1496/9770 [19:46<1:32:00, 1.50it/s]
15%|█▌ | 1497/9770 [19:47<1:33:30, 1.47it/s]
15%|█▌ | 1498/9770 [19:48<1:33:04, 1.48it/s]
15%|█▌ | 1499/9770 [19:48<1:32:21, 1.49it/s]
15%|█▌ | 1500/9770 [19:49<1:32:18, 1.49it/s]
15%|█▌ | 1500/9770 [19:49<1:32:18, 1.49it/s]
15%|█▌ | 1501/9770 [19:50<1:31:47, 1.50it/s]
15%|█▌ | 1502/9770 [19:50<1:31:37, 1.50it/s]
15%|█▌ | 1503/9770 [19:51<1:31:13, 1.51it/s]
15%|█▌ | 1504/9770 [19:52<1:31:17, 1.51it/s]
15%|█▌ | 1505/9770 [19:52<1:30:40, 1.52it/s]
15%|█▌ | 1506/9770 [19:53<1:30:40, 1.52it/s]
15%
+0: {'loss': 0.7148, 'grad_norm': 0.6739816552265073, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.15}
+0: {'loss': 0.7125, 'grad_norm': 0.662563570377684, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.16}
+0: |█▌ | 1507/9770 [19:53<1:29:43, 1.53it/s]
15%|█▌ | 1508/9770 [19:54<1:29:36, 1.54it/s]
15%|█▌ | 1509/9770 [19:55<1:29:57, 1.53it/s]
15%|█▌ | 1510/9770 [19:55<1:29:17, 1.54it/s]
15%|█▌ | 1510/9770 [19:55<1:29:17, 1.54it/s]
15%|█▌ | 1511/9770 [19:56<1:29:34, 1.54it/s]
15%|█▌ | 1512/9770 [19:57<1:30:04, 1.53it/s]
15%|█▌ | 1513/9770 [19:57<1:29:39, 1.53it/s]
15%|█▌ | 1514/9770 [19:58<1:30:27, 1.52it/s]
16%|█▌ | 1515/9770 [19:59<1:31:39, 1.50it/s]
16%|█▌ | 1516/9770 [19:59<1:31:47, 1.50it/s]
16%|█▌ | 1517/9770 [20:00<1:30:59, 1.51it/s]
16%|█▌ | 1518/9770 [20:01<1:30:25, 1.52it/s]
16%|█▌ | 1519/9770 [20:01<1:30:57, 1.51it/s]
16%|█▌ | 1520/9770 [20:02<1:30:43, 1.52it/s]
16%|█▌ | 1520/9770 [20:02<1:30:43,
+0: {'loss': 0.6948, 'grad_norm': 0.6981927450083755, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.16}
+0: 1.52it/s]
16%|█▌ | 1521/9770 [20:03<1:32:13, 1.49it/s]
16%|█▌ | 1522/9770 [20:03<1:32:52, 1.48it/s]
16%|█▌ | 1523/9770 [20:04<1:31:56, 1.49it/s]
16%|█▌ | 1524/9770 [20:05<1:31:25, 1.50it/s]
16%|█▌ | 1525/9770 [20:05<1:33:17, 1.47it/s]
16%|█▌ | 1526/9770 [20:06<1:31:44, 1.50it/s]
16%|█▌ | 1527/9770 [20:07<1:31:21, 1.50it/s]
16%|█▌ | 1528/9770 [20:07<1:30:42, 1.51it/s]
16%|█▌ | 1529/9770 [20:08<1:29:48, 1.53it/s]
16%|█▌ | 1530/9770 [20:09<1:31:11, 1.51it/s]
16%|█▌ | 1530/9770 [20:09<1:31:11, 1.51it/s]
16%|█▌ | 1531/9770 [20:09<1:31:18, 1.50it/s]
16%|█▌ | 1532/9770 [20:10<1:31:12, 1.51it/s]
16%|█▌ | 1533/9770 [20:11<1:31:25, 1.50it/s]
16%|█▌ | 1534/9770 [20:11<1:31:23, 1.50it/s]
16%|█▌ | 1535/9770 [20:12<1:31:57, 1.49it/s]
16%|█▌ | 1536/9770
+0: {'loss': 0.7286, 'grad_norm': 0.6621256493658119, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.16}
+0: {'loss': 0.696, 'grad_norm': 0.6610565287666501, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.16}
+0: [20:13<1:30:56, 1.51it/s]
16%|█▌ | 1537/9770 [20:13<1:30:50, 1.51it/s]
16%|█▌ | 1538/9770 [20:14<1:30:11, 1.52it/s]
16%|█▌ | 1539/9770 [20:15<1:29:57, 1.52it/s]
16%|█▌ | 1540/9770 [20:15<1:31:14, 1.50it/s]
16%|█▌ | 1540/9770 [20:15<1:31:14, 1.50it/s]
16%|█▌ | 1541/9770 [20:16<1:31:51, 1.49it/s]
16%|█▌ | 1542/9770 [20:17<1:31:22, 1.50it/s]
16%|█▌ | 1543/9770 [20:17<1:30:11, 1.52it/s]
16%|█▌ | 1544/9770 [20:18<1:30:30, 1.51it/s]
16%|█▌ | 1545/9770 [20:19<1:30:36, 1.51it/s]
16%|█▌ | 1546/9770 [20:19<1:30:07, 1.52it/s]
16%|█▌ | 1547/9770 [20:20<1:30:12, 1.52it/s]
16%|█▌ | 1548/9770 [20:21<1:30:42, 1.51it/s]
16%|█▌ | 1549/9770 [20:21<1:31:02, 1.51it/s]
16%|█▌ | 1550/9770 [20:22<1:31:10, 1.50it/s]
16%|█▌
+0: {'loss': 0.7044, 'grad_norm': 0.6696477548735924, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.16}
+0: | 1550/9770 [20:22<1:31:10, 1.50it/s]
16%|█▌ | 1551/9770 [20:23<1:30:18, 1.52it/s]
16%|█▌ | 1552/9770 [20:23<1:29:47, 1.53it/s]
16%|█▌ | 1553/9770 [20:24<1:28:17, 1.55it/s]
16%|█▌ | 1554/9770 [20:25<1:28:45, 1.54it/s]
16%|█▌ | 1555/9770 [20:25<1:29:18, 1.53it/s]
16%|█▌ | 1556/9770 [20:26<1:29:25, 1.53it/s]
16%|█▌ | 1557/9770 [20:27<1:30:02, 1.52it/s]
16%|█▌ | 1558/9770 [20:27<1:28:58, 1.54it/s]
16%|█▌ | 1559/9770 [20:28<1:29:13, 1.53it/s]
16%|█▌ | 1560/9770 [20:28<1:28:42, 1.54it/s]
16%|█▌ | 1560/9770 [20:28<1:28:42, 1.54it/s]
16%|█▌ | 1561/9770 [20:29<1:28:50, 1.54it/s]
16%|█▌ | 1562/9770 [20:30<1:29:04, 1.54it/s]
16%|█▌ | 1563/9770 [20:30<1:29:35, 1.53it/s]
16%|█▌ | 1564/9770 [20:31<1:31:53, 1.49it/s]
16%|█▌ | 1565/9770 [20:32<1:31:43, 1.49it/s]
+0: {'loss': 0.7117, 'grad_norm': 0.6637871066257344, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.16}
+0: 16%|█▌ | 1566/9770 [20:32<1:31:48, 1.49it/s]
16%|█▌ | 1567/9770 [20:33<1:31:15, 1.50it/s]
16%|█▌ | 1568/9770 [20:34<1:31:47, 1.49it/s]
16%|█▌ | 1569/9770 [20:34<1:31:47, 1.49it/s]
16%|█▌ | 1570/9770 [20:35<1:31:32, 1.49it/s]
16%|█▌ | 1570/9770 [20:35<1:31:32, 1.49it/s]
16%|█▌ | 1571/9770 [20:36<1:31:20, 1.50it/s]
16%|█▌ | 1572/9770 [20:36<1:30:22, 1.51it/s]
16%|█▌ | 1573/9770 [20:37<1:30:14, 1.51it/s]
16%|█▌ | 1574/9770 [20:38<1:30:33, 1.51it/s]
16%|█▌ | 1575/9770 [20:38<1:29:48, 1.52it/s]
16%|█▌ | 1576/9770 [20:39<1:29:48, 1.52it/s]
16%|█▌ | 1577/9770 [20:40<1:29:19, 1.53it/s]
16%|█▌ | 1578/9770 [20:40<1:29:17, 1.53it/s]
16%|█▌ | 1579/9770 [20:41<1:29:19, 1.53it/s]
16%|█▌ | 1580/9770 [20:42<1:30:38, 1.51it/s]
+0: {'loss': 0.705, 'grad_norm': 0.6309546850465658, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.16}
+0: {'loss': 0.7036, 'grad_norm': 0.6288371723180415, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.16}
+0:
16%|█▌ | 1580/9770 [20:42<1:30:38, 1.51it/s]
16%|█▌ | 1581/9770 [20:42<1:31:01, 1.50it/s]
16%|█▌ | 1582/9770 [20:43<1:31:03, 1.50it/s]
16%|█▌ | 1583/9770 [20:44<1:31:29, 1.49it/s]
16%|█▌ | 1584/9770 [20:44<1:31:00, 1.50it/s]
16%|█▌ | 1585/9770 [20:45<1:30:14, 1.51it/s]
16%|█▌ | 1586/9770 [20:46<1:29:38, 1.52it/s]
16%|█▌ | 1587/9770 [20:46<1:29:00, 1.53it/s]
16%|█▋ | 1588/9770 [20:47<1:29:37, 1.52it/s]
16%|█▋ | 1589/9770 [20:48<1:30:34, 1.51it/s]
16%|█▋ | 1590/9770 [20:48<1:30:22, 1.51it/s]
16%|█▋ | 1590/9770 [20:48<1:30:22, 1.51it/s]
16%|█▋ | 1591/9770 [20:49<1:30:18, 1.51it/s]
16%|█▋ | 1592/9770 [20:50<1:30:45, 1.50it/s]
16%|█▋ | 1593/9770 [20:50<1:29:27, 1.52it/s]
16%|█▋ | 1594/9770 [20:51<1:28:53, 1.53it/s]
16%|█▋ | 1595/9
+0: {'loss': 0.6847, 'grad_norm': 0.6823549766396124, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.16}
+0: 770 [20:52<1:28:41, 1.54it/s]
16%|█▋ | 1596/9770 [20:52<1:28:58, 1.53it/s]
16%|█▋ | 1597/9770 [20:53<1:28:59, 1.53it/s]
16%|█▋ | 1598/9770 [20:54<1:28:41, 1.54it/s]
16%|█▋ | 1599/9770 [20:54<1:29:11, 1.53it/s]
16%|█▋ | 1600/9770 [20:55<1:29:45, 1.52it/s]
16%|█▋ | 1600/9770 [20:55<1:29:45, 1.52it/s]
16%|█▋ | 1601/9770 [20:56<1:30:20, 1.51it/s]
16%|█▋ | 1602/9770 [20:56<1:29:53, 1.51it/s]
16%|█▋ | 1603/9770 [20:57<1:29:34, 1.52it/s]
16%|█▋ | 1604/9770 [20:58<1:30:17, 1.51it/s]
16%|█▋ | 1605/9770 [20:58<1:30:17, 1.51it/s]
16%|█▋ | 1606/9770 [20:59<1:30:06, 1.51it/s]
16%|█▋ | 1607/9770 [21:00<1:29:51, 1.51it/s]
16%|█▋ | 1608/9770 [21:00<1:28:57, 1.53it/s]
16%|█▋ | 1609/9770 [21:01<1:29:13, 1.52it/s]
16%|█▋ | 1610/9770 [21:01<1:29:43, 1.52it/s]
+0: {'loss': 0.7155, 'grad_norm': 0.6899701563409427, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.16}
+0: {'loss': 0.7005, 'grad_norm': 0.631018132428086, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.17}
+0:
16%|█▋ | 1610/9770 [21:01<1:29:43, 1.52it/s]
16%|█▋ | 1611/9770 [21:02<1:29:37, 1.52it/s]
16%|█▋ | 1612/9770 [21:03<1:30:36, 1.50it/s]
17%|█▋ | 1613/9770 [21:04<1:30:46, 1.50it/s]
17%|█▋ | 1614/9770 [21:04<1:29:55, 1.51it/s]
17%|█▋ | 1615/9770 [21:05<1:30:04, 1.51it/s]
17%|█▋ | 1616/9770 [21:05<1:30:21, 1.50it/s]
17%|█▋ | 1617/9770 [21:06<1:29:39, 1.52it/s]
17%|█▋ | 1618/9770 [21:07<1:28:58, 1.53it/s]
17%|█▋ | 1619/9770 [21:07<1:28:32, 1.53it/s]
17%|█▋ | 1620/9770 [21:08<1:29:25, 1.52it/s]
17%|█▋ | 1620/9770 [21:08<1:29:25, 1.52it/s]
17%|█▋ | 1621/9770 [21:09<1:30:06, 1.51it/s]
17%|█▋ | 1622/9770 [21:09<1:30:24, 1.50it/s]
17%|█▋ | 1623/9770 [21:10<1:29:09, 1.52it/s]
17%|█▋ | 1624/9770 [21:11<1:30:17, 1.50it
+0: {'loss': 0.7057, 'grad_norm': 0.6332315753665916, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.17}
+0: /s]
17%|█▋ | 1625/9770 [21:11<1:28:32, 1.53it/s]
17%|█▋ | 1626/9770 [21:12<1:27:26, 1.55it/s]
17%|█▋ | 1627/9770 [21:13<1:28:48, 1.53it/s]
17%|█▋ | 1628/9770 [21:13<1:28:57, 1.53it/s]
17%|█▋ | 1629/9770 [21:14<1:29:06, 1.52it/s]
17%|█▋ | 1630/9770 [21:15<1:29:09, 1.52it/s]
17%|█▋ | 1630/9770 [21:15<1:29:09, 1.52it/s]
17%|█▋ | 1631/9770 [21:15<1:29:30, 1.52it/s]
17%|█▋ | 1632/9770 [21:16<1:30:43, 1.49it/s]
17%|█▋ | 1633/9770 [21:17<1:29:28, 1.52it/s]
17%|█▋ | 1634/9770 [21:17<1:29:41, 1.51it/s]
17%|█▋ | 1635/9770 [21:18<1:31:02, 1.49it/s]
17%|█▋ | 1636/9770 [21:19<1:30:10, 1.50it/s]
17%|█▋ | 1637/9770 [21:19<1:30:08, 1.50it/s]
17%|█▋ | 1638/9770 [21:20<1:29:54, 1.51it/s]
17%|█▋ | 1639/9770 [21:21<1:30:08, 1.50it/s]
17%|█▋ | 1640/9770 [21:21
+0: {'loss': 0.7111, 'grad_norm': 0.7191528013301974, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.17}
+0: {'loss': 0.7021, 'grad_norm': 0.7153026649727555, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.17}
+0: <1:30:08, 1.50it/s]
17%|█▋ | 1640/9770 [21:21<1:30:08, 1.50it/s]
17%|█▋ | 1641/9770 [21:22<1:30:15, 1.50it/s]
17%|█▋ | 1642/9770 [21:23<1:29:19, 1.52it/s]
17%|█▋ | 1643/9770 [21:23<1:30:28, 1.50it/s]
17%|█▋ | 1644/9770 [21:24<1:30:53, 1.49it/s]
17%|█▋ | 1645/9770 [21:25<1:30:18, 1.50it/s]
17%|█▋ | 1646/9770 [21:25<1:30:04, 1.50it/s]
17%|█▋ | 1647/9770 [21:26<1:29:53, 1.51it/s]
17%|█▋ | 1648/9770 [21:27<1:29:41, 1.51it/s]
17%|█▋ | 1649/9770 [21:27<1:29:53, 1.51it/s]
17%|█▋ | 1650/9770 [21:28<1:29:34, 1.51it/s]
17%|█▋ | 1650/9770 [21:28<1:29:34, 1.51it/s]
17%|█▋ | 1651/9770 [21:29<1:28:13, 1.53it/s]
17%|█▋ | 1652/9770 [21:29<1:28:43, 1.52it/s]
17%|█▋ | 1653/9770 [21:30<1:27:59, 1.54it/s]
17%|█▋ | 16
+0: {'loss': 0.6944, 'grad_norm': 0.7051560141949752, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.17}
+0: 54/9770 [21:31<1:28:33, 1.53it/s]
17%|█▋ | 1655/9770 [21:31<1:28:28, 1.53it/s]
17%|█▋ | 1656/9770 [21:32<1:29:01, 1.52it/s]
17%|█▋ | 1657/9770 [21:33<1:29:06, 1.52it/s]
17%|█▋ | 1658/9770 [21:33<1:29:27, 1.51it/s]
17%|█▋ | 1659/9770 [21:34<1:28:30, 1.53it/s]
17%|█▋ | 1660/9770 [21:34<1:27:31, 1.54it/s]
17%|█▋ | 1660/9770 [21:34<1:27:31, 1.54it/s]
17%|█▋ | 1661/9770 [21:35<1:29:21, 1.51it/s]
17%|█▋ | 1662/9770 [21:36<1:29:41, 1.51it/s]
17%|█▋ | 1663/9770 [21:37<1:29:33, 1.51it/s]
17%|█▋ | 1664/9770 [21:37<1:29:10, 1.51it/s]
17%|█▋ | 1665/9770 [21:38<1:29:10, 1.51it/s]
17%|█▋ | 1666/9770 [21:38<1:28:05, 1.53it/s]
17%|█▋ | 1667/9770 [21:39<1:27:32, 1.54it/s]
17%|█▋ | 1668/9770 [21:40<1:29:46, 1.50it/s]
17%|█▋ | 1669/9770 [21:40<1:28:19, 1.53it/s]
17%|�
+0: {'loss': 0.6988, 'grad_norm': 0.6886653408918136, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.17}
+0: {'loss': 0.7254, 'grad_norm': 0.6490327812962825, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.17}
+0: ��▋ | 1670/9770 [21:41<1:28:54, 1.52it/s]
17%|█▋ | 1670/9770 [21:41<1:28:54, 1.52it/s]
17%|█▋ | 1671/9770 [21:42<1:29:38, 1.51it/s]
17%|█▋ | 1672/9770 [21:42<1:31:38, 1.47it/s]
17%|█▋ | 1673/9770 [21:43<1:30:10, 1.50it/s]
17%|█▋ | 1674/9770 [21:44<1:29:41, 1.50it/s]
17%|█▋ | 1675/9770 [21:44<1:30:01, 1.50it/s]
17%|█▋ | 1676/9770 [21:45<1:29:03, 1.51it/s]
17%|█▋ | 1677/9770 [21:46<1:28:54, 1.52it/s]
17%|█▋ | 1678/9770 [21:46<1:29:22, 1.51it/s]
17%|█▋ | 1679/9770 [21:47<1:28:40, 1.52it/s]
17%|█▋ | 1680/9770 [21:48<1:28:43, 1.52it/s]
17%|█▋ | 1680/9770 [21:48<1:28:43, 1.52it/s]
17%|█▋ | 1681/9770 [21:48<1:28:28, 1.52it/s]
17%|█▋ | 1682/9770 [21:49<1:27:30, 1.54it/s]
17%|█▋ | 1683/9770 [21:50<1:28:06, 1.
+0: {'loss': 0.7077, 'grad_norm': 0.7222801303406803, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.17}
+0: 53it/s]
17%|█▋ | 1684/9770 [21:50<1:28:48, 1.52it/s]
17%|█▋ | 1685/9770 [21:51<1:28:59, 1.51it/s]
17%|█▋ | 1686/9770 [21:52<1:29:03, 1.51it/s]
17%|█▋ | 1687/9770 [21:52<1:28:23, 1.52it/s]
17%|█▋ | 1688/9770 [21:53<1:27:41, 1.54it/s]
17%|█▋ | 1689/9770 [21:54<1:27:27, 1.54it/s]
17%|█▋ | 1690/9770 [21:54<1:27:48, 1.53it/s]
17%|█▋ | 1690/9770 [21:54<1:27:48, 1.53it/s]
17%|█▋ | 1691/9770 [21:55<1:29:56, 1.50it/s]
17%|█▋ | 1692/9770 [21:56<1:28:50, 1.52it/s]
17%|█▋ | 1693/9770 [21:56<1:28:22, 1.52it/s]
17%|█▋ | 1694/9770 [21:57<1:27:38, 1.54it/s]
17%|█▋ | 1695/9770 [21:58<1:28:24, 1.52it/s]
17%|█▋ | 1696/9770 [21:58<1:28:09, 1.53it/s]
17%|█▋ | 1697/9770 [21:59<1:26:58, 1.55it/s]
17%|█▋ | 1698/9770 [21:59<1:26:22, 1.56it/s]
17%|█▋ | 1699/9770 [2
+0: {'loss': 0.6953, 'grad_norm': 0.6311012069047182, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.17}
+0: {'loss': 0.7328, 'grad_norm': 0.6981305042612962, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.17}
+0: 2:00<1:27:24, 1.54it/s]
17%|█▋ | 1700/9770 [22:01<1:29:40, 1.50it/s]
17%|█▋ | 1700/9770 [22:01<1:29:40, 1.50it/s]
17%|█▋ | 1701/9770 [22:02<1:30:12, 1.49it/s]
17%|█▋ | 1702/9770 [22:02<1:29:29, 1.50it/s]
17%|█▋ | 1703/9770 [22:03<1:29:05, 1.51it/s]
17%|█▋ | 1704/9770 [22:03<1:28:39, 1.52it/s]
17%|█▋ | 1705/9770 [22:04<1:28:38, 1.52it/s]
17%|█▋ | 1706/9770 [22:05<1:27:52, 1.53it/s]
17%|█▋ | 1707/9770 [22:05<1:27:39, 1.53it/s]
17%|█▋ | 1708/9770 [22:06<1:28:18, 1.52it/s]
17%|█▋ | 1709/9770 [22:07<1:28:33, 1.52it/s]
18%|█▊ | 1710/9770 [22:07<1:30:01, 1.49it/s]
18%|█▊ | 1710/9770 [22:07<1:30:01, 1.49it/s]
18%|█▊ | 1711/9770 [22:08<1:30:04, 1.49it/s]
18%|█▊ | 1712/9770 [22:09<1:28:48, 1.51it/s]
18%|█▊
+0: {'loss': 0.6989, 'grad_norm': 0.6612029966787886, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.18}
+0: | 1713/9770 [22:09<1:29:09, 1.51it/s]
18%|█▊ | 1714/9770 [22:10<1:27:46, 1.53it/s]
18%|█▊ | 1715/9770 [22:11<1:28:34, 1.52it/s]
18%|█▊ | 1716/9770 [22:11<1:28:29, 1.52it/s]
18%|█▊ | 1717/9770 [22:12<1:28:26, 1.52it/s]
18%|█▊ | 1718/9770 [22:13<1:27:47, 1.53it/s]
18%|█▊ | 1719/9770 [22:13<1:27:23, 1.54it/s]
18%|█▊ | 1720/9770 [22:14<1:28:50, 1.51it/s]
18%|█▊ | 1720/9770 [22:14<1:28:50, 1.51it/s]
18%|█▊ | 1721/9770 [22:15<1:29:26, 1.50it/s]
18%|█▊ | 1722/9770 [22:15<1:28:31, 1.52it/s]
18%|█▊ | 1723/9770 [22:16<1:27:40, 1.53it/s]
18%|█▊ | 1724/9770 [22:17<1:28:25, 1.52it/s]
18%|█▊ | 1725/9770 [22:17<1:29:07, 1.50it/s]
18%|█▊ | 1726/9770 [22:18<1:31:03, 1.47it/s]
18%|█▊ | 1727/9770 [22:19<1:30:12, 1.49it/s]
18%|█▊ | 1728/9770 [22:19<1:29:03, 1.51it/s]
1
+0: {'loss': 0.738, 'grad_norm': 0.6601662974172445, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.18}
+0: {'loss': 0.705, 'grad_norm': 0.6416467570673219, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.18}
+0: 8%|█▊ | 1729/9770 [22:20<1:28:58, 1.51it/s]
18%|█▊ | 1730/9770 [22:21<1:28:58, 1.51it/s]
18%|█▊ | 1730/9770 [22:21<1:28:58, 1.51it/s]
18%|█▊ | 1731/9770 [22:21<1:28:52, 1.51it/s]
18%|█▊ | 1732/9770 [22:22<1:28:40, 1.51it/s]
18%|█▊ | 1733/9770 [22:23<1:28:40, 1.51it/s]
18%|█▊ | 1734/9770 [22:23<1:27:51, 1.52it/s]
18%|█▊ | 1735/9770 [22:24<1:26:47, 1.54it/s]
18%|█▊ | 1736/9770 [22:25<1:26:42, 1.54it/s]
18%|█▊ | 1737/9770 [22:25<1:27:00, 1.54it/s]
18%|█▊ | 1738/9770 [22:26<1:27:38, 1.53it/s]
18%|█▊ | 1739/9770 [22:27<1:29:35, 1.49it/s]
18%|█▊ | 1740/9770 [22:27<1:28:07, 1.52it/s]
18%|█▊ | 1740/9770 [22:27<1:28:07, 1.52it/s]
18%|█▊ | 1741/9770 [22:28<1:27:13, 1.53it/s]
18%|█▊ | 1742/9770 [22:29<1:27:09,
+0: {'loss': 0.6936, 'grad_norm': 0.6436643395755568, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.18}
+0: 1.54it/s]
18%|█▊ | 1743/9770 [22:29<1:26:35, 1.55it/s]
18%|█▊ | 1744/9770 [22:30<1:27:24, 1.53it/s]
18%|█▊ | 1745/9770 [22:30<1:26:24, 1.55it/s]
18%|█▊ | 1746/9770 [22:31<1:27:24, 1.53it/s]
18%|█▊ | 1747/9770 [22:32<1:27:27, 1.53it/s]
18%|█▊ | 1748/9770 [22:32<1:27:26, 1.53it/s]
18%|█▊ | 1749/9770 [22:33<1:27:20, 1.53it/s]
18%|█▊ | 1750/9770 [22:34<1:28:04, 1.52it/s]
18%|█▊ | 1750/9770 [22:34<1:28:04, 1.52it/s]
18%|█▊ | 1751/9770 [22:34<1:27:47, 1.52it/s]
18%|█▊ | 1752/9770 [22:35<1:30:07, 1.48it/s]
18%|█▊ | 1753/9770 [22:36<1:28:40, 1.51it/s]
18%|█▊ | 1754/9770 [22:36<1:27:51, 1.52it/s]
18%|█▊ | 1755/9770 [22:37<1:27:48, 1.52it/s]
18%|█▊ | 1756/9770 [22:38<1:27:02, 1.53it/s]
18%|█▊ | 1757/9770 [22:38<1:27:37, 1.52it/s]
18%|█▊ | 1758/977
+0: {'loss': 0.6818, 'grad_norm': 0.6367935485032978, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.18}
+0: {'loss': 0.7111, 'grad_norm': 0.6730334998061857, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.18}
+0: 0 [22:39<1:27:22, 1.53it/s]
18%|█▊ | 1759/9770 [22:40<1:27:09, 1.53it/s]
18%|█▊ | 1760/9770 [22:40<1:26:20, 1.55it/s]
18%|█▊ | 1760/9770 [22:40<1:26:20, 1.55it/s]
18%|█▊ | 1761/9770 [22:41<1:26:47, 1.54it/s]
18%|█▊ | 1762/9770 [22:42<1:26:21, 1.55it/s]
18%|█▊ | 1763/9770 [22:42<1:28:18, 1.51it/s]
18%|█▊ | 1764/9770 [22:43<1:28:08, 1.51it/s]
18%|█▊ | 1765/9770 [22:44<1:27:40, 1.52it/s]
18%|█▊ | 1766/9770 [22:44<1:27:45, 1.52it/s]
18%|█▊ | 1767/9770 [22:45<1:27:35, 1.52it/s]
18%|█▊ | 1768/9770 [22:46<1:28:28, 1.51it/s]
18%|█▊ | 1769/9770 [22:46<1:29:08, 1.50it/s]
18%|█▊ | 1770/9770 [22:47<1:29:04, 1.50it/s]
18%|█▊ | 1770/9770 [22:47<1:29:04, 1.50it/s]
18%|█▊ | 1771/9770 [22:48<1:27:54, 1.52it/s]
18%|█▊
+0: {'loss': 0.7138, 'grad_norm': 0.6697989761249823, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.18}
+0: | 1772/9770 [22:48<1:29:32, 1.49it/s]
18%|█▊ | 1773/9770 [22:49<1:28:26, 1.51it/s]
18%|█▊ | 1774/9770 [22:50<1:28:00, 1.51it/s]
18%|█▊ | 1775/9770 [22:50<1:27:27, 1.52it/s]
18%|█▊ | 1776/9770 [22:51<1:27:36, 1.52it/s]
18%|█▊ | 1777/9770 [22:52<1:27:10, 1.53it/s]
18%|█▊ | 1778/9770 [22:52<1:26:42, 1.54it/s]
18%|█▊ | 1779/9770 [22:53<1:26:56, 1.53it/s]
18%|█▊ | 1780/9770 [22:54<1:28:01, 1.51it/s]
18%|█▊ | 1780/9770 [22:54<1:28:01, 1.51it/s]
18%|█▊ | 1781/9770 [22:54<1:28:52, 1.50it/s]
18%|█▊ | 1782/9770 [22:55<1:28:14, 1.51it/s]
18%|█▊ | 1783/9770 [22:56<1:27:57, 1.51it/s]
18%|█▊ | 1784/9770 [22:56<1:27:36, 1.52it/s]
18%|█▊ | 1785/9770 [22:57<1:28:02, 1.51it/s]
18%|█▊ | 1786/9770 [22:57<1:26:47, 1.53it/s]
18%|█▊ | 1787/9770 [22:58<1:27:36, 1.52it/s
+0: {'loss': 0.7088, 'grad_norm': 0.6516163848532479, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.18}
+0: {'loss': 0.7142, 'grad_norm': 0.6350922904457226, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.18}
+0: ]
18%|█▊ | 1788/9770 [22:59<1:29:25, 1.49it/s]
18%|█▊ | 1789/9770 [23:00<1:30:00, 1.48it/s]
18%|█▊ | 1790/9770 [23:00<1:28:43, 1.50it/s]
18%|█▊ | 1790/9770 [23:00<1:28:43, 1.50it/s]
18%|█▊ | 1791/9770 [23:01<1:30:09, 1.47it/s]
18%|█▊ | 1792/9770 [23:02<1:31:27, 1.45it/s]
18%|█▊ | 1793/9770 [23:02<1:30:47, 1.46it/s]
18%|█▊ | 1794/9770 [23:03<1:30:07, 1.47it/s]
18%|█▊ | 1795/9770 [23:04<1:28:30, 1.50it/s]
18%|█▊ | 1796/9770 [23:04<1:28:14, 1.51it/s]
18%|█▊ | 1797/9770 [23:05<1:28:22, 1.50it/s]
18%|█▊ | 1798/9770 [23:06<1:28:13, 1.51it/s]
18%|█▊ | 1799/9770 [23:06<1:28:12, 1.51it/s]
18%|█▊ | 1800/9770 [23:07<1:27:34, 1.52it/s]
18%|█▊ | 1800/9770 [23:07<1:27:34, 1.52it/s]
18%|█▊ | 1801/9770 [23:08<1:27
+0: {'loss': 0.6841, 'grad_norm': 0.6553509587727674, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.19}
+0: :56, 1.51it/s]
18%|█▊ | 1802/9770 [23:08<1:27:00, 1.53it/s]
18%|█▊ | 1803/9770 [23:09<1:27:28, 1.52it/s]
18%|█▊ | 1804/9770 [23:10<1:28:44, 1.50it/s]
18%|█▊ | 1805/9770 [23:10<1:28:51, 1.49it/s]
18%|█▊ | 1806/9770 [23:11<1:27:41, 1.51it/s]
18%|█▊ | 1807/9770 [23:12<1:27:35, 1.52it/s]
19%|█▊ | 1808/9770 [23:12<1:26:26, 1.54it/s]
19%|█▊ | 1809/9770 [23:13<1:26:30, 1.53it/s]
19%|█▊ | 1810/9770 [23:13<1:26:05, 1.54it/s]
19%|█▊ | 1810/9770 [23:13<1:26:05, 1.54it/s]
19%|█▊ | 1811/9770 [23:14<1:26:17, 1.54it/s]
19%|█▊ | 1812/9770 [23:15<1:27:29, 1.52it/s]
19%|█▊ | 1813/9770 [23:15<1:26:43, 1.53it/s]
19%|█▊ | 1814/9770 [23:16<1:26:54, 1.53it/s]
19%|█▊ | 1815/9770 [23:17<1:26:59, 1.52it/s]
19%|█▊ | 1816/9770 [23:17<1:27:25, 1.52it/s]
19%|█▊ | 1817
+0: {'loss': 0.714, 'grad_norm': 0.6730232319323477, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.19}
+0: {'loss': 0.7017, 'grad_norm': 0.6546312936833185, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.19}
+0: /9770 [23:18<1:26:43, 1.53it/s]
19%|█▊ | 1818/9770 [23:19<1:26:17, 1.54it/s]
19%|█▊ | 1819/9770 [23:19<1:28:01, 1.51it/s]
19%|█▊ | 1820/9770 [23:20<1:27:55, 1.51it/s]
19%|█▊ | 1820/9770 [23:20<1:27:55, 1.51it/s]
19%|█▊ | 1821/9770 [23:21<1:28:29, 1.50it/s]
19%|█▊ | 1822/9770 [23:21<1:28:16, 1.50it/s]
19%|█▊ | 1823/9770 [23:22<1:28:06, 1.50it/s]
19%|█▊ | 1824/9770 [23:23<1:26:51, 1.52it/s]
19%|█▊ | 1825/9770 [23:23<1:26:42, 1.53it/s]
19%|█▊ | 1826/9770 [23:24<1:25:48, 1.54it/s]
19%|█▊ | 1827/9770 [23:25<1:26:13, 1.54it/s]
19%|█▊ | 1828/9770 [23:25<1:26:53, 1.52it/s]
19%|█▊ | 1829/9770 [23:26<1:28:15, 1.50it/s]
19%|█▊ | 1830/9770 [23:27<1:26:58, 1.52it/s]
19%|█▊ | 1830/9770 [23:27<1:26:58, 1.52it/s]
19%|█▊
+0: {'loss': 0.708, 'grad_norm': 0.6411598866366788, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.19}
+0: | 1831/9770 [23:27<1:27:17, 1.52it/s]
19%|█▉ | 1832/9770 [23:28<1:27:32, 1.51it/s]
19%|█▉ | 1833/9770 [23:29<1:27:37, 1.51it/s]
19%|█▉ | 1834/9770 [23:29<1:27:00, 1.52it/s]
19%|█▉ | 1835/9770 [23:30<1:27:16, 1.52it/s]
19%|█▉ | 1836/9770 [23:31<1:27:18, 1.51it/s]
19%|█▉ | 1837/9770 [23:31<1:27:37, 1.51it/s]
19%|█▉ | 1838/9770 [23:32<1:27:24, 1.51it/s]
19%|█▉ | 1839/9770 [23:33<1:26:47, 1.52it/s]
19%|█▉ | 1840/9770 [23:33<1:26:24, 1.53it/s]
19%|█▉ | 1840/9770 [23:33<1:26:24, 1.53it/s]
19%|█▉ | 1841/9770 [23:34<1:26:19, 1.53it/s]
19%|█▉ | 1842/9770 [23:35<1:28:13, 1.50it/s]
19%|█▉ | 1843/9770 [23:35<1:28:29, 1.49it/s]
19%|█▉ | 1844/9770 [23:36<1:28:42, 1.49it/s]
19%|█▉ | 1845/9770 [23:37<1:27:43, 1.51it/s]
19%|█▉ | 1846/9770 [23:37<1:28:57, 1.48
+0: {'loss': 0.7125, 'grad_norm': 0.6575170336315558, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.19}
+0: {'loss': 0.6845, 'grad_norm': 0.6860193865897046, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.19}
+0: it/s]
19%|█▉ | 1847/9770 [23:38<1:29:15, 1.48it/s]
19%|█▉ | 1848/9770 [23:39<1:28:36, 1.49it/s]
19%|█▉ | 1849/9770 [23:39<1:28:17, 1.50it/s]
19%|█▉ | 1850/9770 [23:40<1:29:57, 1.47it/s]
19%|█▉ | 1850/9770 [23:40<1:29:57, 1.47it/s]
19%|█▉ | 1851/9770 [23:41<1:28:06, 1.50it/s]
19%|█▉ | 1852/9770 [23:41<1:28:15, 1.50it/s]
19%|█▉ | 1853/9770 [23:42<1:27:01, 1.52it/s]
19%|█▉ | 1854/9770 [23:43<1:26:07, 1.53it/s]
19%|█▉ | 1855/9770 [23:43<1:26:53, 1.52it/s]
19%|█▉ | 1856/9770 [23:44<1:27:05, 1.51it/s]
19%|█▉ | 1857/9770 [23:45<1:28:19, 1.49it/s]
19%|█▉ | 1858/9770 [23:45<1:26:57, 1.52it/s]
19%|█▉ | 1859/9770 [23:46<1:26:08, 1.53it/s]
19%|█▉ | 1860/9770 [23:47<1:26:40, 1.52it/s]
19%|█▉ | 1860/9770 [23:47<
+0: {'loss': 0.7005, 'grad_norm': 0.7450953538179935, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.19}
+0: 1:26:40, 1.52it/s]
19%|█▉ | 1861/9770 [23:47<1:27:34, 1.51it/s]
19%|█▉ | 1862/9770 [23:48<1:28:40, 1.49it/s]
19%|█▉ | 1863/9770 [23:49<1:27:09, 1.51it/s]
19%|█▉ | 1864/9770 [23:49<1:28:40, 1.49it/s]
19%|█▉ | 1865/9770 [23:50<1:27:56, 1.50it/s]
19%|█▉ | 1866/9770 [23:51<1:28:26, 1.49it/s]
19%|█▉ | 1867/9770 [23:51<1:28:53, 1.48it/s]
19%|█▉ | 1868/9770 [23:52<1:28:16, 1.49it/s]
19%|█▉ | 1869/9770 [23:53<1:27:56, 1.50it/s]
19%|█▉ | 1870/9770 [23:53<1:28:36, 1.49it/s]
19%|█▉ | 1870/9770 [23:53<1:28:36, 1.49it/s]
19%|█▉ | 1871/9770 [23:54<1:28:51, 1.48it/s]
19%|█▉ | 1872/9770 [23:55<1:29:51, 1.46it/s]
19%|█▉ | 1873/9770 [23:55<1:28:06, 1.49it/s]
19%|█▉ | 1874/9770 [23:56<1:27:59, 1.50it/s]
19%|█▉ | 1875/9770 [23:57<1:27:04, 1.51it/s]
19%|█▉ |
+0: {'loss': 0.694, 'grad_norm': 0.6907586729724744, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.19}
+0: {'loss': 0.6943, 'grad_norm': 0.6653968402488354, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.19}
+0: 1876/9770 [23:57<1:26:31, 1.52it/s]
19%|█▉ | 1877/9770 [23:58<1:26:20, 1.52it/s]
19%|█▉ | 1878/9770 [23:59<1:25:52, 1.53it/s]
19%|█▉ | 1879/9770 [23:59<1:26:13, 1.53it/s]
19%|█▉ | 1880/9770 [24:00<1:26:31, 1.52it/s]
19%|█▉ | 1880/9770 [24:00<1:26:31, 1.52it/s]
19%|█▉ | 1881/9770 [24:01<1:27:19, 1.51it/s]
19%|█▉ | 1882/9770 [24:01<1:27:24, 1.50it/s]
19%|█▉ | 1883/9770 [24:02<1:28:04, 1.49it/s]
19%|█▉ | 1884/9770 [24:03<1:26:54, 1.51it/s]
19%|█▉ | 1885/9770 [24:03<1:26:05, 1.53it/s]
19%|█▉ | 1886/9770 [24:04<1:25:06, 1.54it/s]
19%|█▉ | 1887/9770 [24:04<1:24:45, 1.55it/s]
19%|█▉ | 1888/9770 [24:05<1:25:10, 1.54it/s]
19%|█▉ | 1889/9770 [24:06<1:24:25, 1.56it/s]
19%|█▉ | 1890/9770 [24:06<1:25:14, 1.54it/s]
19%|�
+0: {'loss': 0.7169, 'grad_norm': 0.6579165805236343, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.19}
+0: �▉ | 1890/9770 [24:06<1:25:14, 1.54it/s]
19%|█▉ | 1891/9770 [24:07<1:26:20, 1.52it/s]
19%|█▉ | 1892/9770 [24:08<1:25:57, 1.53it/s]
19%|█▉ | 1893/9770 [24:08<1:25:48, 1.53it/s]
19%|█▉ | 1894/9770 [24:09<1:25:58, 1.53it/s]
19%|█▉ | 1895/9770 [24:10<1:25:41, 1.53it/s]
19%|█▉ | 1896/9770 [24:10<1:26:25, 1.52it/s]
19%|█▉ | 1897/9770 [24:11<1:26:55, 1.51it/s]
19%|█▉ | 1898/9770 [24:12<1:26:34, 1.52it/s]
19%|█▉ | 1899/9770 [24:12<1:25:34, 1.53it/s]
19%|█▉ | 1900/9770 [24:13<1:25:41, 1.53it/s]
19%|█▉ | 1900/9770 [24:13<1:25:41, 1.53it/s]
19%|█▉ | 1901/9770 [24:14<1:27:20, 1.50it/s]
19%|█▉ | 1902/9770 [24:14<1:26:14, 1.52it/s]
19%|█▉ | 1903/9770 [24:15<1:25:11, 1.54it/s]
19%|█▉ | 1904/9770 [24:16<1:25:14, 1.54it/s]
19%|█▉ | 1905/9770 [24:16<1:27:35,
+0: {'loss': 0.7117, 'grad_norm': 0.6814850674583643, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.2}
+0: 1.50it/s]
20%|█▉ | 1906/9770 [24:17<1:27:22, 1.50it/s]
20%|█▉ | 1907/9770 [24:18<1:27:22, 1.50it/s]
20%|█▉ | 1908/9770 [24:18<1:27:35, 1.50it/s]
20%|█▉ | 1909/9770 [24:19<1:26:30, 1.51it/s]
20%|█▉ | 1910/9770 [24:20<1:25:31, 1.53it/s]
20%|█▉ | 1910/9770 [24:20<1:25:31, 1.53it/s]
20%|█▉ | 1911/9770 [24:20<1:26:00, 1.52it/s]
20%|█▉ | 1912/9770 [24:21<1:27:38, 1.49it/s]
20%|█▉ | 1913/9770 [24:22<1:26:23, 1.52it/s]
20%|█▉ | 1914/9770 [24:22<1:26:24, 1.52it/s]
20%|█▉ | 1915/9770 [24:23<1:25:33, 1.53it/s]
20%|█▉ | 1916/9770 [24:24<1:26:33, 1.51it/s]
20%|█▉ | 1917/9770 [24:24<1:26:21, 1.52it/s]
20%|█▉ | 1918/9770 [24:25<1:26:24, 1.51it/s]
20%|█▉ | 1919/9770 [24:26<1:26:30, 1.51it/s]
20%|█▉ | 1920/9770 [24:26<1:25:34, 1.53it/s]
+0: {'loss': 0.7129, 'grad_norm': 0.666641152490347, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.2}
+0: {'loss': 0.7242, 'grad_norm': 0.7500278717182322, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.2}
+0:
20%|█▉ | 1920/9770 [24:26<1:25:34, 1.53it/s]
20%|█▉ | 1921/9770 [24:27<1:27:19, 1.50it/s]
20%|█▉ | 1922/9770 [24:27<1:25:54, 1.52it/s]
20%|█▉ | 1923/9770 [24:28<1:27:28, 1.50it/s]
20%|█▉ | 1924/9770 [24:29<1:27:16, 1.50it/s]
20%|█▉ | 1925/9770 [24:30<1:27:01, 1.50it/s]
20%|█▉ | 1926/9770 [24:30<1:26:18, 1.51it/s]
20%|█▉ | 1927/9770 [24:31<1:26:04, 1.52it/s]
20%|█▉ | 1928/9770 [24:31<1:25:54, 1.52it/s]
20%|█▉ | 1929/9770 [24:32<1:25:29, 1.53it/s]
20%|█▉ | 1930/9770 [24:33<1:27:43, 1.49it/s]
20%|█▉ | 1930/9770 [24:33<1:27:43, 1.49it/s]
20%|█▉ | 1931/9770 [24:33<1:27:18, 1.50it/s]
20%|█▉ | 1932/9770 [24:34<1:26:31, 1.51it/s]
20%|█▉ | 1933/9770 [24:35<1:26:51, 1.50it/s]
20%|█▉ | 1934/9770 [24:35<1:26:34, 1.51it/s]
20%|█▉
+0: {'loss': 0.7059, 'grad_norm': 0.7111800869539243, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.2}
+0: | 1935/9770 [24:36<1:26:22, 1.51it/s]
20%|█▉ | 1936/9770 [24:37<1:25:59, 1.52it/s]
20%|█▉ | 1937/9770 [24:37<1:25:14, 1.53it/s]
20%|█▉ | 1938/9770 [24:38<1:25:24, 1.53it/s]
20%|█▉ | 1939/9770 [24:39<1:24:40, 1.54it/s]
20%|█▉ | 1940/9770 [24:39<1:25:36, 1.52it/s]
20%|█▉ | 1940/9770 [24:39<1:25:36, 1.52it/s]
20%|█▉ | 1941/9770 [24:40<1:26:04, 1.52it/s]
20%|█▉ | 1942/9770 [24:41<1:26:05, 1.52it/s]
20%|█▉ | 1943/9770 [24:41<1:27:37, 1.49it/s]
20%|█▉ | 1944/9770 [24:42<1:27:19, 1.49it/s]
20%|█▉ | 1945/9770 [24:43<1:26:46, 1.50it/s]
20%|█▉ | 1946/9770 [24:43<1:25:30, 1.53it/s]
20%|█▉ | 1947/9770 [24:44<1:25:55, 1.52it/s]
20%|█▉ | 1948/9770 [24:45<1:25:22, 1.53it/s]
20%|█▉ | 1949/9770 [24:45<1:25:30, 1.52it/s]
20%|█▉ | 1950/9770 [24:46<1:25:17, 1.53it/s]
+0: {'loss': 0.6949, 'grad_norm': 0.6640638859953982, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.2}
+0: [2025-09-02 20:20:50,539] [INFO] [axolotl.core.trainers.base._save:613] [PID:3622631] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1/checkpoint-1954[39m
+0: [2025-09-02 20:20:51,388] [INFO] [axolotl.core.trainers.base._save:662] [PID:3622631] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m
+0: {'loss': 0.6927, 'grad_norm': 0.6565078425741968, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.2}
+0:
20%|█▉ | 1950/9770 [24:46<1:25:17, 1.53it/s]
20%|█▉ | 1951/9770 [24:47<1:24:52, 1.54it/s]
20%|█▉ | 1952/9770 [24:47<1:24:47, 1.54it/s]
20%|█▉ | 1953/9770 [24:48<1:25:17, 1.53it/s]
20%|██ | 1954/9770 [24:49<1:26:20, 1.51it/s]
20%|██ | 1955/9770 [24:51<2:48:26, 1.29s/it]
20%|██ | 1956/9770 [24:52<2:23:36, 1.10s/it]
20%|██ | 1957/9770 [24:53<2:06:29, 1.03it/s]
20%|██ | 1958/9770 [24:53<1:54:24, 1.14it/s]
20%|██ | 1959/9770 [24:54<1:46:27, 1.22it/s]
20%|██ | 1960/9770 [24:55<1:40:21, 1.30it/s]
20%|██ | 1960/9770 [24:55<1:40:21, 1.30it/s]
20%|██ | 1961/9770 [24:55<1:37:10, 1.34it/s]
20%|██ | 1962/9770 [24:56<1:35:28, 1.36it/s]
20%|██ | 1963/9770 [24:57<1:34:22, 1.38it/s]
20%|██ | 1964/9770 [24:57<1:31:2
+0: {'loss': 0.6842, 'grad_norm': 0.6945081398640743, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.2}
+0: 9, 1.42it/s]
20%|██ | 1965/9770 [24:58<1:29:16, 1.46it/s]
20%|██ | 1966/9770 [24:59<1:26:50, 1.50it/s]
20%|██ | 1967/9770 [24:59<1:26:06, 1.51it/s]
20%|██ | 1968/9770 [25:00<1:26:01, 1.51it/s]
20%|██ | 1969/9770 [25:01<1:25:14, 1.53it/s]
20%|██ | 1970/9770 [25:01<1:24:56, 1.53it/s]
20%|██ | 1970/9770 [25:01<1:24:56, 1.53it/s]
20%|██ | 1971/9770 [25:02<1:25:04, 1.53it/s]
20%|██ | 1972/9770 [25:03<1:25:10, 1.53it/s]
20%|██ | 1973/9770 [25:03<1:24:18, 1.54it/s]
20%|██ | 1974/9770 [25:04<1:25:15, 1.52it/s]
20%|██ | 1975/9770 [25:05<1:24:48, 1.53it/s]
20%|██ | 1976/9770 [25:05<1:24:12, 1.54it/s]
20%|██ | 1977/9770 [25:06<1:25:40, 1.52it/s]
20%|██ | 1978/9770 [25:07<1:25:52, 1.51it/s]
20%|██ | 1979/9770 [25:07<1:25:27, 1.52it/s]
20%|██ | 1980/9
+0: {'loss': 0.6914, 'grad_norm': 0.6543023871657132, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.2}
+0: {'loss': 0.6991, 'grad_norm': 0.6500560980171683, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.2}
+0: 770 [25:08<1:25:19, 1.52it/s]
20%|██ | 1980/9770 [25:08<1:25:19, 1.52it/s]
20%|██ | 1981/9770 [25:09<1:25:43, 1.51it/s]
20%|██ | 1982/9770 [25:09<1:25:42, 1.51it/s]
20%|██ | 1983/9770 [25:10<1:25:29, 1.52it/s]
20%|██ | 1984/9770 [25:11<1:24:46, 1.53it/s]
20%|██ | 1985/9770 [25:11<1:26:35, 1.50it/s]
20%|██ | 1986/9770 [25:12<1:25:44, 1.51it/s]
20%|██ | 1987/9770 [25:12<1:24:14, 1.54it/s]
20%|██ | 1988/9770 [25:13<1:24:56, 1.53it/s]
20%|██ | 1989/9770 [25:14<1:24:16, 1.54it/s]
20%|██ | 1990/9770 [25:14<1:24:46, 1.53it/s]
20%|██ | 1990/9770 [25:14<1:24:46, 1.53it/s]
20%|██ | 1991/9770 [25:15<1:25:18, 1.52it/s]
20%|██ | 1992/9770 [25:16<1:26:14, 1.50it/s]
20%|██ | 1993/9770 [25:17<1:27:23, 1.48it/s]
20%|██
+0: {'loss': 0.7013, 'grad_norm': 0.6539763552325782, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.2}
+0: | 1994/9770 [25:17<1:26:37, 1.50it/s]
20%|██ | 1995/9770 [25:18<1:26:44, 1.49it/s]
20%|██ | 1996/9770 [25:18<1:26:14, 1.50it/s]
20%|██ | 1997/9770 [25:19<1:25:22, 1.52it/s]
20%|██ | 1998/9770 [25:20<1:24:34, 1.53it/s]
20%|██ | 1999/9770 [25:20<1:25:13, 1.52it/s]
20%|██ | 2000/9770 [25:21<1:25:13, 1.52it/s]
20%|██ | 2000/9770 [25:21<1:25:13, 1.52it/s]
20%|██ | 2001/9770 [25:22<1:25:40, 1.51it/s]
20%|██ | 2002/9770 [25:22<1:26:21, 1.50it/s]
21%|██ | 2003/9770 [25:23<1:26:18, 1.50it/s]
21%|██ | 2004/9770 [25:24<1:26:11, 1.50it/s]
21%|██ | 2005/9770 [25:24<1:25:54, 1.51it/s]
21%|██ | 2006/9770 [25:25<1:27:45, 1.47it/s]
21%|██ | 2007/9770 [25:26<1:26:57, 1.49it/s]
21%|██ | 2008/9770 [25:26<1:25:52, 1.51it/s]
21%|██ | 2009/9770 [25:27<1:25:07, 1.52it
+0: {'loss': 0.6943, 'grad_norm': 0.6710374386168445, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.21}
+0: {'loss': 0.7014, 'grad_norm': 0.6612901855942517, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.21}
+0: /s]
21%|██ | 2010/9770 [25:28<1:25:54, 1.51it/s]
21%|██ | 2010/9770 [25:28<1:25:54, 1.51it/s]
21%|██ | 2011/9770 [25:28<1:25:22, 1.51it/s]
21%|██ | 2012/9770 [25:29<1:24:57, 1.52it/s]
21%|██ | 2013/9770 [25:30<1:24:23, 1.53it/s]
21%|██ | 2014/9770 [25:30<1:25:09, 1.52it/s]
21%|██ | 2015/9770 [25:31<1:24:48, 1.52it/s]
21%|██ | 2016/9770 [25:32<1:24:53, 1.52it/s]
21%|██ | 2017/9770 [25:32<1:24:26, 1.53it/s]
21%|██ | 2018/9770 [25:33<1:23:51, 1.54it/s]
21%|██ | 2019/9770 [25:34<1:23:59, 1.54it/s]
21%|██ | 2020/9770 [25:34<1:23:50, 1.54it/s]
21%|██ | 2020/9770 [25:34<1:23:50, 1.54it/s]
21%|██ | 2021/9770 [25:35<1:23:34, 1.55it/s]
21%|██ | 2022/9770 [25:36<1:23:04, 1.55it/s]
21%|██ | 2023/9770 [25:36<1:
+0: {'loss': 0.675, 'grad_norm': 0.6712996353460524, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.21}
+0: 23:32, 1.55it/s]
21%|██ | 2024/9770 [25:37<1:25:49, 1.50it/s]
21%|██ | 2025/9770 [25:38<1:25:28, 1.51it/s]
21%|██ | 2026/9770 [25:38<1:24:50, 1.52it/s]
21%|██ | 2027/9770 [25:39<1:24:25, 1.53it/s]
21%|██ | 2028/9770 [25:40<1:23:49, 1.54it/s]
21%|██ | 2029/9770 [25:40<1:23:00, 1.55it/s]
21%|██ | 2030/9770 [25:41<1:23:07, 1.55it/s]
21%|██ | 2030/9770 [25:41<1:23:07, 1.55it/s]
21%|██ | 2031/9770 [25:41<1:24:03, 1.53it/s]
21%|██ | 2032/9770 [25:42<1:23:31, 1.54it/s]
21%|██ | 2033/9770 [25:43<1:23:16, 1.55it/s]
21%|██ | 2034/9770 [25:43<1:24:05, 1.53it/s]
21%|██ | 2035/9770 [25:44<1:23:37, 1.54it/s]
21%|██ | 2036/9770 [25:45<1:23:35, 1.54it/s]
21%|██ | 2037/9770 [25:45<1:24:19, 1.53it/s]
21%|██ | 2038/9770 [25:46<1:23:09, 1.55it/s]
21%|██ | 20
+0: {'loss': 0.6897, 'grad_norm': 0.6589979758154797, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.21}
+0: {'loss': 0.7257, 'grad_norm': 0.7412955588179325, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.21}
+0: 39/9770 [25:47<1:23:50, 1.54it/s]
21%|██ | 2040/9770 [25:47<1:23:41, 1.54it/s]
21%|██ | 2040/9770 [25:47<1:23:41, 1.54it/s]
21%|██ | 2041/9770 [25:48<1:24:29, 1.52it/s]
21%|██ | 2042/9770 [25:49<1:24:56, 1.52it/s]
21%|██ | 2043/9770 [25:49<1:26:21, 1.49it/s]
21%|██ | 2044/9770 [25:50<1:25:59, 1.50it/s]
21%|██ | 2045/9770 [25:51<1:24:32, 1.52it/s]
21%|██ | 2046/9770 [25:51<1:24:46, 1.52it/s]
21%|██ | 2047/9770 [25:52<1:24:52, 1.52it/s]
21%|██ | 2048/9770 [25:53<1:24:27, 1.52it/s]
21%|██ | 2049/9770 [25:53<1:23:53, 1.53it/s]
21%|██ | 2050/9770 [25:54<1:24:38, 1.52it/s]
21%|██ | 2050/9770 [25:54<1:24:38, 1.52it/s]
21%|██ | 2051/9770 [25:55<1:24:49, 1.52it/s]
21%|██ | 2052/9770 [25:55<1:25:07, 1.51it/s]
21%|█�
+0: {'loss': 0.7084, 'grad_norm': 0.6664743998503643, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.21}
+0: �� | 2053/9770 [25:56<1:24:11, 1.53it/s]
21%|██ | 2054/9770 [25:57<1:24:31, 1.52it/s]
21%|██ | 2055/9770 [25:57<1:24:54, 1.51it/s]
21%|██ | 2056/9770 [25:58<1:26:30, 1.49it/s]
21%|██ | 2057/9770 [25:59<1:26:03, 1.49it/s]
21%|██ | 2058/9770 [25:59<1:25:24, 1.50it/s]
21%|██ | 2059/9770 [26:00<1:23:57, 1.53it/s]
21%|██ | 2060/9770 [26:01<1:24:16, 1.52it/s]
21%|██ | 2060/9770 [26:01<1:24:16, 1.52it/s]
21%|██ | 2061/9770 [26:01<1:23:45, 1.53it/s]
21%|██ | 2062/9770 [26:02<1:23:46, 1.53it/s]
21%|██ | 2063/9770 [26:02<1:23:26, 1.54it/s]
21%|██ | 2064/9770 [26:03<1:23:41, 1.53it/s]
21%|██ | 2065/9770 [26:04<1:23:18, 1.54it/s]
21%|██ | 2066/9770 [26:04<1:23:55, 1.53it/s]
21%|██ | 2067/9770 [26:05<1:24:01, 1.53it/s]
21%|██ | 2068/9770 [26:06<1:23:50, 1.
+0: {'loss': 0.7128, 'grad_norm': 0.6373230557382719, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.21}
+0: {'loss': 0.7016, 'grad_norm': 0.6594369817948404, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.21}
+0: 53it/s]
21%|██ | 2069/9770 [26:06<1:23:26, 1.54it/s]
21%|██ | 2070/9770 [26:07<1:23:43, 1.53it/s]
21%|██ | 2070/9770 [26:07<1:23:43, 1.53it/s]
21%|██ | 2071/9770 [26:08<1:23:09, 1.54it/s]
21%|██ | 2072/9770 [26:08<1:24:12, 1.52it/s]
21%|██ | 2073/9770 [26:09<1:24:22, 1.52it/s]
21%|██ | 2074/9770 [26:10<1:24:11, 1.52it/s]
21%|██ | 2075/9770 [26:10<1:23:27, 1.54it/s]
21%|██ | 2076/9770 [26:11<1:23:08, 1.54it/s]
21%|██▏ | 2077/9770 [26:12<1:23:17, 1.54it/s]
21%|██▏ | 2078/9770 [26:12<1:23:29, 1.54it/s]
21%|██▏ | 2079/9770 [26:13<1:23:10, 1.54it/s]
21%|██▏ | 2080/9770 [26:14<1:23:33, 1.53it/s]
21%|██▏ | 2080/9770 [26:14<1:23:33, 1.53it/s]
21%|██▏ | 2081/9770 [26:14<1:23:11, 1.54it/s]
21%|██▏ | 2
+0: {'loss': 0.6942, 'grad_norm': 0.6404038862051435, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.21}
+0: 082/9770 [26:15<1:23:07, 1.54it/s]
21%|██▏ | 2083/9770 [26:15<1:22:42, 1.55it/s]
21%|██▏ | 2084/9770 [26:16<1:23:16, 1.54it/s]
21%|██▏ | 2085/9770 [26:17<1:22:29, 1.55it/s]
21%|██▏ | 2086/9770 [26:17<1:24:51, 1.51it/s]
21%|██▏ | 2087/9770 [26:18<1:23:56, 1.53it/s]
21%|██▏ | 2088/9770 [26:19<1:24:22, 1.52it/s]
21%|██▏ | 2089/9770 [26:19<1:23:12, 1.54it/s]
21%|██▏ | 2090/9770 [26:20<1:23:21, 1.54it/s]
21%|██▏ | 2090/9770 [26:20<1:23:21, 1.54it/s]
21%|██▏ | 2091/9770 [26:21<1:23:53, 1.53it/s]
21%|██▏ | 2092/9770 [26:21<1:23:50, 1.53it/s]
21%|██▏ | 2093/9770 [26:22<1:23:21, 1.54it/s]
21%|██▏ | 2094/9770 [26:23<1:22:56, 1.54it/s]
21%|██▏ | 2095/9770 [26:23<1:22:27, 1.55it/s]
21%|██▏ | 2096/9770 [26:24<1:23:35, 1.53it/s]
21%|██▏ | 2097/9770
+0: {'loss': 0.719, 'grad_norm': 0.6519542506170287, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.21}
+0: {'loss': 0.6835, 'grad_norm': 0.6536422816732395, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.22}
+0: [26:25<1:25:15, 1.50it/s]
21%|██▏ | 2098/9770 [26:25<1:24:44, 1.51it/s]
21%|██▏ | 2099/9770 [26:26<1:23:35, 1.53it/s]
21%|██▏ | 2100/9770 [26:27<1:23:23, 1.53it/s]
21%|██▏ | 2100/9770 [26:27<1:23:23, 1.53it/s]
22%|██▏ | 2101/9770 [26:27<1:22:21, 1.55it/s]
22%|██▏ | 2102/9770 [26:28<1:21:54, 1.56it/s]
22%|██▏ | 2103/9770 [26:29<1:22:37, 1.55it/s]
22%|██▏ | 2104/9770 [26:29<1:22:22, 1.55it/s]
22%|██▏ | 2105/9770 [26:30<1:22:05, 1.56it/s]
22%|██▏ | 2106/9770 [26:30<1:21:58, 1.56it/s]
22%|██▏ | 2107/9770 [26:31<1:22:50, 1.54it/s]
22%|██▏ | 2108/9770 [26:32<1:23:10, 1.54it/s]
22%|██▏ | 2109/9770 [26:32<1:23:23, 1.53it/s]
22%|██▏ | 2110/9770 [26:33<1:24:11, 1.52it/s]
22%|██▏ | 2110/9770 [26:33<1:24:11
+0: {'loss': 0.6961, 'grad_norm': 0.6535060663386806, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.22}
+0: , 1.52it/s]
22%|██▏ | 2111/9770 [26:34<1:26:00, 1.48it/s]
22%|██▏ | 2112/9770 [26:34<1:25:48, 1.49it/s]
22%|██▏ | 2113/9770 [26:35<1:25:42, 1.49it/s]
22%|██▏ | 2114/9770 [26:36<1:25:22, 1.49it/s]
22%|██▏ | 2115/9770 [26:36<1:24:45, 1.51it/s]
22%|██▏ | 2116/9770 [26:37<1:23:58, 1.52it/s]
22%|██▏ | 2117/9770 [26:38<1:23:57, 1.52it/s]
22%|██▏ | 2118/9770 [26:38<1:23:42, 1.52it/s]
22%|██▏ | 2119/9770 [26:39<1:24:18, 1.51it/s]
22%|██▏ | 2120/9770 [26:40<1:24:42, 1.51it/s]
22%|██▏ | 2120/9770 [26:40<1:24:42, 1.51it/s]
22%|██▏ | 2121/9770 [26:40<1:25:42, 1.49it/s]
22%|██▏ | 2122/9770 [26:41<1:26:53, 1.47it/s]
22%|██▏ | 2123/9770 [26:42<1:25:22, 1.49it/s]
22%|██▏ | 2124/9770 [26:42<1:24:40, 1.51it/s]
22%|██▏ | 2125/9770 [26:43<1:24:35, 1.51it
+0: {'loss': 0.6927, 'grad_norm': 0.5960848397298177, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.22}
+0: /s]
22%|██▏ | 2126/9770 [26:44<1:23:22, 1.53it/s]
22%|██▏ | 2127/9770 [26:44<1:24:27, 1.51it/s]
22%|██▏ | 2128/9770 [26:45<1:23:51, 1.52it/s]
22%|██▏ | 2129/9770 [26:46<1:23:42, 1.52it/s]
22%|██▏ | 2130/9770 [26:46<1:24:19, 1.51it/s]
22%|██▏ | 2130/9770 [26:46<1:24:19, 1.51it/s]
22%|██▏ | 2131/9770 [26:47<1:23:34, 1.52it/s]
22%|██▏ | 2132/9770 [26:48<1:25:30, 1.49it/s]
22%|██▏ | 2133/9770 [26:48<1:25:10, 1.49it/s]
22%|██▏ | 2134/9770 [26:49<1:24:39, 1.50it/s]
22%|██▏ | 2135/9770 [26:50<1:24:30, 1.51it/s]
22%|██▏ | 2136/9770 [26:50<1:24:58, 1.50it/s]
22%|██▏ | 2137/9770 [26:51<1:24:38, 1.50it/s]
22%|██▏ | 2138/9770 [26:52<1:24:46, 1.50it/s]
22%|██▏ | 2139/9770 [26:52<1:24:41, 1.50it/s]
22%|██▏ | 2140/9770 [26:53<1:24:42, 1.50it/s]
+0: {'loss': 0.6726, 'grad_norm': 0.6985278723307764, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.22}
+0: {'loss': 0.6854, 'grad_norm': 0.6551569393057011, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.22}
+0:
22%|██▏ | 2140/9770 [26:53<1:24:42, 1.50it/s]
22%|██▏ | 2141/9770 [26:54<1:24:21, 1.51it/s]
22%|██▏ | 2142/9770 [26:54<1:23:40, 1.52it/s]
22%|██▏ | 2143/9770 [26:55<1:23:39, 1.52it/s]
22%|██▏ | 2144/9770 [26:56<1:23:03, 1.53it/s]
22%|██▏ | 2145/9770 [26:56<1:22:50, 1.53it/s]
22%|██▏ | 2146/9770 [26:57<1:23:10, 1.53it/s]
22%|██▏ | 2147/9770 [26:58<1:23:28, 1.52it/s]
22%|██▏ | 2148/9770 [26:58<1:22:57, 1.53it/s]
22%|██▏ | 2149/9770 [26:59<1:23:24, 1.52it/s]
22%|██▏ | 2150/9770 [27:00<1:23:57, 1.51it/s]
22%|██▏ | 2150/9770 [27:00<1:23:57, 1.51it/s]
22%|██▏ | 2151/9770 [27:00<1:23:32, 1.52it/s]
22%|██▏ | 2152/9770 [27:01<1:24:15, 1.51it/s]
22%|██▏ | 2153/9770 [27:02<1:24:59, 1.49it/s]
22%|██▏
+0: {'loss': 0.7044, 'grad_norm': 0.6285554727242006, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.22}
+0: | 2154/9770 [27:02<1:26:06, 1.47it/s]
22%|██▏ | 2155/9770 [27:03<1:24:45, 1.50it/s]
22%|██▏ | 2156/9770 [27:04<1:24:32, 1.50it/s]
22%|██▏ | 2157/9770 [27:04<1:24:03, 1.51it/s]
22%|██▏ | 2158/9770 [27:05<1:24:01, 1.51it/s]
22%|██▏ | 2159/9770 [27:06<1:23:33, 1.52it/s]
22%|██▏ | 2160/9770 [27:06<1:22:55, 1.53it/s]
22%|██▏ | 2160/9770 [27:06<1:22:55, 1.53it/s]
22%|██▏ | 2161/9770 [27:07<1:24:39, 1.50it/s]
22%|██▏ | 2162/9770 [27:08<1:23:46, 1.51it/s]
22%|██▏ | 2163/9770 [27:08<1:23:52, 1.51it/s]
22%|██▏ | 2164/9770 [27:09<1:24:17, 1.50it/s]
22%|██▏ | 2165/9770 [27:10<1:25:14, 1.49it/s]
22%|██▏ | 2166/9770 [27:10<1:24:24, 1.50it/s]
22%|██▏ | 2167/9770 [27:11<1:23:38, 1.51it/s]
22%|██▏ | 2168/9770 [27:12<1:25:27, 1.48it/s]
22%|██▏ | 2169/
+0: {'loss': 0.701, 'grad_norm': 0.7001124718908, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.22}
+0: {'loss': 0.6869, 'grad_norm': 0.6676667608170364, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.22}
+0: 9770 [27:12<1:24:46, 1.49it/s]
22%|██▏ | 2170/9770 [27:13<1:23:39, 1.51it/s]
22%|██▏ | 2170/9770 [27:13<1:23:39, 1.51it/s]
22%|██▏ | 2171/9770 [27:14<1:23:49, 1.51it/s]
22%|██▏ | 2172/9770 [27:14<1:24:10, 1.50it/s]
22%|██▏ | 2173/9770 [27:15<1:24:17, 1.50it/s]
22%|██▏ | 2174/9770 [27:16<1:24:03, 1.51it/s]
22%|██▏ | 2175/9770 [27:16<1:23:47, 1.51it/s]
22%|██▏ | 2176/9770 [27:17<1:23:34, 1.51it/s]
22%|██▏ | 2177/9770 [27:18<1:23:00, 1.52it/s]
22%|██▏ | 2178/9770 [27:18<1:23:42, 1.51it/s]
22%|██▏ | 2179/9770 [27:19<1:23:15, 1.52it/s]
22%|██▏ | 2180/9770 [27:20<1:22:57, 1.52it/s]
22%|██▏ | 2180/9770 [27:20<1:22:57, 1.52it/s]
22%|██▏ | 2181/9770 [27:20<1:22:49, 1.53it/s]
22%|██▏ | 2182/9770 [27:21<1:
+0: {'loss': 0.6991, 'grad_norm': 0.6496071275873664, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.22}
+0: 22:35, 1.53it/s]
22%|██▏ | 2183/9770 [27:21<1:22:45, 1.53it/s]
22%|██▏ | 2184/9770 [27:22<1:22:40, 1.53it/s]
22%|██▏ | 2185/9770 [27:23<1:22:07, 1.54it/s]
22%|██▏ | 2186/9770 [27:23<1:22:29, 1.53it/s]
22%|██▏ | 2187/9770 [27:24<1:22:28, 1.53it/s]
22%|██▏ | 2188/9770 [27:25<1:23:03, 1.52it/s]
22%|██▏ | 2189/9770 [27:25<1:23:26, 1.51it/s]
22%|██▏ | 2190/9770 [27:26<1:22:29, 1.53it/s]
22%|██▏ | 2190/9770 [27:26<1:22:29, 1.53it/s]
22%|██▏ | 2191/9770 [27:27<1:22:05, 1.54it/s]
22%|██▏ | 2192/9770 [27:27<1:22:05, 1.54it/s]
22%|██▏ | 2193/9770 [27:28<1:21:08, 1.56it/s]
22%|██▏ | 2194/9770 [27:29<1:21:58, 1.54it/s]
22%|██▏ | 2195/9770 [27:29<1:24:10, 1.50it/s]
22%|██▏ | 2196/9770 [27:30<1:24:18, 1.50it/s]
22%|██▏ | 2197/9770 [27:31<1:24:07, 1
+0: {'loss': 0.7162, 'grad_norm': 0.6612981904173098, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.23}
+0: {'loss': 0.7154, 'grad_norm': 0.6796044502133897, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.23}
+0: .50it/s]
22%|██▏ | 2198/9770 [27:31<1:24:14, 1.50it/s]
23%|██▎ | 2199/9770 [27:32<1:25:59, 1.47it/s]
23%|██▎ | 2200/9770 [27:33<1:25:23, 1.48it/s]
23%|██▎ | 2200/9770 [27:33<1:25:23, 1.48it/s]
23%|██▎ | 2201/9770 [27:33<1:24:44, 1.49it/s]
23%|██▎ | 2202/9770 [27:34<1:23:17, 1.51it/s]
23%|██▎ | 2203/9770 [27:35<1:23:23, 1.51it/s]
23%|██▎ | 2204/9770 [27:35<1:22:43, 1.52it/s]
23%|██▎ | 2205/9770 [27:36<1:22:49, 1.52it/s]
23%|██▎ | 2206/9770 [27:37<1:22:32, 1.53it/s]
23%|██▎ | 2207/9770 [27:37<1:22:05, 1.54it/s]
23%|██▎ | 2208/9770 [27:38<1:24:11, 1.50it/s]
23%|██▎ | 2209/9770 [27:39<1:23:50, 1.50it/s]
23%|██▎ | 2210/9770 [27:39<1:23:24, 1.51it/s]
23%|██▎ | 2210/9770 [27:39<1:23:24, 1.51it/s]
23%|
+0: {'loss': 0.6913, 'grad_norm': 0.6462252491398575, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.23}
+0: ██▎ | 2211/9770 [27:40<1:23:43, 1.50it/s]
23%|██▎ | 2212/9770 [27:41<1:23:03, 1.52it/s]
23%|██▎ | 2213/9770 [27:41<1:23:30, 1.51it/s]
23%|██▎ | 2214/9770 [27:42<1:25:25, 1.47it/s]
23%|██▎ | 2215/9770 [27:43<1:24:14, 1.49it/s]
23%|██▎ | 2216/9770 [27:43<1:23:46, 1.50it/s]
23%|██▎ | 2217/9770 [27:44<1:23:06, 1.51it/s]
23%|██▎ | 2218/9770 [27:45<1:22:55, 1.52it/s]
23%|██▎ | 2219/9770 [27:45<1:22:56, 1.52it/s]
23%|██▎ | 2220/9770 [27:46<1:22:21, 1.53it/s]
23%|██▎ | 2220/9770 [27:46<1:22:21, 1.53it/s]
23%|██▎ | 2221/9770 [27:47<1:22:53, 1.52it/s]
23%|██▎ | 2222/9770 [27:47<1:22:45, 1.52it/s]
23%|██▎ | 2223/9770 [27:48<1:23:22, 1.51it/s]
23%|██▎ | 2224/9770 [27:49<1:21:52, 1.54it/s]
23%|██▎ | 2225/9770 [27:49<1:21:14, 1.55it/s]
23%|██▎
+0: {'loss': 0.6924, 'grad_norm': 0.6051102836694884, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.23}
+0: | 2226/9770 [27:50<1:21:09, 1.55it/s]
23%|██▎ | 2227/9770 [27:50<1:21:19, 1.55it/s]
23%|██▎ | 2228/9770 [27:51<1:21:51, 1.54it/s]
23%|██▎ | 2229/9770 [27:52<1:23:26, 1.51it/s]
23%|██▎ | 2230/9770 [27:52<1:22:49, 1.52it/s]
23%|██▎ | 2230/9770 [27:52<1:22:49, 1.52it/s]
23%|██▎ | 2231/9770 [27:53<1:23:14, 1.51it/s]
23%|██▎ | 2232/9770 [27:54<1:23:29, 1.50it/s]
23%|██▎ | 2233/9770 [27:54<1:22:47, 1.52it/s]
23%|██▎ | 2234/9770 [27:55<1:21:53, 1.53it/s]
23%|██▎ | 2235/9770 [27:56<1:21:56, 1.53it/s]
23%|██▎ | 2236/9770 [27:56<1:23:17, 1.51it/s]
23%|██▎ | 2237/9770 [27:57<1:22:38, 1.52it/s]
23%|██▎ | 2238/9770 [27:58<1:22:50, 1.52it/s]
23%|██▎ | 2239/9770 [27:58<1:24:35, 1.48it/s]
23%|██▎ | 2240/9770 [27:59<1:23:21, 1.51it/s]
+0: {'loss': 0.7015, 'grad_norm': 0.6365712298187204, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.23}
+0: {'loss': 0.7024, 'grad_norm': 0.681782965572124, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.23}
+0:
23%|██▎ | 2240/9770 [27:59<1:23:21, 1.51it/s]
23%|██▎ | 2241/9770 [28:00<1:23:18, 1.51it/s]
23%|██▎ | 2242/9770 [28:00<1:23:04, 1.51it/s]
23%|██▎ | 2243/9770 [28:01<1:22:24, 1.52it/s]
23%|██▎ | 2244/9770 [28:02<1:22:32, 1.52it/s]
23%|██▎ | 2245/9770 [28:02<1:22:32, 1.52it/s]
23%|██▎ | 2246/9770 [28:03<1:23:31, 1.50it/s]
23%|██��� | 2247/9770 [28:04<1:22:45, 1.52it/s]
23%|██▎ | 2248/9770 [28:05<1:30:12, 1.39it/s]
23%|██▎ | 2249/9770 [28:05<1:27:30, 1.43it/s]
23%|██▎ | 2250/9770 [28:06<1:25:41, 1.46it/s]
23%|██▎ | 2250/9770 [28:06<1:25:41, 1.46it/s]
23%|██▎ | 2251/9770 [28:07<1:25:31, 1.47it/s]
23%|██▎ | 2252/9770 [28:07<1:23:49, 1.49it/s]
23%|██▎ | 2253/9770 [28:08<1:24:02, 1.49it/s]
23%|██▎ | 2254/9770 [28:
+0: {'loss': 0.6898, 'grad_norm': 0.6831612937779107, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.23}
+0: 09<1:23:35, 1.50it/s]
23%|██▎ | 2255/9770 [28:09<1:23:25, 1.50it/s]
23%|██▎ | 2256/9770 [28:10<1:25:01, 1.47it/s]
23%|██▎ | 2257/9770 [28:11<1:24:20, 1.48it/s]
23%|██▎ | 2258/9770 [28:11<1:23:23, 1.50it/s]
23%|██▎ | 2259/9770 [28:12<1:23:28, 1.50it/s]
23%|██▎ | 2260/9770 [28:13<1:23:36, 1.50it/s]
23%|██▎ | 2260/9770 [28:13<1:23:36, 1.50it/s]
23%|██▎ | 2261/9770 [28:13<1:23:14, 1.50it/s]
23%|██▎ | 2262/9770 [28:14<1:23:08, 1.50it/s]
23%|██▎ | 2263/9770 [28:15<1:22:55, 1.51it/s]
23%|██▎ | 2264/9770 [28:15<1:23:13, 1.50it/s]
23%|██▎ | 2265/9770 [28:16<1:22:38, 1.51it/s]
23%|██▎ | 2266/9770 [28:17<1:23:03, 1.51it/s]
23%|██▎ | 2267/9770 [28:17<1:22:53, 1.51it/s]
23%|██▎ | 2268/9770 [28:18<1:22:45, 1.51it/s]
23%|██▎ | 2269/9770 [28:18<1:23:0
+0: {'loss': 0.6938, 'grad_norm': 0.6407106322264341, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.23}
+0: {'loss': 0.6995, 'grad_norm': 0.6441926385283238, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.23}
+0: 0, 1.51it/s]
23%|██▎ | 2270/9770 [28:19<1:22:02, 1.52it/s]
23%|██▎ | 2270/9770 [28:19<1:22:02, 1.52it/s]
23%|██▎ | 2271/9770 [28:20<1:21:59, 1.52it/s]
23%|██▎ | 2272/9770 [28:20<1:22:01, 1.52it/s]
23%|██▎ | 2273/9770 [28:21<1:21:59, 1.52it/s]
23%|██▎ | 2274/9770 [28:22<1:22:02, 1.52it/s]
23%|██▎ | 2275/9770 [28:22<1:21:48, 1.53it/s]
23%|██▎ | 2276/9770 [28:23<1:21:30, 1.53it/s]
23%|██▎ | 2277/9770 [28:24<1:22:17, 1.52it/s]
23%|██▎ | 2278/9770 [28:24<1:21:56, 1.52it/s]
23%|██▎ | 2279/9770 [28:25<1:21:50, 1.53it/s]
23%|██▎ | 2280/9770 [28:26<1:22:36, 1.51it/s]
23%|██▎ | 2280/9770 [28:26<1:22:36, 1.51it/s]
23%|██▎ | 2281/9770 [28:26<1:21:51, 1.52it/s]
23%|██▎ | 2282/9770 [28:27<1:21:06, 1.54it/s]
+0: {'loss': 0.6953, 'grad_norm': 0.6621951656012156, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.23}
+0: 23%|██▎ | 2283/9770 [28:28<1:21:24, 1.53it/s]
23%|██▎ | 2284/9770 [28:28<1:21:18, 1.53it/s]
23%|██▎ | 2285/9770 [28:29<1:21:41, 1.53it/s]
23%|██▎ | 2286/9770 [28:30<1:20:25, 1.55it/s]
23%|██▎ | 2287/9770 [28:30<1:20:23, 1.55it/s]
23%|██▎ | 2288/9770 [28:31<1:21:09, 1.54it/s]
23%|██▎ | 2289/9770 [28:32<1:21:50, 1.52it/s]
23%|██▎ | 2290/9770 [28:32<1:21:19, 1.53it/s]
23%|██▎ | 2290/9770 [28:32<1:21:19, 1.53it/s]
23%|██▎ | 2291/9770 [28:33<1:21:34, 1.53it/s]
23%|██▎ | 2292/9770 [28:34<1:22:38, 1.51it/s]
23%|██▎ | 2293/9770 [28:34<1:22:40, 1.51it/s]
23%|██▎ | 2294/9770 [28:35<1:22:13, 1.52it/s]
23%|██▎ | 2295/9770 [28:35<1:21:32, 1.53it/s]
24%|██▎ | 2296/9770 [28:36<1:22:07, 1.52it/s]
24%|██▎ | 2297/9770 [28:37<1:21:02, 1.54it/s]
24%|█�
+0: {'loss': 0.7054, 'grad_norm': 0.7113487213728596, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.24}
+0: {'loss': 0.7023, 'grad_norm': 0.6298331863248442, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.24}
+0: ��▎ | 2298/9770 [28:37<1:22:18, 1.51it/s]
24%|██▎ | 2299/9770 [28:38<1:21:51, 1.52it/s]
24%|██▎ | 2300/9770 [28:39<1:21:52, 1.52it/s]
24%|██▎ | 2300/9770 [28:39<1:21:52, 1.52it/s]
24%|██▎ | 2301/9770 [28:39<1:21:25, 1.53it/s]
24%|██▎ | 2302/9770 [28:40<1:22:12, 1.51it/s]
24%|██▎ | 2303/9770 [28:41<1:20:36, 1.54it/s]
24%|██▎ | 2304/9770 [28:41<1:22:29, 1.51it/s]
24%|██▎ | 2305/9770 [28:42<1:24:00, 1.48it/s]
24%|██▎ | 2306/9770 [28:43<1:22:57, 1.50it/s]
24%|██▎ | 2307/9770 [28:43<1:21:41, 1.52it/s]
24%|██▎ | 2308/9770 [28:44<1:22:16, 1.51it/s]
24%|██▎ | 2309/9770 [28:45<1:22:40, 1.50it/s]
24%|██▎ | 2310/9770 [28:45<1:22:45, 1.50it/s]
24%|██▎ | 2310/9770 [28:45<1:22:45, 1.50it/s]
24%|██▎ |
+0: {'loss': 0.704, 'grad_norm': 0.6170509402541738, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.24}
+0: 2311/9770 [28:46<1:22:46, 1.50it/s]
24%|██▎ | 2312/9770 [28:47<1:22:06, 1.51it/s]
24%|██▎ | 2313/9770 [28:47<1:22:27, 1.51it/s]
24%|██▎ | 2314/9770 [28:48<1:22:50, 1.50it/s]
24%|██▎ | 2315/9770 [28:49<1:21:43, 1.52it/s]
24%|██▎ | 2316/9770 [28:49<1:22:31, 1.51it/s]
24%|██▎ | 2317/9770 [28:50<1:22:12, 1.51it/s]
24%|██▎ | 2318/9770 [28:51<1:22:30, 1.51it/s]
24%|██▎ | 2319/9770 [28:51<1:22:32, 1.50it/s]
24%|██▎ | 2320/9770 [28:52<1:22:10, 1.51it/s]
24%|██▎ | 2320/9770 [28:52<1:22:10, 1.51it/s]
24%|██▍ | 2321/9770 [28:53<1:23:17, 1.49it/s]
24%|██▍ | 2322/9770 [28:53<1:22:59, 1.50it/s]
24%|██▍ | 2323/9770 [28:54<1:21:49, 1.52it/s]
24%|██▍ | 2324/9770 [28:55<1:22:35, 1.50it/s]
24%|██▍ | 2325/9770 [28:55<1:24:04, 1.48it/s]
24%|██▍ | 2326/9770
+0: {'loss': 0.6919, 'grad_norm': 0.6560002134611792, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.24}
+0: [28:56<1:23:35, 1.48it/s]
24%|██▍ | 2327/9770 [28:57<1:23:11, 1.49it/s]
24%|██▍ | 2328/9770 [28:57<1:23:01, 1.49it/s]
24%|██▍ | 2329/9770 [28:58<1:22:07, 1.51it/s]
24%|██▍ | 2330/9770 [28:59<1:21:51, 1.51it/s]
24%|██▍ | 2330/9770 [28:59<1:21:51, 1.51it/s]
24%|██▍ | 2331/9770 [28:59<1:21:17, 1.53it/s]
24%|██▍ | 2332/9770 [29:00<1:20:39, 1.54it/s]
24%|██▍ | 2333/9770 [29:01<1:20:13, 1.55it/s]
24%|██▍ | 2334/9770 [29:01<1:19:31, 1.56it/s]
24%|██▍ | 2335/9770 [29:02<1:20:00, 1.55it/s]
24%|██▍ | 2336/9770 [29:03<1:20:01, 1.55it/s]
24%|██▍ | 2337/9770 [29:03<1:19:01, 1.57it/s]
24%|██▍ | 2338/9770 [29:04<1:18:48, 1.57it/s]
24%|██▍ | 2339/9770 [29:04<1:18:45, 1.57it/s]
24%|██▍ | 2340/9770 [29:05<1:18:36, 1.58it/s]
+0: {'loss': 0.6752, 'grad_norm': 0.7033228390739535, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.24}
+0: {'loss': 0.6889, 'grad_norm': 0.6453095540665097, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.24}
+0:
24%|██▍ | 2340/9770 [29:05<1:18:36, 1.58it/s]
24%|██▍ | 2341/9770 [29:06<1:20:11, 1.54it/s]
24%|██▍ | 2342/9770 [29:06<1:20:17, 1.54it/s]
24%|██▍ | 2343/9770 [29:07<1:20:11, 1.54it/s]
24%|██▍ | 2344/9770 [29:08<1:20:41, 1.53it/s]
24%|██▍ | 2345/9770 [29:08<1:20:26, 1.54it/s]
24%|██▍ | 2346/9770 [29:09<1:21:13, 1.52it/s]
24%|██▍ | 2347/9770 [29:10<1:21:01, 1.53it/s]
24%|██▍ | 2348/9770 [29:10<1:20:28, 1.54it/s]
24%|██▍ | 2349/9770 [29:11<1:21:03, 1.53it/s]
24%|██▍ | 2350/9770 [29:12<1:21:09, 1.52it/s]
24%|██▍ | 2350/9770 [29:12<1:21:09, 1.52it/s]
24%|██▍ | 2351/9770 [29:12<1:21:01, 1.53it/s]
24%|██▍ | 2352/9770 [29:13<1:20:34, 1.53it/s]
24%|██▍ | 2353/9770 [29:14<1:20:57, 1.53it/s]
24%|██▍ | 2354/9770 [29:14<1:21:19, 1.52i
+0: {'loss': 0.7065, 'grad_norm': 0.6661320101544158, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.24}
+0: t/s]
24%|██▍ | 2355/9770 [29:15<1:21:36, 1.51it/s]
24%|██▍ | 2356/9770 [29:16<1:22:25, 1.50it/s]
24%|██▍ | 2357/9770 [29:16<1:21:50, 1.51it/s]
24%|██▍ | 2358/9770 [29:17<1:22:01, 1.51it/s]
24%|██▍ | 2359/9770 [29:18<1:21:59, 1.51it/s]
24%|██▍ | 2360/9770 [29:18<1:20:35, 1.53it/s]
24%|██▍ | 2360/9770 [29:18<1:20:35, 1.53it/s]
24%|██▍ | 2361/9770 [29:19<1:20:51, 1.53it/s]
24%|██▍ | 2362/9770 [29:20<1:21:33, 1.51it/s]
24%|██▍ | 2363/9770 [29:20<1:21:36, 1.51it/s]
24%|██▍ | 2364/9770 [29:21<1:22:18, 1.50it/s]
24%|██▍ | 2365/9770 [29:22<1:21:00, 1.52it/s]
24%|██▍ | 2366/9770 [29:22<1:21:46, 1.51it/s]
24%|██▍ | 2367/9770 [29:23<1:21:22, 1.52it/s]
24%|██▍ | 2368/9770 [29:24<1:21:18, 1.52it/s]
24%|██▍ | 2369/9770 [29:24<1:21:36, 1.51it/s]
24%
+0: {'loss': 0.6995, 'grad_norm': 0.7447715527305673, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.24}
+0: {'loss': 0.7155, 'grad_norm': 0.7095958786007863, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.24}
+0: |██▍ | 2370/9770 [29:25<1:22:23, 1.50it/s]
24%|██▍ | 2370/9770 [29:25<1:22:23, 1.50it/s]
24%|██▍ | 2371/9770 [29:26<1:22:01, 1.50it/s]
24%|██▍ | 2372/9770 [29:26<1:21:25, 1.51it/s]
24%|██▍ | 2373/9770 [29:27<1:29:03, 1.38it/s]
24%|██▍ | 2374/9770 [29:28<1:26:29, 1.43it/s]
24%|██▍ | 2375/9770 [29:28<1:24:22, 1.46it/s]
24%|██▍ | 2376/9770 [29:29<1:25:06, 1.45it/s]
24%|██▍ | 2377/9770 [29:30<1:24:17, 1.46it/s]
24%|██▍ | 2378/9770 [29:30<1:23:12, 1.48it/s]
24%|██▍ | 2379/9770 [29:31<1:22:38, 1.49it/s]
24%|██▍ | 2380/9770 [29:32<1:23:00, 1.48it/s]
24%|██▍ | 2380/9770 [29:32<1:23:00, 1.48it/s]
24%|██▍ | 2381/9770 [29:32<1:22:18, 1.50it/s]
24%|██▍ | 2382/9770 [29:33<1:21:20, 1.51it/s]
24%|██▍
+0: {'loss': 0.6986, 'grad_norm': 0.6708710026563455, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.24}
+0: | 2383/9770 [29:34<1:20:44, 1.52it/s]
24%|██▍ | 2384/9770 [29:34<1:20:26, 1.53it/s]
24%|██▍ | 2385/9770 [29:35<1:20:26, 1.53it/s]
24%|██▍ | 2386/9770 [29:36<1:20:20, 1.53it/s]
24%|██▍ | 2387/9770 [29:36<1:19:22, 1.55it/s]
24%|██▍ | 2388/9770 [29:37<1:20:12, 1.53it/s]
24%|██▍ | 2389/9770 [29:38<1:19:23, 1.55it/s]
24%|██▍ | 2390/9770 [29:38<1:21:45, 1.50it/s]
24%|██▍ | 2390/9770 [29:38<1:21:45, 1.50it/s]
24%|██▍ | 2391/9770 [29:39<1:21:32, 1.51it/s]
24%|██▍ | 2392/9770 [29:40<1:21:13, 1.51it/s]
24%|██▍ | 2393/9770 [29:40<1:21:21, 1.51it/s]
25%|██▍ | 2394/9770 [29:41<1:20:56, 1.52it/s]
25%|██▍ | 2395/9770 [29:42<1:20:41, 1.52it/s]
25%|██▍ | 2396/9770 [29:42<1:20:41, 1.52it/s]
25%|██▍ | 2397/9770 [29:43<1:20:59, 1.52it/s]
25%|██▍ | 2398
+0: {'loss': 0.6886, 'grad_norm': 0.6958802805826247, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.25}
+0: {'loss': 0.6934, 'grad_norm': 0.6834266616612527, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.25}
+0: /9770 [29:44<1:21:06, 1.51it/s]
25%|██▍ | 2399/9770 [29:44<1:21:38, 1.50it/s]
25%|██▍ | 2400/9770 [29:45<1:21:52, 1.50it/s]
25%|██▍ | 2400/9770 [29:45<1:21:52, 1.50it/s]
25%|██▍ | 2401/9770 [29:46<1:21:40, 1.50it/s]
25%|██▍ | 2402/9770 [29:46<1:21:49, 1.50it/s]
25%|██▍ | 2403/9770 [29:47<1:21:33, 1.51it/s]
25%|██▍ | 2404/9770 [29:48<1:21:18, 1.51it/s]
25%|██▍ | 2405/9770 [29:48<1:20:06, 1.53it/s]
25%|██▍ | 2406/9770 [29:49<1:20:28, 1.52it/s]
25%|██▍ | 2407/9770 [29:49<1:20:06, 1.53it/s]
25%|██▍ | 2408/9770 [29:50<1:21:11, 1.51it/s]
25%|██▍ | 2409/9770 [29:51<1:20:59, 1.51it/s]
25%|██▍ | 2410/9770 [29:51<1:20:58, 1.51it/s]
25%|██▍ | 2410/9770 [29:51<1:20:58, 1.51it/s]
25%|██▍ | 2411/9770 [29:52<1
+0: {'loss': 0.705, 'grad_norm': 0.6605714232757514, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.25}
+0: :20:51, 1.52it/s]
25%|██▍ | 2412/9770 [29:53<1:21:38, 1.50it/s]
25%|██▍ | 2413/9770 [29:53<1:20:52, 1.52it/s]
25%|██▍ | 2414/9770 [29:54<1:20:39, 1.52it/s]
25%|██▍ | 2415/9770 [29:55<1:19:48, 1.54it/s]
25%|██▍ | 2416/9770 [29:55<1:19:44, 1.54it/s]
25%|██▍ | 2417/9770 [29:56<1:19:52, 1.53it/s]
25%|██▍ | 2418/9770 [29:57<1:22:04, 1.49it/s]
25%|██▍ | 2419/9770 [29:57<1:21:39, 1.50it/s]
25%|██▍ | 2420/9770 [29:58<1:21:09, 1.51it/s]
25%|██▍ | 2420/9770 [29:58<1:21:09, 1.51it/s]
25%|██▍ | 2421/9770 [29:59<1:21:26, 1.50it/s]
25%|██▍ | 2422/9770 [29:59<1:20:57, 1.51it/s]
25%|██▍ | 2423/9770 [30:00<1:20:44, 1.52it/s]
25%|██▍ | 2424/9770 [30:01<1:20:01, 1.53it/s]
25%|██▍ | 2425/9770 [30:01<1:20:36, 1.52it/s]
25%|██▍ | 2426/9770 [30:02<1:20:36,
+0: {'loss': 0.6885, 'grad_norm': 0.6327435562591638, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.25}
+0: {'loss': 0.6963, 'grad_norm': 0.6650247149095739, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.25}
+0: 1.52it/s]
25%|██▍ | 2427/9770 [30:03<1:20:07, 1.53it/s]
25%|██▍ | 2428/9770 [30:03<1:19:23, 1.54it/s]
25%|██▍ | 2429/9770 [30:04<1:19:53, 1.53it/s]
25%|██▍ | 2430/9770 [30:05<1:21:33, 1.50it/s]
25%|██▍ | 2430/9770 [30:05<1:21:33, 1.50it/s]
25%|██▍ | 2431/9770 [30:05<1:21:23, 1.50it/s]
25%|██▍ | 2432/9770 [30:06<1:21:25, 1.50it/s]
25%|██▍ | 2433/9770 [30:07<1:21:04, 1.51it/s]
25%|██▍ | 2434/9770 [30:07<1:20:00, 1.53it/s]
25%|██▍ | 2435/9770 [30:08<1:19:42, 1.53it/s]
25%|██▍ | 2436/9770 [30:09<1:20:21, 1.52it/s]
25%|██▍ | 2437/9770 [30:09<1:21:24, 1.50it/s]
25%|██▍ | 2438/9770 [30:10<1:20:05, 1.53it/s]
25%|██▍ | 2439/9770 [30:11<1:21:58, 1.49it/s]
25%|██▍ | 2440/9770 [30:11<1:20:49, 1.51it/s]
25%
+0: {'loss': 0.7184, 'grad_norm': 0.7083155582450582, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.25}
+0: |██▍ | 2440/9770 [30:11<1:20:49, 1.51it/s]
25%|██▍ | 2441/9770 [30:12<1:19:47, 1.53it/s]
25%|██▍ | 2442/9770 [30:13<1:20:33, 1.52it/s]
25%|██▌ | 2443/9770 [30:13<1:20:32, 1.52it/s]
25%|██▌ | 2444/9770 [30:14<1:20:35, 1.52it/s]
25%|██▌ | 2445/9770 [30:15<1:20:52, 1.51it/s]
25%|██▌ | 2446/9770 [30:15<1:21:17, 1.50it/s]
25%|██▌ | 2447/9770 [30:16<1:20:30, 1.52it/s]
25%|██▌ | 2448/9770 [30:17<1:20:01, 1.53it/s]
25%|██▌ | 2449/9770 [30:17<1:20:09, 1.52it/s]
25%|██▌ | 2450/9770 [30:18<1:21:25, 1.50it/s]
25%|██▌ | 2450/9770 [30:18<1:21:25, 1.50it/s]
25%|██▌ | 2451/9770 [30:19<1:21:22, 1.50it/s]
25%|██▌ | 2452/9770 [30:19<1:21:12, 1.50it/s]
25%|██▌ | 2453/9770 [30:20<1:20:19, 1.52it/s]
25%|██▌ | 2454/9770 [30:20<1:20:03, 1.52it/s]
25%|██�
+0: {'loss': 0.674, 'grad_norm': 0.6638480769410481, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.25}
+0: � | 2455/9770 [30:21<1:20:00, 1.52it/s]
25%|██▌ | 2456/9770 [30:22<1:19:49, 1.53it/s]
25%|██▌ | 2457/9770 [30:22<1:20:55, 1.51it/s]
25%|██▌ | 2458/9770 [30:23<1:20:40, 1.51it/s]
25%|██▌ | 2459/9770 [30:24<1:20:30, 1.51it/s]
25%|██▌ | 2460/9770 [30:24<1:21:33, 1.49it/s]
25%|██▌ | 2460/9770 [30:24<1:21:33, 1.49it/s]
25%|██▌ | 2461/9770 [30:25<1:20:58, 1.50it/s]
25%|██▌ | 2462/9770 [30:26<1:21:58, 1.49it/s]
25%|██▌ | 2463/9770 [30:26<1:21:30, 1.49it/s]
25%|██▌ | 2464/9770 [30:27<1:20:36, 1.51it/s]
25%|██▌ | 2465/9770 [30:28<1:20:05, 1.52it/s]
25%|██▌ | 2466/9770 [30:28<1:19:41, 1.53it/s]
25%|██▌ | 2467/9770 [30:29<1:19:46, 1.53it/s]
25%|██▌ | 2468/9770 [30:30<1:19:33, 1.53it/s]
25%|██▌ | 2469/9770 [30:30<1:19:18, 1.53it/s]
25%|██▌ |
+0: {'loss': 0.6835, 'grad_norm': 0.6872804901742495, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.25}
+0: {'loss': 0.7102, 'grad_norm': 0.6261018581440215, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.25}
+0: 2470/9770 [30:31<1:19:11, 1.54it/s]
25%|██▌ | 2470/9770 [30:31<1:19:11, 1.54it/s]
25%|██▌ | 2471/9770 [30:32<1:18:34, 1.55it/s]
25%|██▌ | 2472/9770 [30:32<1:20:06, 1.52it/s]
25%|██▌ | 2473/9770 [30:33<1:19:34, 1.53it/s]
25%|██▌ | 2474/9770 [30:34<1:19:50, 1.52it/s]
25%|██▌ | 2475/9770 [30:34<1:19:40, 1.53it/s]
25%|██▌ | 2476/9770 [30:35<1:19:28, 1.53it/s]
25%|██▌ | 2477/9770 [30:36<1:18:38, 1.55it/s]
25%|██▌ | 2478/9770 [30:36<1:19:00, 1.54it/s]
25%|██▌ | 2479/9770 [30:37<1:18:29, 1.55it/s]
25%|██▌ | 2480/9770 [30:38<1:19:07, 1.54it/s]
25%|██▌ | 2480/9770 [30:38<1:19:07, 1.54it/s]
25%|██▌ | 2481/9770 [30:38<1:19:25, 1.53it/s]
25%|██▌ | 2482/9770 [30:39<1:19:43, 1.52it/s]
25%|██▌ | 2483/9770 [30
+0: {'loss': 0.6872, 'grad_norm': 0.6594941981971951, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.25}
+0: :40<1:19:56, 1.52it/s]
25%|██▌ | 2484/9770 [30:40<1:19:47, 1.52it/s]
25%|██▌ | 2485/9770 [30:41<1:19:43, 1.52it/s]
25%|██▌ | 2486/9770 [30:41<1:19:05, 1.53it/s]
25%|██▌ | 2487/9770 [30:42<1:18:39, 1.54it/s]
25%|██▌ | 2488/9770 [30:43<1:19:35, 1.52it/s]
25%|██▌ | 2489/9770 [30:43<1:19:52, 1.52it/s]
25%|██▌ | 2490/9770 [30:44<1:21:02, 1.50it/s]
25%|██▌ | 2490/9770 [30:44<1:21:02, 1.50it/s]
25%|██▌ | 2491/9770 [30:45<1:21:00, 1.50it/s]
26%|██▌ | 2492/9770 [30:45<1:20:33, 1.51it/s]
26%|██▌ | 2493/9770 [30:46<1:19:06, 1.53it/s]
26%|██▌ | 2494/9770 [30:47<1:19:34, 1.52it/s]
26%|██▌ | 2495/9770 [30:47<1:20:36, 1.50it/s]
26%|██▌ | 2496/9770 [30:48<1:20:10, 1.51it/s]
26%|██▌ | 2497/9770 [30:49<1:19:48, 1.52it/s]
26%|██▌ | 2498/9770 [30:49<1:19:
+0: {'loss': 0.6848, 'grad_norm': 0.6579965013718169, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.26}
+0: {'loss': 0.7132, 'grad_norm': 0.6706785406272323, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.26}
+0: 44, 1.52it/s]
26%|██▌ | 2499/9770 [30:50<1:20:34, 1.50it/s]
26%|██▌ | 2500/9770 [30:51<1:20:14, 1.51it/s]
26%|██▌ | 2500/9770 [30:51<1:20:14, 1.51it/s]
26%|██▌ | 2501/9770 [30:51<1:19:48, 1.52it/s]
26%|██▌ | 2502/9770 [30:52<1:20:03, 1.51it/s]
26%|██▌ | 2503/9770 [30:53<1:19:27, 1.52it/s]
26%|██▌ | 2504/9770 [30:53<1:19:28, 1.52it/s]
26%|██▌ | 2505/9770 [30:54<1:19:45, 1.52it/s]
26%|██▌ | 2506/9770 [30:55<1:19:42, 1.52it/s]
26%|██▌ | 2507/9770 [30:55<1:20:04, 1.51it/s]
26%|██▌ | 2508/9770 [30:56<1:20:15, 1.51it/s]
26%|██▌ | 2509/9770 [30:57<1:19:38, 1.52it/s]
26%|██▌ | 2510/9770 [30:57<1:19:40, 1.52it/s]
26%|██▌ | 2510/9770 [30:57<1:19:40, 1.52it/s]
26%|██▌ | 2511/9770 [30:58<1:19:21, 1.52it/s]
+0: {'loss': 0.695, 'grad_norm': 0.6819231660488712, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.26}
+0:
26%|██▌ | 2512/9770 [30:59<1:18:48, 1.54it/s]
26%|██▌ | 2513/9770 [30:59<1:17:57, 1.55it/s]
26%|██▌ | 2514/9770 [31:00<1:18:31, 1.54it/s]
26%|██▌ | 2515/9770 [31:01<1:18:49, 1.53it/s]
26%|██▌ | 2516/9770 [31:01<1:19:06, 1.53it/s]
26%|██▌ | 2517/9770 [31:02<1:18:42, 1.54it/s]
26%|██▌ | 2518/9770 [31:03<1:18:22, 1.54it/s]
26%|██▌ | 2519/9770 [31:03<1:18:40, 1.54it/s]
26%|██▌ | 2520/9770 [31:04<1:19:28, 1.52it/s]
26%|██▌ | 2520/9770 [31:04<1:19:28, 1.52it/s]
26%|██▌ | 2521/9770 [31:04<1:18:54, 1.53it/s]
26%|██▌ | 2522/9770 [31:05<1:19:28, 1.52it/s]
26%|██▌ | 2523/9770 [31:06<1:18:36, 1.54it/s]
26%|██▌ | 2524/9770 [31:06<1:18:34, 1.54it/s]
26%|██▌ | 2525/9770 [31:07<1:18:13, 1.54it/s]
26%|██▌ | 2526/9770 [31:08<1:17:38, 1.56it/s]
26%|█
+0: {'loss': 0.7041, 'grad_norm': 0.6260161993010644, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.26}
+0: {'loss': 0.6826, 'grad_norm': 0.6721983154332225, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.26}
+0: █▌ | 2527/9770 [31:08<1:17:48, 1.55it/s]
26%|██▌ | 2528/9770 [31:09<1:18:09, 1.54it/s]
26%|██▌ | 2529/9770 [31:10<1:18:25, 1.54it/s]
26%|██▌ | 2530/9770 [31:10<1:19:07, 1.52it/s]
26%|██▌ | 2530/9770 [31:10<1:19:07, 1.52it/s]
26%|██▌ | 2531/9770 [31:11<1:20:47, 1.49it/s]
26%|██▌ | 2532/9770 [31:12<1:21:04, 1.49it/s]
26%|██▌ | 2533/9770 [31:12<1:22:23, 1.46it/s]
26%|██▌ | 2534/9770 [31:13<1:20:35, 1.50it/s]
26%|██▌ | 2535/9770 [31:14<1:19:57, 1.51it/s]
26%|██▌ | 2536/9770 [31:14<1:19:27, 1.52it/s]
26%|██▌ | 2537/9770 [31:15<1:18:50, 1.53it/s]
26%|██▌ | 2538/9770 [31:16<1:19:23, 1.52it/s]
26%|██▌ | 2539/9770 [31:16<1:19:27, 1.52it/s]
26%|██▌ | 2540/9770 [31:17<1:19:21, 1.52it/s]
26%|██▌ |
+0: {'loss': 0.683, 'grad_norm': 0.632845371003001, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.26}
+0: 2540/9770 [31:17<1:19:21, 1.52it/s]
26%|██▌ | 2541/9770 [31:18<1:19:22, 1.52it/s]
26%|██▌ | 2542/9770 [31:18<1:19:18, 1.52it/s]
26%|██▌ | 2543/9770 [31:19<1:19:59, 1.51it/s]
26%|██▌ | 2544/9770 [31:20<1:18:38, 1.53it/s]
26%|██▌ | 2545/9770 [31:20<1:18:52, 1.53it/s]
26%|██▌ | 2546/9770 [31:21<1:18:44, 1.53it/s]
26%|██▌ | 2547/9770 [31:22<1:18:28, 1.53it/s]
26%|██▌ | 2548/9770 [31:22<1:19:26, 1.52it/s]
26%|██▌ | 2549/9770 [31:23<1:19:42, 1.51it/s]
26%|██▌ | 2550/9770 [31:24<1:19:32, 1.51it/s]
26%|██▌ | 2550/9770 [31:24<1:19:32, 1.51it/s]
26%|██▌ | 2551/9770 [31:24<1:20:01, 1.50it/s]
26%|██▌ | 2552/9770 [31:25<1:19:53, 1.51it/s]
26%|██▌ | 2553/9770 [31:26<1:18:49, 1.53it/s]
26%|██▌ | 2554/9770 [31:26<1:19:31, 1.51it/s]
26%|██▌ | 2555/977
+0: {'loss': 0.7043, 'grad_norm': 0.6468594271100815, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.26}
+0: 0 [31:27<1:19:14, 1.52it/s]
26%|██▌ | 2556/9770 [31:28<1:19:03, 1.52it/s]
26%|██▌ | 2557/9770 [31:28<1:19:04, 1.52it/s]
26%|██▌ | 2558/9770 [31:29<1:20:48, 1.49it/s]
26%|██▌ | 2559/9770 [31:30<1:20:25, 1.49it/s]
26%|██▌ | 2560/9770 [31:30<1:20:00, 1.50it/s]
26%|██▌ | 2560/9770 [31:30<1:20:00, 1.50it/s]
26%|██▌ | 2561/9770 [31:31<1:20:02, 1.50it/s]
26%|██▌ | 2562/9770 [31:32<1:19:51, 1.50it/s]
26%|██▌ | 2563/9770 [31:32<1:19:20, 1.51it/s]
26%|██▌ | 2564/9770 [31:33<1:19:23, 1.51it/s]
26%|██▋ | 2565/9770 [31:34<1:19:10, 1.52it/s]
26%|██▋ | 2566/9770 [31:34<1:19:05, 1.52it/s]
26%|██▋ | 2567/9770 [31:35<1:20:53, 1.48it/s]
26%|██▋ | 2568/9770 [31:36<1:19:53, 1.50it/s]
26%|██▋ | 2569/9770 [31:36<1:20:03, 1.50it/s]
26%|██▋ | 2570/9770 [31:37<
+0: {'loss': 0.699, 'grad_norm': 0.6503548334613835, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.26}
+0: {'loss': 0.6898, 'grad_norm': 0.6850959775677724, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.26}
+0: 1:20:19, 1.49it/s]
26%|██▋ | 2570/9770 [31:37<1:20:19, 1.49it/s]
26%|██▋ | 2571/9770 [31:38<1:20:24, 1.49it/s]
26%|██▋ | 2572/9770 [31:38<1:19:50, 1.50it/s]
26%|██▋ | 2573/9770 [31:39<1:19:31, 1.51it/s]
26%|██▋ | 2574/9770 [31:40<1:19:07, 1.52it/s]
26%|██▋ | 2575/9770 [31:40<1:20:53, 1.48it/s]
26%|██▋ | 2576/9770 [31:41<1:18:58, 1.52it/s]
26%|██▋ | 2577/9770 [31:41<1:18:27, 1.53it/s]
26%|██▋ | 2578/9770 [31:42<1:18:40, 1.52it/s]
26%|██▋ | 2579/9770 [31:43<1:18:46, 1.52it/s]
26%|██▋ | 2580/9770 [31:43<1:19:24, 1.51it/s]
26%|██▋ | 2580/9770 [31:43<1:19:24, 1.51it/s]
26%|██▋ | 2581/9770 [31:44<1:19:37, 1.50it/s]
26%|██▋ | 2582/9770 [31:45<1:18:55, 1.52it/s]
26%|██▋ | 2583/9770 [31:45<1:18:36, 1.52
+0: {'loss': 0.6983, 'grad_norm': 0.7601607684741332, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.26}
+0: it/s]
26%|██▋ | 2584/9770 [31:46<1:19:03, 1.51it/s]
26%|██▋ | 2585/9770 [31:47<1:19:09, 1.51it/s]
26%|██▋ | 2586/9770 [31:47<1:19:40, 1.50it/s]
26%|██▋ | 2587/9770 [31:48<1:18:29, 1.53it/s]
26%|██▋ | 2588/9770 [31:49<1:18:47, 1.52it/s]
26%|██▋ | 2589/9770 [31:49<1:18:35, 1.52it/s]
27%|██▋ | 2590/9770 [31:50<1:18:31, 1.52it/s]
27%|██▋ | 2590/9770 [31:50<1:18:31, 1.52it/s]
27%|██▋ | 2591/9770 [31:51<1:20:16, 1.49it/s]
27%|██▋ | 2592/9770 [31:51<1:19:35, 1.50it/s]
27%|██▋ | 2593/9770 [31:52<1:19:13, 1.51it/s]
27%|██▋ | 2594/9770 [31:53<1:17:53, 1.54it/s]
27%|██▋ | 2595/9770 [31:53<1:17:58, 1.53it/s]
27%|██▋ | 2596/9770 [31:54<1:19:03, 1.51it/s]
27%|██▋ | 2597/9770 [31:55<1:19:32, 1.50it/s]
27%|██▋ | 2598/9770 [31:55<1:19:46, 1.50it/s]
27
+0: {'loss': 0.7158, 'grad_norm': 0.714844249648634, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.27}
+0: {'loss': 0.6902, 'grad_norm': 0.6566247528951694, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.27}
+0: %|██▋ | 2599/9770 [31:56<1:18:36, 1.52it/s]
27%|██▋ | 2600/9770 [31:57<1:18:45, 1.52it/s]
27%|██▋ | 2600/9770 [31:57<1:18:45, 1.52it/s]
27%|██▋ | 2601/9770 [31:57<1:19:21, 1.51it/s]
27%|██▋ | 2602/9770 [31:58<1:19:29, 1.50it/s]
27%|██▋ | 2603/9770 [31:59<1:19:32, 1.50it/s]
27%|██▋ | 2604/9770 [31:59<1:19:04, 1.51it/s]
27%|██▋ | 2605/9770 [32:00<1:19:11, 1.51it/s]
27%|██▋ | 2606/9770 [32:01<1:19:32, 1.50it/s]
27%|██▋ | 2607/9770 [32:01<1:19:16, 1.51it/s]
27%|██▋ | 2608/9770 [32:02<1:18:26, 1.52it/s]
27%|██▋ | 2609/9770 [32:03<1:18:21, 1.52it/s]
27%|██▋ | 2610/9770 [32:03<1:18:49, 1.51it/s]
27%|██▋ | 2610/9770 [32:03<1:18:49, 1.51it/s]
27%|██▋ | 2611/9770 [32:04<1:18:40, 1.52it/s]
27%|██▋
+0: {'loss': 0.6799, 'grad_norm': 0.6818533156172748, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.27}
+0: | 2612/9770 [32:05<1:19:00, 1.51it/s]
27%|██▋ | 2613/9770 [32:05<1:19:07, 1.51it/s]
27%|██▋ | 2614/9770 [32:06<1:17:56, 1.53it/s]
27%|██▋ | 2615/9770 [32:07<1:18:00, 1.53it/s]
27%|██▋ | 2616/9770 [32:07<1:18:37, 1.52it/s]
27%|██▋ | 2617/9770 [32:08<1:18:49, 1.51it/s]
27%|██▋ | 2618/9770 [32:09<1:19:01, 1.51it/s]
27%|██▋ | 2619/9770 [32:09<1:20:14, 1.49it/s]
27%|██▋ | 2620/9770 [32:10<1:19:01, 1.51it/s]
27%|██▋ | 2620/9770 [32:10<1:19:01, 1.51it/s]
27%|██▋ | 2621/9770 [32:11<1:20:23, 1.48it/s]
27%|██▋ | 2622/9770 [32:11<1:19:26, 1.50it/s]
27%|██▋ | 2623/9770 [32:12<1:18:31, 1.52it/s]
27%|██▋ | 2624/9770 [32:13<1:18:18, 1.52it/s]
27%|██▋ | 2625/9770 [32:13<1:19:19, 1.50it/s]
27%|██▋ | 2626/9770 [32:14<1:18:34, 1.52it/s]
27%|██▋ | 262
+0: {'loss': 0.68, 'grad_norm': 0.6587684863284928, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.27}
+0: {'loss': 0.705, 'grad_norm': 0.6693220598241055, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.27}
+0: 7/9770 [32:15<1:17:57, 1.53it/s]
27%|██▋ | 2628/9770 [32:15<1:18:21, 1.52it/s]
27%|██▋ | 2629/9770 [32:16<1:17:47, 1.53it/s]
27%|██▋ | 2630/9770 [32:17<1:18:07, 1.52it/s]
27%|██▋ | 2630/9770 [32:17<1:18:07, 1.52it/s]
27%|██▋ | 2631/9770 [32:17<1:18:55, 1.51it/s]
27%|██▋ | 2632/9770 [32:18<1:20:31, 1.48it/s]
27%|██▋ | 2633/9770 [32:19<1:19:22, 1.50it/s]
27%|██▋ | 2634/9770 [32:19<1:18:27, 1.52it/s]
27%|██▋ | 2635/9770 [32:20<1:17:58, 1.53it/s]
27%|██▋ | 2636/9770 [32:21<1:18:13, 1.52it/s]
27%|██▋ | 2637/9770 [32:21<1:17:14, 1.54it/s]
27%|██▋ | 2638/9770 [32:22<1:17:44, 1.53it/s]
27%|██▋ | 2639/9770 [32:22<1:17:31, 1.53it/s]
27%|██▋ | 2640/9770 [32:23<1:18:38, 1.51it/s]
27%|██▋ | 2640/9770 [32:23<
+0: {'loss': 0.7191, 'grad_norm': 0.6404089490596173, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.27}
+0: 1:18:38, 1.51it/s]
27%|██▋ | 2641/9770 [32:24<1:18:38, 1.51it/s]
27%|██▋ | 2642/9770 [32:24<1:18:45, 1.51it/s]
27%|██▋ | 2643/9770 [32:25<1:18:01, 1.52it/s]
27%|██▋ | 2644/9770 [32:26<1:19:39, 1.49it/s]
27%|██▋ | 2645/9770 [32:26<1:18:47, 1.51it/s]
27%|██▋ | 2646/9770 [32:27<1:17:53, 1.52it/s]
27%|██▋ | 2647/9770 [32:28<1:18:12, 1.52it/s]
27%|██▋ | 2648/9770 [32:28<1:17:53, 1.52it/s]
27%|██▋ | 2649/9770 [32:29<1:17:28, 1.53it/s]
27%|██▋ | 2650/9770 [32:30<1:17:17, 1.54it/s]
27%|██▋ | 2650/9770 [32:30<1:17:17, 1.54it/s]
27%|██▋ | 2651/9770 [32:30<1:17:35, 1.53it/s]
27%|██▋ | 2652/9770 [32:31<1:16:32, 1.55it/s]
27%|██▋ | 2653/9770 [32:32<1:15:56, 1.56it/s]
27%|██▋ | 2654/9770 [32:32<1:16:35, 1.55it/s]
27%|██▋ | 2655/9770 [32:33<1:17:05,
+0: {'loss': 0.7207, 'grad_norm': 0.6434468794401105, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.27}
+0: 1.54it/s]
27%|██▋ | 2656/9770 [32:34<1:17:09, 1.54it/s]
27%|██▋ | 2657/9770 [32:34<1:19:09, 1.50it/s]
27%|██▋ | 2658/9770 [32:35<1:19:20, 1.49it/s]
27%|██▋ | 2659/9770 [32:36<1:19:02, 1.50it/s]
27%|██▋ | 2660/9770 [32:36<1:18:07, 1.52it/s]
27%|██▋ | 2660/9770 [32:36<1:18:07, 1.52it/s]
27%|██▋ | 2661/9770 [32:37<1:18:40, 1.51it/s]
27%|██▋ | 2662/9770 [32:38<1:18:04, 1.52it/s]
27%|██▋ | 2663/9770 [32:38<1:18:13, 1.51it/s]
27%|██▋ | 2664/9770 [32:39<1:18:19, 1.51it/s]
27%|██▋ | 2665/9770 [32:40<1:17:44, 1.52it/s]
27%|██▋ | 2666/9770 [32:40<1:17:18, 1.53it/s]
27%|██▋ | 2667/9770 [32:41<1:17:17, 1.53it/s]
27%|██▋ | 2668/9770 [32:42<1:17:01, 1.54it/s]
27%|██▋ | 2669/9770 [32:42<1:16:52, 1.54it/s]
27%|██▋ | 2670/9770 [32:43<1:16:49, 1.54it/s
+0: {'loss': 0.6956, 'grad_norm': 0.6373079921026094, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.27}
+0: {'loss': 0.7138, 'grad_norm': 0.6716590217601842, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.27}
+0: ]
27%|██▋ | 2670/9770 [32:43<1:16:49, 1.54it/s]
27%|██▋ | 2671/9770 [32:43<1:17:00, 1.54it/s]
27%|██▋ | 2672/9770 [32:44<1:18:13, 1.51it/s]
27%|██▋ | 2673/9770 [32:45<1:17:15, 1.53it/s]
27%|██▋ | 2674/9770 [32:45<1:17:18, 1.53it/s]
27%|██▋ | 2675/9770 [32:46<1:17:44, 1.52it/s]
27%|██▋ | 2676/9770 [32:47<1:18:21, 1.51it/s]
27%|██▋ | 2677/9770 [32:47<1:18:11, 1.51it/s]
27%|██▋ | 2678/9770 [32:48<1:17:27, 1.53it/s]
27%|██▋ | 2679/9770 [32:49<1:18:26, 1.51it/s]
27%|██▋ | 2680/9770 [32:49<1:18:51, 1.50it/s]
27%|██▋ | 2680/9770 [32:49<1:18:51, 1.50it/s]
27%|██▋ | 2681/9770 [32:50<1:17:48, 1.52it/s]
27%|██▋ | 2682/9770 [32:51<1:17:23, 1.53it/s]
27%|██▋ | 2683/9770 [32:51<1:16:50, 1.54it/s]
27%|██�
+0: {'loss': 0.6953, 'grad_norm': 0.7004235270005645, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.28}
+0: �� | 2684/9770 [32:52<1:16:16, 1.55it/s]
27%|██▋ | 2685/9770 [32:53<1:16:35, 1.54it/s]
27%|██▋ | 2686/9770 [32:53<1:17:38, 1.52it/s]
28%|██▊ | 2687/9770 [32:54<1:17:48, 1.52it/s]
28%|██▊ | 2688/9770 [32:55<1:16:56, 1.53it/s]
28%|██▊ | 2689/9770 [32:55<1:16:35, 1.54it/s]
28%|██▊ | 2690/9770 [32:56<1:16:45, 1.54it/s]
28%|██▊ | 2690/9770 [32:56<1:16:45, 1.54it/s]
28%|██▊ | 2691/9770 [32:57<1:16:26, 1.54it/s]
28%|██▊ | 2692/9770 [32:57<1:17:35, 1.52it/s]
28%|██▊ | 2693/9770 [32:58<1:17:43, 1.52it/s]
28%|██▊ | 2694/9770 [32:59<1:17:42, 1.52it/s]
28%|██▊ | 2695/9770 [32:59<1:16:55, 1.53it/s]
28%|██▊ | 2696/9770 [33:00<1:16:31, 1.54it/s]
28%|██▊ | 2697/9770 [33:00<1:16:43, 1.54it/s]
28%|██▊ | 2698/9770 [33:01<1:16:57, 1.53it/s]
28%|██▊
+0: {'loss': 0.6957, 'grad_norm': 0.6823894126516401, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.28}
+0: {'loss': 0.6987, 'grad_norm': 0.6873280943033803, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.28}
+0: | 2699/9770 [33:02<1:16:52, 1.53it/s]
28%|██▊ | 2700/9770 [33:02<1:17:10, 1.53it/s]
28%|██▊ | 2700/9770 [33:02<1:17:10, 1.53it/s]
28%|██▊ | 2701/9770 [33:03<1:17:14, 1.53it/s]
28%|██▊ | 2702/9770 [33:04<1:16:57, 1.53it/s]
28%|██▊ | 2703/9770 [33:04<1:18:43, 1.50it/s]
28%|██▊ | 2704/9770 [33:05<1:18:35, 1.50it/s]
28%|██▊ | 2705/9770 [33:06<1:17:53, 1.51it/s]
28%|██▊ | 2706/9770 [33:06<1:17:55, 1.51it/s]
28%|██▊ | 2707/9770 [33:07<1:18:17, 1.50it/s]
28%|██▊ | 2708/9770 [33:08<1:18:37, 1.50it/s]
28%|██▊ | 2709/9770 [33:08<1:17:44, 1.51it/s]
28%|██▊ | 2710/9770 [33:09<1:17:38, 1.52it/s]
28%|██▊ | 2710/9770 [33:09<1:17:38, 1.52it/s]
28%|██▊ | 2711/9770 [33:10<1:17:04, 1.53it/s]
28%|██▊ | 2712/9770 [3
+0: {'loss': 0.6776, 'grad_norm': 0.6137549934793076, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.28}
+0: 3:10<1:16:51, 1.53it/s]
28%|██▊ | 2713/9770 [33:11<1:15:50, 1.55it/s]
28%|██▊ | 2714/9770 [33:12<1:16:48, 1.53it/s]
28%|██▊ | 2715/9770 [33:12<1:16:54, 1.53it/s]
28%|██▊ | 2716/9770 [33:13<1:17:10, 1.52it/s]
28%|██▊ | 2717/9770 [33:14<1:16:47, 1.53it/s]
28%|██▊ | 2718/9770 [33:14<1:16:41, 1.53it/s]
28%|██▊ | 2719/9770 [33:15<1:16:37, 1.53it/s]
28%|██▊ | 2720/9770 [33:16<1:16:44, 1.53it/s]
28%|██▊ | 2720/9770 [33:16<1:16:44, 1.53it/s]
28%|██▊ | 2721/9770 [33:16<1:16:27, 1.54it/s]
28%|██▊ | 2722/9770 [33:17<1:16:48, 1.53it/s]
28%|██▊ | 2723/9770 [33:18<1:16:24, 1.54it/s]
28%|██▊ | 2724/9770 [33:18<1:17:04, 1.52it/s]
28%|██▊ | 2725/9770 [33:19<1:16:54, 1.53it/s]
28%|██▊ | 2726/9770 [33:20<1:16:47, 1.53it/s]
28%|██▊ | 2727/9770 [33:20<1:17
+0: {'loss': 0.7144, 'grad_norm': 0.7114441459312836, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.28}
+0: {'loss': 0.6926, 'grad_norm': 0.6816157335832983, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.28}
+0: :15, 1.52it/s]
28%|██▊ | 2728/9770 [33:21<1:17:15, 1.52it/s]
28%|██▊ | 2729/9770 [33:22<1:17:17, 1.52it/s]
28%|██▊ | 2730/9770 [33:22<1:17:14, 1.52it/s]
28%|██▊ | 2730/9770 [33:22<1:17:14, 1.52it/s]
28%|██▊ | 2731/9770 [33:23<1:17:03, 1.52it/s]
28%|██▊ | 2732/9770 [33:23<1:16:18, 1.54it/s]
28%|██▊ | 2733/9770 [33:24<1:16:49, 1.53it/s]
28%|██▊ | 2734/9770 [33:25<1:17:06, 1.52it/s]
28%|██▊ | 2735/9770 [33:25<1:16:38, 1.53it/s]
28%|██▊ | 2736/9770 [33:26<1:16:26, 1.53it/s]
28%|██▊ | 2737/9770 [33:27<1:16:49, 1.53it/s]
28%|██▊ | 2738/9770 [33:27<1:16:44, 1.53it/s]
28%|██▊ | 2739/9770 [33:28<1:17:02, 1.52it/s]
28%|██▊ | 2740/9770 [33:29<1:16:53, 1.52it/s]
28%|██▊ | 2740/9770 [33:29<1:16:53, 1.52it/s
+0: {'loss': 0.6937, 'grad_norm': 0.6993635335347452, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.28}
+0: ]
28%|██▊ | 2741/9770 [33:29<1:16:56, 1.52it/s]
28%|██▊ | 2742/9770 [33:30<1:17:00, 1.52it/s]
28%|██▊ | 2743/9770 [33:31<1:16:24, 1.53it/s]
28%|██▊ | 2744/9770 [33:31<1:17:09, 1.52it/s]
28%|██▊ | 2745/9770 [33:32<1:17:36, 1.51it/s]
28%|██▊ | 2746/9770 [33:33<1:18:10, 1.50it/s]
28%|██▊ | 2747/9770 [33:33<1:17:56, 1.50it/s]
28%|██▊ | 2748/9770 [33:34<1:18:07, 1.50it/s]
28%|██▊ | 2749/9770 [33:35<1:17:10, 1.52it/s]
28%|██▊ | 2750/9770 [33:35<1:17:05, 1.52it/s]
28%|██▊ | 2750/9770 [33:35<1:17:05, 1.52it/s]
28%|██▊ | 2751/9770 [33:36<1:16:21, 1.53it/s]
28%|██▊ | 2752/9770 [33:37<1:15:56, 1.54it/s]
28%|██▊ | 2753/9770 [33:37<1:15:55, 1.54it/s]
28%|██▊ | 2754/9770 [33:38<1:16:02, 1.54it/s]
28%|██▊ | 2755/9770 [33:39<1:15:59, 1.54it/s]
28%|�
+0: {'loss': 0.6744, 'grad_norm': 0.7044848293097536, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.28}
+0: �█▊ | 2756/9770 [33:39<1:15:45, 1.54it/s]
28%|██▊ | 2757/9770 [33:40<1:16:07, 1.54it/s]
28%|██▊ | 2758/9770 [33:41<1:16:01, 1.54it/s]
28%|██▊ | 2759/9770 [33:41<1:16:25, 1.53it/s]
28%|██▊ | 2760/9770 [33:42<1:17:04, 1.52it/s]
28%|██▊ | 2760/9770 [33:42<1:17:04, 1.52it/s]
28%|██▊ | 2761/9770 [33:43<1:18:19, 1.49it/s]
28%|██▊ | 2762/9770 [33:43<1:17:52, 1.50it/s]
28%|██▊ | 2763/9770 [33:44<1:17:35, 1.51it/s]
28%|██▊ | 2764/9770 [33:45<1:16:54, 1.52it/s]
28%|██▊ | 2765/9770 [33:45<1:17:21, 1.51it/s]
28%|██▊ | 2766/9770 [33:46<1:17:30, 1.51it/s]
28%|██▊ | 2767/9770 [33:47<1:25:46, 1.36it/s]
28%|██▊ | 2768/9770 [33:47<1:22:48, 1.41it/s]
28%|██▊ | 2769/9770 [33:48<1:20:37, 1.45it/s]
28%|██▊ | 2770/9770 [33:49<1:19:46, 1.46it/s]
+0: {'loss': 0.6785, 'grad_norm': 0.6635997869028231, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.28}
+0: {'loss': 0.7011, 'grad_norm': 0.6666638442957333, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.28}
+0:
28%|██▊ | 2770/9770 [33:49<1:19:46, 1.46it/s]
28%|██▊ | 2771/9770 [33:49<1:19:23, 1.47it/s]
28%|██▊ | 2772/9770 [33:50<1:18:03, 1.49it/s]
28%|██▊ | 2773/9770 [33:51<1:16:41, 1.52it/s]
28%|██▊ | 2774/9770 [33:51<1:15:27, 1.55it/s]
28%|██▊ | 2775/9770 [33:52<1:15:21, 1.55it/s]
28%|██▊ | 2776/9770 [33:53<1:16:27, 1.52it/s]
28%|██▊ | 2777/9770 [33:53<1:15:43, 1.54it/s]
28%|██▊ | 2778/9770 [33:54<1:17:04, 1.51it/s]
28%|██▊ | 2779/9770 [33:55<1:16:17, 1.53it/s]
28%|██▊ | 2780/9770 [33:55<1:17:16, 1.51it/s]
28%|██▊ | 2780/9770 [33:55<1:17:16, 1.51it/s]
28%|██▊ | 2781/9770 [33:56<1:17:38, 1.50it/s]
28%|██▊ | 2782/9770 [33:57<1:16:13, 1.53it/s]
28%|██▊ | 2783/9770 [33:57<1:15:58, 1.53it/s]
28%|██▊ | 2784/97
+0: {'loss': 0.6971, 'grad_norm': 0.6546325638321053, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.29}
+0: 70 [33:58<1:15:46, 1.54it/s]
29%|██▊ | 2785/9770 [33:59<1:15:53, 1.53it/s]
29%|██▊ | 2786/9770 [33:59<1:16:14, 1.53it/s]
29%|██▊ | 2787/9770 [34:00<1:16:55, 1.51it/s]
29%|██▊ | 2788/9770 [34:00<1:16:27, 1.52it/s]
29%|██▊ | 2789/9770 [34:01<1:16:51, 1.51it/s]
29%|██▊ | 2790/9770 [34:02<1:26:22, 1.35it/s]
29%|██▊ | 2790/9770 [34:02<1:26:22, 1.35it/s]
29%|██▊ | 2791/9770 [34:03<1:23:36, 1.39it/s]
29%|██▊ | 2792/9770 [34:03<1:22:59, 1.40it/s]
29%|██▊ | 2793/9770 [34:04<1:21:08, 1.43it/s]
29%|██▊ | 2794/9770 [34:05<1:19:20, 1.47it/s]
29%|██▊ | 2795/9770 [34:05<1:19:46, 1.46it/s]
29%|██▊ | 2796/9770 [34:06<1:18:12, 1.49it/s]
29%|██▊ | 2797/9770 [34:07<1:17:14, 1.50it/s]
29%|██▊ | 2798/9770 [34:07<1:17:16, 1.50it/s]
29%|██▊ | 2799/9770 [34:08
+0: {'loss': 0.6971, 'grad_norm': 0.7212135225200016, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.29}
+0: {'loss': 0.7044, 'grad_norm': 0.6597693574036081, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.29}
+0: <1:16:41, 1.51it/s]
29%|██▊ | 2800/9770 [34:09<1:16:47, 1.51it/s]
29%|██▊ | 2800/9770 [34:09<1:16:47, 1.51it/s]
29%|██▊ | 2801/9770 [34:09<1:16:41, 1.51it/s]
29%|██▊ | 2802/9770 [34:10<1:17:28, 1.50it/s]
29%|██▊ | 2803/9770 [34:11<1:16:52, 1.51it/s]
29%|██▊ | 2804/9770 [34:11<1:16:48, 1.51it/s]
29%|██▊ | 2805/9770 [34:12<1:16:40, 1.51it/s]
29%|██▊ | 2806/9770 [34:13<1:16:43, 1.51it/s]
29%|██▊ | 2807/9770 [34:13<1:16:34, 1.52it/s]
29%|██▊ | 2808/9770 [34:14<1:16:47, 1.51it/s]
29%|██▉ | 2809/9770 [34:15<1:16:17, 1.52it/s]
29%|██▉ | 2810/9770 [34:15<1:15:59, 1.53it/s]
29%|██▉ | 2810/9770 [34:15<1:15:59, 1.53it/s]
29%|██▉ | 2811/9770 [34:16<1:16:09, 1.52it/s]
29%|██▉ | 2812/9770 [34:17<1:16:07, 1.5
+0: {'loss': 0.6917, 'grad_norm': 0.6578503686481688, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.29}
+0: 2it/s]
29%|██▉ | 2813/9770 [34:17<1:16:10, 1.52it/s]
29%|██▉ | 2814/9770 [34:18<1:16:00, 1.53it/s]
29%|██▉ | 2815/9770 [34:19<1:15:52, 1.53it/s]
29%|██▉ | 2816/9770 [34:19<1:16:12, 1.52it/s]
29%|██▉ | 2817/9770 [34:20<1:16:25, 1.52it/s]
29%|██▉ | 2818/9770 [34:21<1:15:28, 1.54it/s]
29%|██▉ | 2819/9770 [34:21<1:14:51, 1.55it/s]
29%|██▉ | 2820/9770 [34:22<1:15:25, 1.54it/s]
29%|██▉ | 2820/9770 [34:22<1:15:25, 1.54it/s]
29%|██▉ | 2821/9770 [34:22<1:14:59, 1.54it/s]
29%|██▉ | 2822/9770 [34:23<1:16:16, 1.52it/s]
29%|██▉ | 2823/9770 [34:24<1:16:17, 1.52it/s]
29%|██▉ | 2824/9770 [34:24<1:15:57, 1.52it/s]
29%|██▉ | 2825/9770 [34:25<1:16:13, 1.52it/s]
29%|██▉ | 2826/9770 [34:26<1:25:17, 1.36it/s]
29%|██▉ | 2827/9770 [34:27<1:23:14, 1.39it/s]
2
+0: {'loss': 0.6794, 'grad_norm': 0.6592209926870769, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.29}
+0: {'loss': 0.7005, 'grad_norm': 0.6793546919894566, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.29}
+0: 9%|██▉ | 2828/9770 [34:27<1:21:10, 1.43it/s]
29%|██▉ | 2829/9770 [34:28<1:19:47, 1.45it/s]
29%|██▉ | 2830/9770 [34:29<1:19:05, 1.46it/s]
29%|██▉ | 2830/9770 [34:29<1:19:05, 1.46it/s]
29%|██▉ | 2831/9770 [34:29<1:17:10, 1.50it/s]
29%|██▉ | 2832/9770 [34:30<1:16:18, 1.52it/s]
29%|██▉ | 2833/9770 [34:31<1:16:08, 1.52it/s]
29%|██▉ | 2834/9770 [34:31<1:15:19, 1.53it/s]
29%|██▉ | 2835/9770 [34:32<1:26:53, 1.33it/s]
29%|██▉ | 2836/9770 [34:33<1:23:21, 1.39it/s]
29%|██▉ | 2837/9770 [34:34<1:31:32, 1.26it/s]
29%|██▉ | 2838/9770 [34:35<1:27:12, 1.32it/s]
29%|██▉ | 2839/9770 [34:35<1:24:13, 1.37it/s]
29%|██▉ | 2840/9770 [34:36<1:22:02, 1.41it/s]
29%|██▉ | 2840/9770 [34:36<1:22:02, 1.41it/s]
29%|██▉
+0: {'loss': 0.7013, 'grad_norm': 0.6364167733450667, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.29}
+0: | 2841/9770 [34:37<1:20:29, 1.43it/s]
29%|██▉ | 2842/9770 [34:37<1:19:34, 1.45it/s]
29%|██▉ | 2843/9770 [34:38<1:17:54, 1.48it/s]
29%|██▉ | 2844/9770 [34:39<1:16:12, 1.51it/s]
29%|██▉ | 2845/9770 [34:39<1:15:43, 1.52it/s]
29%|██▉ | 2846/9770 [34:40<1:15:52, 1.52it/s]
29%|██▉ | 2847/9770 [34:40<1:15:33, 1.53it/s]
29%|██▉ | 2848/9770 [34:41<1:15:37, 1.53it/s]
29%|██▉ | 2849/9770 [34:42<1:28:18, 1.31it/s]
29%|██▉ | 2850/9770 [34:43<1:24:02, 1.37it/s]
29%|██▉ | 2850/9770 [34:43<1:24:02, 1.37it/s]
29%|██▉ | 2851/9770 [34:43<1:21:39, 1.41it/s]
29%|██▉ | 2852/9770 [34:44<1:30:36, 1.27it/s]
29%|██▉ | 2853/9770 [34:45<1:26:12, 1.34it/s]
29%|██▉ | 2854/9770 [34:46<1:23:09, 1.39it/s]
29%|██▉ | 2855/9770 [34:46<1:20:38, 1.43it/s]
29%|██▉ | 28
+0: {'loss': 0.69, 'grad_norm': 0.7694239540103294, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.29}
+0: 56/9770 [34:47<1:19:48, 1.44it/s]
29%|██▉ | 2857/9770 [34:48<1:18:36, 1.47it/s]
29%|██▉ | 2858/9770 [34:48<1:18:24, 1.47it/s]
29%|██▉ | 2859/9770 [34:49<1:17:29, 1.49it/s]
29%|██▉ | 2860/9770 [34:50<1:25:26, 1.35it/s]
29%|██▉ | 2860/9770 [34:50<1:25:26, 1.35it/s]
29%|██▉ | 2861/9770 [34:51<1:30:31, 1.27it/s]
29%|██▉ | 2862/9770 [34:51<1:26:11, 1.34it/s]
29%|██▉ | 2863/9770 [34:52<1:32:19, 1.25it/s]
29%|██▉ | 2864/9770 [34:53<1:27:16, 1.32it/s]
29%|██▉ | 2865/9770 [34:54<1:23:29, 1.38it/s]
29%|██▉ | 2866/9770 [34:54<1:22:50, 1.39it/s]
29%|██▉ | 2867/9770 [34:55<1:20:26, 1.43it/s]
29%|██▉ | 2868/9770 [34:56<1:19:19, 1.45it/s]
29%|██▉ | 2869/9770 [34:56<1:17:27, 1.48it/s]
29%|██▉ | 2870/9770 [34:57<1:15:53, 1.52it/s]
+0: {'loss': 0.6773, 'grad_norm': 0.6418972172187125, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.29}
+0: {'loss': 0.7115, 'grad_norm': 0.7219525462185958, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.29}
+0:
29%|██▉ | 2870/9770 [34:57<1:15:53, 1.52it/s]
29%|██▉ | 2871/9770 [34:58<1:14:53, 1.54it/s]
29%|██▉ | 2872/9770 [34:59<1:27:20, 1.32it/s]
29%|██▉ | 2873/9770 [34:59<1:22:57, 1.39it/s]
29%|██▉ | 2874/9770 [35:00<1:20:53, 1.42it/s]
29%|██▉ | 2875/9770 [35:01<1:19:08, 1.45it/s]
29%|██▉ | 2876/9770 [35:01<1:17:30, 1.48it/s]
29%|██▉ | 2877/9770 [35:02<1:16:43, 1.50it/s]
29%|██▉ | 2878/9770 [35:03<1:16:23, 1.50it/s]
29%|██▉ | 2879/9770 [35:03<1:15:56, 1.51it/s]
29%|██▉ | 2880/9770 [35:04<1:16:09, 1.51it/s]
29%|██▉ | 2880/9770 [35:04<1:16:09, 1.51it/s]
29%|██▉ | 2881/9770 [35:05<1:15:14, 1.53it/s]
29%|██▉ | 2882/9770 [35:05<1:14:47, 1.53it/s]
30%|██▉ | 2883/9770 [35:06<1:14:04, 1.55it/s]
30%|██▉ | 2884/9770 [35:06<1:14:42,
+0: {'loss': 0.6836, 'grad_norm': 0.7350393946798834, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.3}
+0: 1.54it/s]
30%|██▉ | 2885/9770 [35:07<1:14:53, 1.53it/s]
30%|██▉ | 2886/9770 [35:08<1:25:58, 1.33it/s]
30%|██▉ | 2887/9770 [35:09<1:22:43, 1.39it/s]
30%|██▉ | 2888/9770 [35:09<1:21:44, 1.40it/s]
30%|██▉ | 2889/9770 [35:10<1:19:28, 1.44it/s]
30%|██▉ | 2890/9770 [35:11<1:18:00, 1.47it/s]
30%|██▉ | 2890/9770 [35:11<1:18:00, 1.47it/s]
30%|██▉ | 2891/9770 [35:11<1:17:31, 1.48it/s]
30%|██▉ | 2892/9770 [35:12<1:16:50, 1.49it/s]
30%|██▉ | 2893/9770 [35:13<1:16:13, 1.50it/s]
30%|██▉ | 2894/9770 [35:13<1:15:43, 1.51it/s]
30%|██▉ | 2895/9770 [35:14<1:16:27, 1.50it/s]
30%|██▉ | 2896/9770 [35:15<1:16:59, 1.49it/s]
30%|██▉ | 2897/9770 [35:15<1:16:46, 1.49it/s]
30%|██▉ | 2898/9770 [35:16<1:15:21, 1.52it/s]
30%|██▉ | 2899/9770 [35:17<1:15:22, 1.52it/
+0: {'loss': 0.7202, 'grad_norm': 0.6766754879569933, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.3}
+0: {'loss': 0.7011, 'grad_norm': 0.6646716199848026, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.3}
+0: s]
30%|██▉ | 2900/9770 [35:17<1:15:55, 1.51it/s]
30%|██▉ | 2900/9770 [35:17<1:15:55, 1.51it/s]
30%|██▉ | 2901/9770 [35:18<1:16:00, 1.51it/s]
30%|██▉ | 2902/9770 [35:19<1:16:00, 1.51it/s]
30%|██▉ | 2903/9770 [35:19<1:15:57, 1.51it/s]
30%|██▉ | 2904/9770 [35:20<1:15:33, 1.51it/s]
30%|██▉ | 2905/9770 [35:21<1:15:31, 1.51it/s]
30%|██▉ | 2906/9770 [35:21<1:15:54, 1.51it/s]
30%|██▉ | 2907/9770 [35:22<1:15:36, 1.51it/s]
30%|██▉ | 2908/9770 [35:23<1:15:37, 1.51it/s]
30%|██▉ | 2909/9770 [35:23<1:14:58, 1.53it/s]
30%|██▉ | 2910/9770 [35:24<1:14:58, 1.52it/s]
30%|██▉ | 2910/9770 [35:24<1:14:58, 1.52it/s]
30%|██▉ | 2911/9770 [35:25<1:15:46, 1.51it/s]
30%|██▉ | 2912/9770 [35:25<1:14:48, 1.53it/s]
30%|██
+0: {'loss': 0.6891, 'grad_norm': 0.6485553091648115, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.3}
+0: ▉ | 2913/9770 [35:26<1:14:55, 1.53it/s]
30%|██▉ | 2914/9770 [35:27<1:15:05, 1.52it/s]
30%|██▉ | 2915/9770 [35:27<1:15:23, 1.52it/s]
30%|██▉ | 2916/9770 [35:28<1:15:30, 1.51it/s]
30%|██▉ | 2917/9770 [35:29<1:16:02, 1.50it/s]
30%|██▉ | 2918/9770 [35:29<1:15:57, 1.50it/s]
30%|██▉ | 2919/9770 [35:30<1:15:15, 1.52it/s]
30%|██▉ | 2920/9770 [35:31<1:16:02, 1.50it/s]
30%|██▉ | 2920/9770 [35:31<1:16:02, 1.50it/s]
30%|██▉ | 2921/9770 [35:31<1:16:14, 1.50it/s]
30%|██▉ | 2922/9770 [35:32<1:15:56, 1.50it/s]
30%|██▉ | 2923/9770 [35:33<1:15:15, 1.52it/s]
30%|██▉ | 2924/9770 [35:33<1:14:56, 1.52it/s]
30%|██▉ | 2925/9770 [35:34<1:15:22, 1.51it/s]
30%|██▉ | 2926/9770 [35:35<1:15:06, 1.52it/s]
30%|██▉ | 2927/9770 [35:35<1:15:24, 1.51it/s]
30%|██▉
+0: {'loss': 0.6953, 'grad_norm': 0.6789681285276337, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.3}
+0: {'loss': 0.6945, 'grad_norm': 0.7220289214637919, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.3}
+0: | 2928/9770 [35:36<1:15:10, 1.52it/s]
30%|██▉ | 2929/9770 [35:37<1:15:03, 1.52it/s]
30%|██▉ | 2930/9770 [35:37<1:15:43, 1.51it/s]
30%|██▉ | 2930/9770 [35:37<1:15:43, 1.51it/s]
30%|███ | 2931/9770 [35:38<1:15:59, 1.50it/s]
30%|███ | 2932/9770 [35:39<1:17:42, 1.47it/s]
30%|███ | 2933/9770 [35:39<1:18:03, 1.46it/s]
30%|███ | 2934/9770 [35:40<1:17:18, 1.47it/s]
30%|███ | 2935/9770 [35:41<1:16:32, 1.49it/s]
30%|███ | 2936/9770 [35:41<1:22:57, 1.37it/s]
30%|███ | 2937/9770 [35:42<1:20:27, 1.42it/s]
30%|███ | 2938/9770 [35:43<1:18:29, 1.45it/s]
30%|███ | 2939/9770 [35:43<1:16:49, 1.48it/s]
30%|███ | 2940/9770 [35:44<1:16:24, 1.49it/s]
30%|███ | 2940/9770 [35:44<1:16:24, 1.49it/s]
30%|███ | 2941/9770 [
+0: {'loss': 0.6687, 'grad_norm': 0.6520813555126397, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.3}
+0: 35:45<1:16:26, 1.49it/s]
30%|███ | 2942/9770 [35:45<1:15:52, 1.50it/s]
30%|███ | 2943/9770 [35:46<1:17:07, 1.48it/s]
30%|███ | 2944/9770 [35:47<1:16:33, 1.49it/s]
30%|███ | 2945/9770 [35:47<1:15:54, 1.50it/s]
30%|███ | 2946/9770 [35:48<1:17:08, 1.47it/s]
30%|███ | 2947/9770 [35:49<1:16:32, 1.49it/s]
30%|███ | 2948/9770 [35:49<1:14:54, 1.52it/s]
30%|███ | 2949/9770 [35:50<1:15:14, 1.51it/s]
30%|███ | 2950/9770 [35:51<1:15:03, 1.51it/s]
30%|███ | 2950/9770 [35:51<1:15:03, 1.51it/s]
30%|███ | 2951/9770 [35:51<1:16:38, 1.48it/s]
30%|███ | 2952/9770 [35:52<1:17:33, 1.47it/s]
30%|███ | 2953/9770 [35:53<1:16:44, 1.48it/s]
30%|███ | 2954/9770 [35:53<1:16:35, 1.48it/s]
30%|███ | 2955/9770 [35:54<1:16:38, 1.48it/s]
30%|███ | 2956/9770 [35:55<1:1
+0: {'loss': 0.7153, 'grad_norm': 0.6057842498278948, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.3}
+0: 5:57, 1.50it/s]
30%|███ | 2957/9770 [35:55<1:15:13, 1.51it/s]
30%|███ | 2958/9770 [35:56<1:15:28, 1.50it/s]
30%|███ | 2959/9770 [35:57<1:15:44, 1.50it/s]
30%|███ | 2960/9770 [35:57<1:15:58, 1.49it/s]
30%|███ | 2960/9770 [35:57<1:15:58, 1.49it/s]
30%|███ | 2961/9770 [35:58<1:15:18, 1.51it/s]
30%|███ | 2962/9770 [35:59<1:14:35, 1.52it/s]
30%|███ | 2963/9770 [35:59<1:16:22, 1.49it/s]
30%|███ | 2964/9770 [36:00<1:15:08, 1.51it/s]
30%|███ | 2965/9770 [36:01<1:15:57, 1.49it/s]
30%|███ | 2966/9770 [36:01<1:15:33, 1.50it/s]
30%|███ | 2967/9770 [36:02<1:15:18, 1.51it/s]
30%|███ | 2968/9770 [36:03<1:16:43, 1.48it/s]
30%|███ | 2969/9770 [36:03<1:15:54, 1.49it/s]
30%|███ | 2970/9770 [36:04<1:15:15, 1.51it/s]
+0: {'loss': 0.6948, 'grad_norm': 0.6663808491051855, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.3}
+0: {'loss': 0.6808, 'grad_norm': 0.6202681039044606, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.3}
+0:
30%|███ | 2970/9770 [36:04<1:15:15, 1.51it/s]
30%|███ | 2971/9770 [36:05<1:15:19, 1.50it/s]
30%|███ | 2972/9770 [36:05<1:15:04, 1.51it/s]
30%|███ | 2973/9770 [36:06<1:14:18, 1.52it/s]
30%|███ | 2974/9770 [36:07<1:14:17, 1.52it/s]
30%|███ | 2975/9770 [36:07<1:13:37, 1.54it/s]
30%|███ | 2976/9770 [36:08<1:14:07, 1.53it/s]
30%|███ | 2977/9770 [36:09<1:13:39, 1.54it/s]
30%|███ | 2978/9770 [36:09<1:14:01, 1.53it/s]
30%|███ | 2979/9770 [36:10<1:14:54, 1.51it/s]
31%|███ | 2980/9770 [36:11<1:15:29, 1.50it/s]
31%|███ | 2980/9770 [36:11<1:15:29, 1.50it/s]
31%|███ | 2981/9770 [36:11<1:15:26, 1.50it/s]
31%|███ | 2982/9770 [36:12<1:14:30, 1.52it/s]
31%|███ | 2983/9770 [36:13<1:14:28, 1.52it/s]
31%|███ | 2984/9770 [36:13<1:14:31, 1.52it/s]
31%|�
+0: {'loss': 0.7121, 'grad_norm': 0.6380993857228097, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.31}
+0: ��██ | 2985/9770 [36:14<1:13:26, 1.54it/s]
31%|███ | 2986/9770 [36:15<1:14:02, 1.53it/s]
31%|███ | 2987/9770 [36:15<1:13:14, 1.54it/s]
31%|███ | 2988/9770 [36:16<1:12:08, 1.57it/s]
31%|███ | 2989/9770 [36:17<1:12:38, 1.56it/s]
31%|███ | 2990/9770 [36:17<1:13:14, 1.54it/s]
31%|███ | 2990/9770 [36:17<1:13:14, 1.54it/s]
31%|███ | 2991/9770 [36:18<1:13:29, 1.54it/s]
31%|███ | 2992/9770 [36:18<1:12:54, 1.55it/s]
31%|███ | 2993/9770 [36:19<1:13:50, 1.53it/s]
31%|███ | 2994/9770 [36:20<1:13:18, 1.54it/s]
31%|███ | 2995/9770 [36:20<1:13:53, 1.53it/s]
31%|███ | 2996/9770 [36:21<1:13:55, 1.53it/s]
31%|███ | 2997/9770 [36:22<1:14:17, 1.52it/s]
31%|███ | 2998/9770 [36:22<1:14:47, 1.51it/s]
31%|███ | 2999/9770 [36:23<1:15:06, 1.50it/s]
31%|███
+0: {'loss': 0.6971, 'grad_norm': 0.6776259121230762, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.31}
+0: {'loss': 0.6892, 'grad_norm': 0.7025850975168401, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.31}
+0: | 3000/9770 [36:24<1:14:56, 1.51it/s]
31%|███ | 3000/9770 [36:24<1:14:56, 1.51it/s]
31%|███ | 3001/9770 [36:24<1:14:25, 1.52it/s]
31%|███ | 3002/9770 [36:25<1:14:25, 1.52it/s]
31%|███ | 3003/9770 [36:26<1:14:11, 1.52it/s]
31%|███ | 3004/9770 [36:26<1:13:12, 1.54it/s]
31%|███ | 3005/9770 [36:27<1:13:02, 1.54it/s]
31%|███ | 3006/9770 [36:28<1:14:08, 1.52it/s]
31%|███ | 3007/9770 [36:28<1:14:13, 1.52it/s]
31%|███ | 3008/9770 [36:29<1:13:34, 1.53it/s]
31%|███ | 3009/9770 [36:30<1:14:03, 1.52it/s]
31%|███ | 3010/9770 [36:30<1:14:06, 1.52it/s]
31%|███ | 3010/9770 [36:30<1:14:06, 1.52it/s]
31%|███ | 3011/9770 [36:31<1:14:24, 1.51it/s]
31%|███ | 3012/9770 [36:32<1:14:11, 1.52it/s]
31%|███ | 3013/9
+0: {'loss': 0.6827, 'grad_norm': 0.6939108136749735, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.31}
+0: 770 [36:32<1:13:35, 1.53it/s]
31%|███ | 3014/9770 [36:33<1:15:08, 1.50it/s]
31%|███ | 3015/9770 [36:34<1:15:11, 1.50it/s]
31%|███ | 3016/9770 [36:34<1:15:04, 1.50it/s]
31%|███ | 3017/9770 [36:35<1:14:15, 1.52it/s]
31%|���██ | 3018/9770 [36:36<1:14:03, 1.52it/s]
31%|███ | 3019/9770 [36:36<1:13:51, 1.52it/s]
31%|███ | 3020/9770 [36:37<1:13:47, 1.52it/s]
31%|███ | 3020/9770 [36:37<1:13:47, 1.52it/s]
31%|███ | 3021/9770 [36:38<1:13:17, 1.53it/s]
31%|███ | 3022/9770 [36:38<1:13:07, 1.54it/s]
31%|███ | 3023/9770 [36:39<1:12:59, 1.54it/s]
31%|███ | 3024/9770 [36:40<1:13:12, 1.54it/s]
31%|███ | 3025/9770 [36:40<1:13:42, 1.53it/s]
31%|███ | 3026/9770 [36:41<1:13:26, 1.53it/s]
31%|███ | 3027/9770 [36:41<1:12:08, 1.56it/s]
31%|███ | 3028/9770 [36:4
+0: {'loss': 0.7, 'grad_norm': 0.6628349251671267, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.31}
+0: {'loss': 0.7257, 'grad_norm': 0.6559916093778086, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.31}
+0: 2<1:12:04, 1.56it/s]
31%|███ | 3029/9770 [36:43<1:13:01, 1.54it/s]
31%|███ | 3030/9770 [36:43<1:13:14, 1.53it/s]
31%|███ | 3030/9770 [36:43<1:13:14, 1.53it/s]
31%|███ | 3031/9770 [36:44<1:13:47, 1.52it/s]
31%|███ | 3032/9770 [36:45<1:14:10, 1.51it/s]
31%|███ | 3033/9770 [36:45<1:15:40, 1.48it/s]
31%|███ | 3034/9770 [36:46<1:15:16, 1.49it/s]
31%|███ | 3035/9770 [36:47<1:14:45, 1.50it/s]
31%|███ | 3036/9770 [36:47<1:14:17, 1.51it/s]
31%|███ | 3037/9770 [36:48<1:13:32, 1.53it/s]
31%|███ | 3038/9770 [36:49<1:13:27, 1.53it/s]
31%|███ | 3039/9770 [36:49<1:13:35, 1.52it/s]
31%|███ | 3040/9770 [36:50<1:13:17, 1.53it/s]
31%|███ | 3040/9770 [36:50<1:13:17, 1.53it/s]
31%|███ | 3041/9770 [36:51<1:13:25, 1.
+0: {'loss': 0.6922, 'grad_norm': 0.7144948194607025, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.31}
+0: 53it/s]
31%|███ | 3042/9770 [36:51<1:13:42, 1.52it/s]
31%|███ | 3043/9770 [36:52<1:15:15, 1.49it/s]
31%|███ | 3044/9770 [36:53<1:15:00, 1.49it/s]
31%|███ | 3045/9770 [36:53<1:15:01, 1.49it/s]
31%|███ | 3046/9770 [36:54<1:15:24, 1.49it/s]
31%|███ | 3047/9770 [36:55<1:15:02, 1.49it/s]
31%|███ | 3048/9770 [36:55<1:14:55, 1.50it/s]
31%|███ | 3049/9770 [36:56<1:15:03, 1.49it/s]
31%|███ | 3050/9770 [36:57<1:15:44, 1.48it/s]
31%|███ | 3050/9770 [36:57<1:15:44, 1.48it/s]
31%|███ | 3051/9770 [36:57<1:15:28, 1.48it/s]
31%|███ | 3052/9770 [36:58<1:14:32, 1.50it/s]
31%|███ | 3053/9770 [36:59<1:14:16, 1.51it/s]
31%|███▏ | 3054/9770 [36:59<1:13:16, 1.53it/s]
31%|███▏ | 3055/9770 [37:00<1:13:33, 1.52it/s]
31%|███▏ | 3056/9770 [37:01<1:12:46, 1.54i
+0: {'loss': 0.6651, 'grad_norm': 0.65454515169968, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.31}
+0: t/s]
31%|███▏ | 3057/9770 [37:01<1:13:09, 1.53it/s]
31%|███▏ | 3058/9770 [37:02<1:12:18, 1.55it/s]
31%|███▏ | 3059/9770 [37:03<1:12:56, 1.53it/s]
31%|███▏ | 3060/9770 [37:03<1:13:03, 1.53it/s]
31%|███▏ | 3060/9770 [37:03<1:13:03, 1.53it/s]
31%|███▏ | 3061/9770 [37:04<1:13:52, 1.51it/s]
31%|███▏ | 3062/9770 [37:05<1:13:11, 1.53it/s]
31%|███▏ | 3063/9770 [37:05<1:13:44, 1.52it/s]
31%|███▏ | 3064/9770 [37:06<1:13:58, 1.51it/s]
31%|███▏ | 3065/9770 [37:07<1:15:10, 1.49it/s]
31%|███▏ | 3066/9770 [37:07<1:14:42, 1.50it/s]
31%|███▏ | 3067/9770 [37:08<1:14:58, 1.49it/s]
31%|███▏ | 3068/9770 [37:09<1:13:43, 1.52it/s]
31%|███▏ | 3069/9770 [37:09<1:14:59, 1.49it/s]
31%|███▏ | 3070/9770 [37:10<1:14:57, 1.49it/s]
+0: {'loss': 0.6984, 'grad_norm': 0.656416424567537, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.31}
+0: {'loss': 0.6846, 'grad_norm': 0.657974632287886, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.32}
+0:
31%|███▏ | 3070/9770 [37:10<1:14:57, 1.49it/s]
31%|███▏ | 3071/9770 [37:11<1:14:24, 1.50it/s]
31%|███▏ | 3072/9770 [37:11<1:14:13, 1.50it/s]
31%|███▏ | 3073/9770 [37:12<1:15:31, 1.48it/s]
31%|███▏ | 3074/9770 [37:13<1:15:25, 1.48it/s]
31%|███▏ | 3075/9770 [37:13<1:15:17, 1.48it/s]
31%|███▏ | 3076/9770 [37:14<1:14:47, 1.49it/s]
31%|███▏ | 3077/9770 [37:15<1:15:22, 1.48it/s]
32%|███▏ | 3078/9770 [37:15<1:15:05, 1.49it/s]
32%|███▏ | 3079/9770 [37:16<1:14:44, 1.49it/s]
32%|███▏ | 3080/9770 [37:17<1:14:51, 1.49it/s]
32%|███▏ | 3080/9770 [37:17<1:14:51, 1.49it/s]
32%|███▏ | 3081/9770 [37:17<1:14:51, 1.49it/s]
32%|███▏ | 3082/9770 [37:18<1:14:58, 1.49it/s]
32%|███▏ | 3083/9770 [37:19<1:14:38, 1.49it/s]
32%|███▏
+0: {'loss': 0.694, 'grad_norm': 0.6135256708218882, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.32}
+0: | 3084/9770 [37:19<1:14:35, 1.49it/s]
32%|███▏ | 3085/9770 [37:20<1:13:29, 1.52it/s]
32%|███▏ | 3086/9770 [37:21<1:13:44, 1.51it/s]
32%|███▏ | 3087/9770 [37:21<1:14:04, 1.50it/s]
32%|███▏ | 3088/9770 [37:22<1:12:52, 1.53it/s]
32%|███▏ | 3089/9770 [37:23<1:12:39, 1.53it/s]
32%|███▏ | 3090/9770 [37:23<1:12:36, 1.53it/s]
32%|███▏ | 3090/9770 [37:23<1:12:36, 1.53it/s]
32%|███▏ | 3091/9770 [37:24<1:12:06, 1.54it/s]
32%|███▏ | 3092/9770 [37:25<1:12:40, 1.53it/s]
32%|███▏ | 3093/9770 [37:25<1:13:11, 1.52it/s]
32%|███▏ | 3094/9770 [37:26<1:14:05, 1.50it/s]
32%|███▏ | 3095/9770 [37:27<1:13:54, 1.51it/s]
32%|███▏ | 3096/9770 [37:27<1:14:03, 1.50it/s]
32%|███▏ | 3097/9770 [37:28<1:15:15, 1.48it/s]
32%|███▏ | 3098/9770 [37:29<1:14:08, 1.50i
+0: {'loss': 0.6862, 'grad_norm': 0.6402061509023796, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.32}
+0: {'loss': 0.699, 'grad_norm': 0.6419710411618812, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.32}
+0: t/s]
32%|███▏ | 3099/9770 [37:29<1:14:15, 1.50it/s]
32%|███▏ | 3100/9770 [37:30<1:13:58, 1.50it/s]
32%|███▏ | 3100/9770 [37:30<1:13:58, 1.50it/s]
32%|███▏ | 3101/9770 [37:31<1:14:37, 1.49it/s]
32%|███▏ | 3102/9770 [37:31<1:14:37, 1.49it/s]
32%|███▏ | 3103/9770 [37:32<1:13:41, 1.51it/s]
32%|███▏ | 3104/9770 [37:33<1:13:45, 1.51it/s]
32%|███▏ | 3105/9770 [37:33<1:14:23, 1.49it/s]
32%|███▏ | 3106/9770 [37:34<1:13:53, 1.50it/s]
32%|███▏ | 3107/9770 [37:35<1:13:14, 1.52it/s]
32%|███▏ | 3108/9770 [37:35<1:13:01, 1.52it/s]
32%|███▏ | 3109/9770 [37:36<1:13:21, 1.51it/s]
32%|███▏ | 3110/9770 [37:37<1:13:09, 1.52it/s]
32%|███▏ | 3110/9770 [37:37<1:13:09, 1.52it/s]
32%|███▏ | 3111/9770 [37:37
+0: {'loss': 0.7156, 'grad_norm': 0.6484948921832452, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.32}
+0: <1:13:28, 1.51it/s]
32%|███▏ | 3112/9770 [37:38<1:13:26, 1.51it/s]
32%|███▏ | 3113/9770 [37:39<1:13:18, 1.51it/s]
32%|███▏ | 3114/9770 [37:39<1:12:31, 1.53it/s]
32%|███▏ | 3115/9770 [37:40<1:12:32, 1.53it/s]
32%|███▏ | 3116/9770 [37:40<1:12:09, 1.54it/s]
32%|███▏ | 3117/9770 [37:41<1:12:29, 1.53it/s]
32%|███▏ | 3118/9770 [37:42<1:12:59, 1.52it/s]
32%|███▏ | 3119/9770 [37:42<1:12:10, 1.54it/s]
32%|███▏ | 3120/9770 [37:43<1:12:13, 1.53it/s]
32%|███▏ | 3120/9770 [37:43<1:12:13, 1.53it/s]
32%|███▏ | 3121/9770 [37:44<1:13:44, 1.50it/s]
32%|███▏ | 3122/9770 [37:44<1:13:04, 1.52it/s]
32%|███▏ | 3123/9770 [37:45<1:13:25, 1.51it/s]
32%|███▏ | 3124/9770 [37:46<1:14:15, 1.49it/s]
32%|███▏ | 3125/9770 [37:46<1:14:02, 1.50it/s]
32%|███▏
+0: {'loss': 0.6871, 'grad_norm': 0.6760327250876728, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.32}
+0: | 3126/9770 [37:47<1:13:55, 1.50it/s]
32%|███▏ | 3127/9770 [37:48<1:14:39, 1.48it/s]
32%|███▏ | 3128/9770 [37:48<1:14:20, 1.49it/s]
32%|███▏ | 3129/9770 [37:49<1:12:29, 1.53it/s]
32%|███▏ | 3130/9770 [37:50<1:12:29, 1.53it/s]
32%|███▏ | 3130/9770 [37:50<1:12:29, 1.53it/s]
32%|███▏ | 3131/9770 [37:50<1:12:42, 1.52it/s]
32%|███▏ | 3132/9770 [37:51<1:12:04, 1.54it/s]
32%|███▏ | 3133/9770 [37:52<1:12:27, 1.53it/s]
32%|███▏ | 3134/9770 [37:52<1:11:34, 1.55it/s]
32%|███▏ | 3135/9770 [37:53<1:11:41, 1.54it/s]
32%|███▏ | 3136/9770 [37:54<1:11:58, 1.54it/s]
32%|███▏ | 3137/9770 [37:54<1:12:25, 1.53it/s]
32%|███▏ | 3138/9770 [37:55<1:12:39, 1.52it/s]
32%|███▏ | 3139/9770 [37:56<1:13:41, 1.50it/s]
32%|███▏ | 3140/9770 [37:56<1:13:49, 1.50i
+0: {'loss': 0.6871, 'grad_norm': 0.6567712769002737, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.32}
+0: {'loss': 0.6751, 'grad_norm': 0.653068885444386, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.32}
+0: t/s]
32%|███▏ | 3140/9770 [37:56<1:13:49, 1.50it/s]
32%|███▏ | 3141/9770 [37:57<1:14:20, 1.49it/s]
32%|███▏ | 3142/9770 [37:58<1:14:22, 1.49it/s]
32%|███▏ | 3143/9770 [37:58<1:14:43, 1.48it/s]
32%|███▏ | 3144/9770 [37:59<1:13:55, 1.49it/s]
32%|███▏ | 3145/9770 [38:00<1:14:54, 1.47it/s]
32%|███▏ | 3146/9770 [38:00<1:14:30, 1.48it/s]
32%|███▏ | 3147/9770 [38:01<1:15:35, 1.46it/s]
32%|███▏ | 3148/9770 [38:02<1:14:40, 1.48it/s]
32%|███▏ | 3149/9770 [38:02<1:14:07, 1.49it/s]
32%|███▏ | 3150/9770 [38:03<1:13:12, 1.51it/s]
32%|███▏ | 3150/9770 [38:03<1:13:12, 1.51it/s]
32%|███▏ | 3151/9770 [38:04<1:13:27, 1.50it/s]
32%|███▏ | 3152/9770 [38:04<1:12:58, 1.51it/s]
32%|███▏ | 3153/9770 [38:05
+0: {'loss': 0.7061, 'grad_norm': 0.7236697260126095, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.32}
+0: <1:13:06, 1.51it/s]
32%|███▏ | 3154/9770 [38:06<1:13:01, 1.51it/s]
32%|███▏ | 3155/9770 [38:06<1:12:44, 1.52it/s]
32%|███▏ | 3156/9770 [38:07<1:12:48, 1.51it/s]
32%|███▏ | 3157/9770 [38:08<1:13:00, 1.51it/s]
32%|███▏ | 3158/9770 [38:08<1:12:58, 1.51it/s]
32%|███▏ | 3159/9770 [38:09<1:13:06, 1.51it/s]
32%|███▏ | 3160/9770 [38:10<1:12:47, 1.51it/s]
32%|███▏ | 3160/9770 [38:10<1:12:47, 1.51it/s]
32%|███▏ | 3161/9770 [38:10<1:12:17, 1.52it/s]
32%|███▏ | 3162/9770 [38:11<1:11:50, 1.53it/s]
32%|███▏ | 3163/9770 [38:12<1:13:14, 1.50it/s]
32%|███▏ | 3164/9770 [38:12<1:13:44, 1.49it/s]
32%|███▏ | 3165/9770 [38:13<1:13:33, 1.50it/s]
32%|███▏ | 3166/9770 [38:14<1:13:41, 1.49it/s]
32%|███▏ | 3167/9770 [38:14<1:14:20, 1.48it/s]
32%|███▏
+0: {'loss': 0.6945, 'grad_norm': 0.6666809603218877, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.32}
+0: {'loss': 0.6795, 'grad_norm': 0.7178729703245473, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.33}
+0: | 3168/9770 [38:15<1:14:33, 1.48it/s]
32%|███▏ | 3169/9770 [38:16<1:14:27, 1.48it/s]
32%|███▏ | 3170/9770 [38:16<1:14:54, 1.47it/s]
32%|███▏ | 3170/9770 [38:16<1:14:54, 1.47it/s]
32%|███▏ | 3171/9770 [38:17<1:14:42, 1.47it/s]
32%|███▏ | 3172/9770 [38:18<1:14:36, 1.47it/s]
32%|███▏ | 3173/9770 [38:18<1:13:57, 1.49it/s]
32%|███▏ | 3174/9770 [38:19<1:13:37, 1.49it/s]
32%|███▏ | 3175/9770 [38:20<1:14:01, 1.49it/s]
33%|███▎ | 3176/9770 [38:20<1:12:56, 1.51it/s]
33%|███▎ | 3177/9770 [38:21<1:12:36, 1.51it/s]
33%|███▎ | 3178/9770 [38:22<1:14:03, 1.48it/s]
33%|███▎ | 3179/9770 [38:22<1:13:27, 1.50it/s]
33%|███▎ | 3180/9770 [38:23<1:13:43, 1.49it/s]
33%|███▎ | 3180/9770 [38:23<1:13:43, 1.49it/s]
3
+0: {'loss': 0.6854, 'grad_norm': 0.6499722619294048, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.33}
+0: 3%|███▎ | 3181/9770 [38:24<1:13:40, 1.49it/s]
33%|███▎ | 3182/9770 [38:24<1:14:01, 1.48it/s]
33%|███▎ | 3183/9770 [38:25<1:13:43, 1.49it/s]
33%|███▎ | 3184/9770 [38:26<1:13:32, 1.49it/s]
33%|███▎ | 3185/9770 [38:26<1:12:27, 1.51it/s]
33%|███▎ | 3186/9770 [38:27<1:12:37, 1.51it/s]
33%|███▎ | 3187/9770 [38:28<1:12:39, 1.51it/s]
33%|███▎ | 3188/9770 [38:28<1:12:05, 1.52it/s]
33%|███▎ | 3189/9770 [38:29<1:11:27, 1.53it/s]
33%|███▎ | 3190/9770 [38:30<1:11:21, 1.54it/s]
33%|███▎ | 3190/9770 [38:30<1:11:21, 1.54it/s]
33%|███▎ | 3191/9770 [38:30<1:11:08, 1.54it/s]
33%|███▎ | 3192/9770 [38:31<1:11:46, 1.53it/s]
33%|███▎ | 3193/9770 [38:32<1:10:59, 1.54it/s]
33%|███▎ | 3194/9770 [38:32<1:11:43, 1.53it/s]
33%|███▎ | 3195/9770 [38:33
+0: {'loss': 0.718, 'grad_norm': 0.6500696032011751, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.33}
+0: <1:11:41, 1.53it/s]
33%|███▎ | 3196/9770 [38:34<1:10:47, 1.55it/s]
33%|███▎ | 3197/9770 [38:34<1:11:47, 1.53it/s]
33%|███▎ | 3198/9770 [38:35<1:11:52, 1.52it/s]
33%|███▎ | 3199/9770 [38:36<1:13:12, 1.50it/s]
33%|███▎ | 3200/9770 [38:36<1:14:16, 1.47it/s]
33%|███▎ | 3200/9770 [38:36<1:14:16, 1.47it/s]
33%|███▎ | 3201/9770 [38:37<1:13:31, 1.49it/s]
33%|███▎ | 3202/9770 [38:38<1:13:16, 1.49it/s]
33%|███▎ | 3203/9770 [38:38<1:13:55, 1.48it/s]
33%|███▎ | 3204/9770 [38:39<1:13:21, 1.49it/s]
33%|███▎ | 3205/9770 [38:40<1:13:35, 1.49it/s]
33%|███▎ | 3206/9770 [38:40<1:13:02, 1.50it/s]
33%|███▎ | 3207/9770 [38:41<1:13:46, 1.48it/s]
33%|███▎ | 3208/9770 [38:42<1:13:31, 1.49it/s]
33%|███▎ | 3209/9770 [38:42<1:13:04, 1.50it/s]
33%|███▎
+0: {'loss': 0.6964, 'grad_norm': 0.8590862790298767, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.33}
+0: {'loss': 0.7184, 'grad_norm': 0.6313948194132907, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.33}
+0: | 3210/9770 [38:43<1:12:41, 1.50it/s]
33%|███▎ | 3210/9770 [38:43<1:12:41, 1.50it/s]
33%|███▎ | 3211/9770 [38:44<1:13:02, 1.50it/s]
33%|███▎ | 3212/9770 [38:44<1:13:00, 1.50it/s]
33%|███▎ | 3213/9770 [38:45<1:12:23, 1.51it/s]
33%|███▎ | 3214/9770 [38:46<1:12:11, 1.51it/s]
33%|███▎ | 3215/9770 [38:46<1:11:54, 1.52it/s]
33%|███▎ | 3216/9770 [38:47<1:12:36, 1.50it/s]
33%|███▎ | 3217/9770 [38:48<1:12:05, 1.52it/s]
33%|███▎ | 3218/9770 [38:48<1:12:24, 1.51it/s]
33%|███▎ | 3219/9770 [38:49<1:13:19, 1.49it/s]
33%|███▎ | 3220/9770 [38:50<1:13:25, 1.49it/s]
33%|███▎ | 3220/9770 [38:50<1:13:25, 1.49it/s]
33%|███▎ | 3221/9770 [38:50<1:13:08, 1.49it/s]
33%|███▎ | 3222/9770 [38:51<1:12:46, 1.50it/s]
3
+0: {'loss': 0.6624, 'grad_norm': 0.6270739749160082, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.33}
+0: 3%|███▎ | 3223/9770 [38:52<1:12:30, 1.50it/s]
33%|███▎ | 3224/9770 [38:52<1:12:28, 1.51it/s]
33%|███▎ | 3225/9770 [38:53<1:12:21, 1.51it/s]
33%|███▎ | 3226/9770 [38:54<1:12:36, 1.50it/s]
33%|███▎ | 3227/9770 [38:54<1:12:29, 1.50it/s]
33%|███▎ | 3228/9770 [38:55<1:12:01, 1.51it/s]
33%|███▎ | 3229/9770 [38:56<1:12:01, 1.51it/s]
33%|███▎ | 3230/9770 [38:56<1:10:51, 1.54it/s]
33%|███▎ | 3230/9770 [38:56<1:10:51, 1.54it/s]
33%|███▎ | 3231/9770 [38:57<1:11:23, 1.53it/s]
33%|███▎ | 3232/9770 [38:58<1:12:18, 1.51it/s]
33%|███▎ | 3233/9770 [38:58<1:12:24, 1.50it/s]
33%|███▎ | 3234/9770 [38:59<1:12:30, 1.50it/s]
33%|███▎ | 3235/9770 [39:00<1:11:50, 1.52it/s]
33%|███▎ | 3236/9770 [39:00<1:11:50, 1.52it/s]
33%|███▎ | 3237/9770 [39:01
+0: {'loss': 0.7067, 'grad_norm': 0.7241317913957019, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.33}
+0: {'loss': 0.6861, 'grad_norm': 0.6420427525099284, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.33}
+0: <1:11:59, 1.51it/s]
33%|███▎ | 3238/9770 [39:02<1:11:23, 1.53it/s]
33%|███▎ | 3239/9770 [39:02<1:11:00, 1.53it/s]
33%|███▎ | 3240/9770 [39:03<1:11:13, 1.53it/s]
33%|███▎ | 3240/9770 [39:03<1:11:13, 1.53it/s]
33%|███▎ | 3241/9770 [39:03<1:11:01, 1.53it/s]
33%|███▎ | 3242/9770 [39:04<1:11:23, 1.52it/s]
33%|███▎ | 3243/9770 [39:05<1:11:07, 1.53it/s]
33%|███▎ | 3244/9770 [39:06<1:17:11, 1.41it/s]
33%|███▎ | 3245/9770 [39:06<1:15:18, 1.44it/s]
33%|███▎ | 3246/9770 [39:07<1:13:44, 1.47it/s]
33%|███▎ | 3247/9770 [39:08<1:12:53, 1.49it/s]
33%|███▎ | 3248/9770 [39:08<1:12:23, 1.50it/s]
33%|███▎ | 3249/9770 [39:09<1:12:01, 1.51it/s]
33%|███▎ | 3250/9770 [39:10<1:10:40, 1.54it/s]
33%|███▎ |
+0: {'loss': 0.6896, 'grad_norm': 0.6628162365168045, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.33}
+0: 3250/9770 [39:10<1:10:40, 1.54it/s]
33%|███▎ | 3251/9770 [39:10<1:12:22, 1.50it/s]
33%|███▎ | 3252/9770 [39:11<1:12:21, 1.50it/s]
33%|███▎ | 3253/9770 [39:12<1:12:13, 1.50it/s]
33%|███▎ | 3254/9770 [39:12<1:13:22, 1.48it/s]
33%|███▎ | 3255/9770 [39:13<1:12:46, 1.49it/s]
33%|███▎ | 3256/9770 [39:14<1:12:05, 1.51it/s]
33%|███▎ | 3257/9770 [39:14<1:11:59, 1.51it/s]
33%|███▎ | 3258/9770 [39:15<1:11:36, 1.52it/s]
33%|███▎ | 3259/9770 [39:16<1:11:36, 1.52it/s]
33%|███▎ | 3260/9770 [39:16<1:12:41, 1.49it/s]
33%|███▎ | 3260/9770 [39:16<1:12:41, 1.49it/s]
33%|███▎ | 3261/9770 [39:17<1:11:22, 1.52it/s]
33%|███▎ | 3262/9770 [39:17<1:10:17, 1.54it/s]
33%|███▎ | 3263/9770 [39:18<1:09:44, 1.55it/s]
33%|███▎ | 3264/9770 [39:19<1:10:30, 1.54it/s]
3
+0: {'loss': 0.6765, 'grad_norm': 0.6468844116395263, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.33}
+0: 3%|███▎ | 3265/9770 [39:19<1:10:30, 1.54it/s]
33%|███▎ | 3266/9770 [39:20<1:10:55, 1.53it/s]
33%|███▎ | 3267/9770 [39:21<1:11:20, 1.52it/s]
33%|███▎ | 3268/9770 [39:21<1:12:24, 1.50it/s]
33%|███▎ | 3269/9770 [39:22<1:12:09, 1.50it/s]
33%|███▎ | 3270/9770 [39:23<1:11:48, 1.51it/s]
33%|███▎ | 3270/9770 [39:23<1:11:48, 1.51it/s]
33%|███▎ | 3271/9770 [39:23<1:11:52, 1.51it/s]
33%|███▎ | 3272/9770 [39:24<1:11:44, 1.51it/s]
34%|███▎ | 3273/9770 [39:25<1:11:56, 1.51it/s]
34%|███▎ | 3274/9770 [39:25<1:11:44, 1.51it/s]
34%|███▎ | 3275/9770 [39:26<1:12:37, 1.49it/s]
34%|███▎ | 3276/9770 [39:27<1:11:21, 1.52it/s]
34%|███▎ | 3277/9770 [39:27<1:11:17, 1.52it/s]
34%|███▎ | 3278/9770 [39:28<1:11:30, 1.51it/s]
34%|███▎ | 3279/9770 [39:29
+0: {'loss': 0.6865, 'grad_norm': 0.6961831320373305, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.34}
+0: {'loss': 0.6863, 'grad_norm': 0.6721689589677343, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.34}
+0: <1:10:25, 1.54it/s]
34%|███▎ | 3280/9770 [39:29<1:11:39, 1.51it/s]
34%|███▎ | 3280/9770 [39:29<1:11:39, 1.51it/s]
34%|███▎ | 3281/9770 [39:30<1:10:57, 1.52it/s]
34%|███▎ | 3282/9770 [39:31<1:10:42, 1.53it/s]
34%|███▎ | 3283/9770 [39:31<1:11:08, 1.52it/s]
34%|███▎ | 3284/9770 [39:32<1:11:56, 1.50it/s]
34%|███▎ | 3285/9770 [39:33<1:12:08, 1.50it/s]
34%|███▎ | 3286/9770 [39:33<1:11:48, 1.50it/s]
34%|███▎ | 3287/9770 [39:34<1:11:13, 1.52it/s]
34%|███▎ | 3288/9770 [39:35<1:11:07, 1.52it/s]
34%|███▎ | 3289/9770 [39:35<1:10:16, 1.54it/s]
34%|███▎ | 3290/9770 [39:36<1:10:39, 1.53it/s]
34%|███▎ | 3290/9770 [39:36<1:10:39, 1.53it/s]
34%|███▎ | 3291/9770 [39:37<1:11:05, 1.52it/s]
34%|███▎ |
+0: {'loss': 0.731, 'grad_norm': 0.7145796054358293, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.34}
+0: 3292/9770 [39:37<1:10:47, 1.53it/s]
34%|███▎ | 3293/9770 [39:38<1:10:21, 1.53it/s]
34%|███▎ | 3294/9770 [39:39<1:10:26, 1.53it/s]
34%|███▎ | 3295/9770 [39:39<1:09:26, 1.55it/s]
34%|███▎ | 3296/9770 [39:40<1:09:58, 1.54it/s]
34%|███▎ | 3297/9770 [39:41<1:09:49, 1.54it/s]
34%|███▍ | 3298/9770 [39:41<1:11:28, 1.51it/s]
34%|███▍ | 3299/9770 [39:42<1:11:26, 1.51it/s]
34%|███▍ | 3300/9770 [39:43<1:11:17, 1.51it/s]
34%|███▍ | 3300/9770 [39:43<1:11:17, 1.51it/s]
34%|███▍ | 3301/9770 [39:43<1:11:44, 1.50it/s]
34%|███▍ | 3302/9770 [39:44<1:11:32, 1.51it/s]
34%|███▍ | 3303/9770 [39:45<1:11:36, 1.51it/s]
34%|███▍ | 3304/9770 [39:45<1:12:03, 1.50it/s]
34%|███▍ | 3305/9770 [39:46<1:11:46, 1.50it/s]
34%|███▍ | 3306/9770 [39:47<1:11:26, 1.51it/s]
3
+0: {'loss': 0.7096, 'grad_norm': 0.6755966861746031, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.34}
+0: 4%|███▍ | 3307/9770 [39:47<1:12:14, 1.49it/s]
34%|███▍ | 3308/9770 [39:48<1:12:00, 1.50it/s]
34%|███▍ | 3309/9770 [39:49<1:10:56, 1.52it/s]
34%|███▍ | 3310/9770 [39:49<1:12:12, 1.49it/s]
34%|███▍ | 3310/9770 [39:49<1:12:12, 1.49it/s]
34%|███▍ | 3311/9770 [39:50<1:11:02, 1.52it/s]
34%|███▍ | 3312/9770 [39:51<1:11:15, 1.51it/s]
34%|███▍ | 3313/9770 [39:51<1:10:28, 1.53it/s]
34%|███▍ | 3314/9770 [39:52<1:10:40, 1.52it/s]
34%|███▍ | 3315/9770 [39:52<1:09:59, 1.54it/s]
34%|███▍ | 3316/9770 [39:53<1:10:09, 1.53it/s]
34%|███▍ | 3317/9770 [39:54<1:10:35, 1.52it/s]
34%|███▍ | 3318/9770 [39:54<1:10:15, 1.53it/s]
34%|███▍ | 3319/9770 [39:55<1:09:52, 1.54it/s]
34%|███▍ | 3320/9770 [39:56<1:09:48, 1.54it/s]
+0: {'loss': 0.7029, 'grad_norm': 0.6802038383709352, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.34}
+0: {'loss': 0.689, 'grad_norm': 0.6470768527254148, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.34}
+0:
34%|███▍ | 3320/9770 [39:56<1:09:48, 1.54it/s]
34%|███▍ | 3321/9770 [39:56<1:10:00, 1.54it/s]
34%|███▍ | 3322/9770 [39:57<1:10:12, 1.53it/s]
34%|███▍ | 3323/9770 [39:58<1:09:57, 1.54it/s]
34%|███▍ | 3324/9770 [39:58<1:10:07, 1.53it/s]
34%|███▍ | 3325/9770 [39:59<1:09:18, 1.55it/s]
34%|███▍ | 3326/9770 [40:00<1:09:53, 1.54it/s]
34%|███▍ | 3327/9770 [40:00<1:10:57, 1.51it/s]
34%|███▍ | 3328/9770 [40:01<1:10:52, 1.51it/s]
34%|███▍ | 3329/9770 [40:02<1:10:57, 1.51it/s]
34%|███▍ | 3330/9770 [40:02<1:10:47, 1.52it/s]
34%|███▍ | 3330/9770 [40:02<1:10:47, 1.52it/s]
34%|███▍ | 3331/9770 [40:03<1:10:41, 1.52it/s]
34%|███▍ | 3332/9770 [40:04<1:10:29, 1.52it/s]
34%|███▍ | 3333/9770 [40:04<1:16:53, 1.40it/s]
34%|███▍ |
+0: {'loss': 0.6975, 'grad_norm': 0.7476840593808041, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.34}
+0: 3334/9770 [40:05<1:14:17, 1.44it/s]
34%|███▍ | 3335/9770 [40:06<1:13:21, 1.46it/s]
34%|███▍ | 3336/9770 [40:06<1:12:57, 1.47it/s]
34%|███▍ | 3337/9770 [40:07<1:12:12, 1.48it/s]
34%|███▍ | 3338/9770 [40:08<1:12:58, 1.47it/s]
34%|███▍ | 3339/9770 [40:08<1:12:08, 1.49it/s]
34%|███▍ | 3340/9770 [40:09<1:11:53, 1.49it/s]
34%|███▍ | 3340/9770 [40:09<1:11:53, 1.49it/s]
34%|███▍ | 3341/9770 [40:10<1:11:12, 1.50it/s]
34%|███▍ | 3342/9770 [40:10<1:11:01, 1.51it/s]
34%|███▍ | 3343/9770 [40:11<1:09:36, 1.54it/s]
34%|███▍ | 3344/9770 [40:12<1:09:50, 1.53it/s]
34%|███▍ | 3345/9770 [40:12<1:09:51, 1.53it/s]
34%|███▍ | 3346/9770 [40:13<1:10:06, 1.53it/s]
34%|███▍ | 3347/9770 [40:14<1:10:20, 1.52it/s]
34%|███▍ | 3348/9770 [40:14<1:10:23, 1.52it/s]
3
+0: {'loss': 0.7047, 'grad_norm': 0.7322290166481111, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.34}
+0: {'loss': 0.7213, 'grad_norm': 0.7545565015916705, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.34}
+0: 4%|███▍ | 3349/9770 [40:15<1:10:02, 1.53it/s]
34%|███▍ | 3350/9770 [40:16<1:10:07, 1.53it/s]
34%|███▍ | 3350/9770 [40:16<1:10:07, 1.53it/s]
34%|███▍ | 3351/9770 [40:16<1:09:56, 1.53it/s]
34%|███▍ | 3352/9770 [40:17<1:09:33, 1.54it/s]
34%|███▍ | 3353/9770 [40:18<1:09:51, 1.53it/s]
34%|███▍ | 3354/9770 [40:18<1:10:20, 1.52it/s]
34%|███▍ | 3355/9770 [40:19<1:09:26, 1.54it/s]
34%|███▍ | 3356/9770 [40:19<1:08:41, 1.56it/s]
34%|███▍ | 3357/9770 [40:20<1:09:14, 1.54it/s]
34%|███▍ | 3358/9770 [40:21<1:08:40, 1.56it/s]
34%|███▍ | 3359/9770 [40:21<1:09:02, 1.55it/s]
34%|███▍ | 3360/9770 [40:22<1:08:30, 1.56it/s]
34%|███▍ | 3360/9770 [40:22<1:08:30, 1.56it/s]
34%|███▍ | 3361/9770 [40:23<1:09:1
+0: {'loss': 0.6951, 'grad_norm': 0.6700011484635324, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.34}
+0: 5, 1.54it/s]
34%|███▍ | 3362/9770 [40:23<1:08:48, 1.55it/s]
34%|███▍ | 3363/9770 [40:24<1:10:00, 1.53it/s]
34%|███▍ | 3364/9770 [40:25<1:11:19, 1.50it/s]
34%|███▍ | 3365/9770 [40:25<1:10:59, 1.50it/s]
34%|███▍ | 3366/9770 [40:26<1:10:31, 1.51it/s]
34%|███▍ | 3367/9770 [40:27<1:10:14, 1.52it/s]
34%|███▍ | 3368/9770 [40:27<1:10:10, 1.52it/s]
34%|███▍ | 3369/9770 [40:28<1:10:27, 1.51it/s]
34%|███▍ | 3370/9770 [40:29<1:10:46, 1.51it/s]
34%|███▍ | 3370/9770 [40:29<1:10:46, 1.51it/s]
35%|███▍ | 3371/9770 [40:29<1:10:46, 1.51it/s]
35%|███▍ | 3372/9770 [40:30<1:09:52, 1.53it/s]
35%|███▍ | 3373/9770 [40:31<1:09:22, 1.54it/s]
35%|███▍ | 3374/9770 [40:31<1:08:53, 1.55it/s]
35%|███▍ | 3375/9770 [40:32<1:08:05, 1.57it/s]
35%|███▍ |
+0: {'loss': 0.658, 'grad_norm': 0.6007452148124209, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.35}
+0: 3376/9770 [40:33<1:07:35, 1.58it/s]
35%|███▍ | 3377/9770 [40:33<1:07:37, 1.58it/s]
35%|███▍ | 3378/9770 [40:34<1:07:32, 1.58it/s]
35%|███▍ | 3379/9770 [40:34<1:08:32, 1.55it/s]
35%|███▍ | 3380/9770 [40:35<1:09:15, 1.54it/s]
35%|███▍ | 3380/9770 [40:35<1:09:15, 1.54it/s]
35%|███▍ | 3381/9770 [40:36<1:09:43, 1.53it/s]
35%|███▍ | 3382/9770 [40:36<1:09:17, 1.54it/s]
35%|███▍ | 3383/9770 [40:37<1:09:37, 1.53it/s]
35%|███▍ | 3384/9770 [40:38<1:09:54, 1.52it/s]
35%|███▍ | 3385/9770 [40:38<1:10:02, 1.52it/s]
35%|███▍ | 3386/9770 [40:39<1:09:00, 1.54it/s]
35%|███▍ | 3387/9770 [40:40<1:08:28, 1.55it/s]
35%|███▍ | 3388/9770 [40:40<1:08:59, 1.54it/s]
35%|███▍ | 3389/9770 [40:41<1:09:58, 1.52it/s]
35%|███▍ | 3390/9770 [40:42<1:10:06, 1.52it/s]
+0: {'loss': 0.6857, 'grad_norm': 0.6451253885718397, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.35}
+0: {'loss': 0.6762, 'grad_norm': 0.6668972209227537, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.35}
+0:
35%|███▍ | 3390/9770 [40:42<1:10:06, 1.52it/s]
35%|███▍ | 3391/9770 [40:42<1:10:08, 1.52it/s]
35%|███▍ | 3392/9770 [40:43<1:09:42, 1.52it/s]
35%|███▍ | 3393/9770 [40:44<1:09:24, 1.53it/s]
35%|███▍ | 3394/9770 [40:44<1:08:49, 1.54it/s]
35%|███▍ | 3395/9770 [40:45<1:09:02, 1.54it/s]
35%|███▍ | 3396/9770 [40:46<1:10:22, 1.51it/s]
35%|███▍ | 3397/9770 [40:46<1:09:55, 1.52it/s]
35%|███▍ | 3398/9770 [40:47<1:10:12, 1.51it/s]
35%|███▍ | 3399/9770 [40:48<1:09:41, 1.52it/s]
35%|███▍ | 3400/9770 [40:48<1:11:14, 1.49it/s]
35%|███▍ | 3400/9770 [40:48<1:11:14, 1.49it/s]
35%|███▍ | 3401/9770 [40:49<1:10:49, 1.50it/s]
35%|███▍ | 3402/9770 [40:50<1:09:49, 1.52it/s]
35%|███▍ | 3403/9770 [40:50<1:09:2
+0: {'loss': 0.6795, 'grad_norm': 0.6619191600210886, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.35}
+0: 1, 1.53it/s]
35%|███▍ | 3404/9770 [40:51<1:10:03, 1.51it/s]
35%|███▍ | 3405/9770 [40:52<1:09:00, 1.54it/s]
35%|███▍ | 3406/9770 [40:52<1:08:37, 1.55it/s]
35%|███▍ | 3407/9770 [40:53<1:08:58, 1.54it/s]
35%|███▍ | 3408/9770 [40:53<1:08:48, 1.54it/s]
35%|███▍ | 3409/9770 [40:54<1:08:55, 1.54it/s]
35%|███▍ | 3410/9770 [40:55<1:10:12, 1.51it/s]
35%|███▍ | 3410/9770 [40:55<1:10:12, 1.51it/s]
35%|███▍ | 3411/9770 [40:55<1:09:44, 1.52it/s]
35%|███▍ | 3412/9770 [40:56<1:09:40, 1.52it/s]
35%|███▍ | 3413/9770 [40:57<1:09:56, 1.51it/s]
35%|███▍ | 3414/9770 [40:57<1:10:05, 1.51it/s]
35%|███▍ | 3415/9770 [40:58<1:09:53, 1.52it/s]
35%|███▍ | 3416/9770 [40:59<1:09:58, 1.51it/s]
35%|███▍ | 3417/9770 [40:59<1:10:05, 1.51it/s]
35%|███▍ |
+0: {'loss': 0.6838, 'grad_norm': 0.7122524777539763, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.35}
+0: {'loss': 0.6981, 'grad_norm': 0.6082673342201741, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.35}
+0: 3418/9770 [41:00<1:09:02, 1.53it/s]
35%|███▍ | 3419/9770 [41:01<1:09:15, 1.53it/s]
35%|███▌ | 3420/9770 [41:01<1:09:27, 1.52it/s]
35%|███▌ | 3420/9770 [41:01<1:09:27, 1.52it/s]
35%|███▌ | 3421/9770 [41:02<1:10:25, 1.50it/s]
35%|███▌ | 3422/9770 [41:03<1:10:07, 1.51it/s]
35%|███▌ | 3423/9770 [41:03<1:09:31, 1.52it/s]
35%|███▌ | 3424/9770 [41:04<1:09:53, 1.51it/s]
35%|███▌ | 3425/9770 [41:05<1:09:55, 1.51it/s]
35%|███▌ | 3426/9770 [41:05<1:09:53, 1.51it/s]
35%|███▌ | 3427/9770 [41:06<1:10:04, 1.51it/s]
35%|███▌ | 3428/9770 [41:07<1:09:20, 1.52it/s]
35%|███▌ | 3429/9770 [41:07<1:08:36, 1.54it/s]
35%|███▌ | 3430/9770 [41:08<1:07:33, 1.56it/s]
35%|███▌ | 3430/9770 [41:08<1:07:33, 1.56it/s]
35%|█�
+0: {'loss': 0.6995, 'grad_norm': 0.6055907561270045, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.35}
+0: ��█▌ | 3431/9770 [41:09<1:07:51, 1.56it/s]
35%|███▌ | 3432/9770 [41:09<1:07:40, 1.56it/s]
35%|███▌ | 3433/9770 [41:10<1:07:25, 1.57it/s]
35%|███▌ | 3434/9770 [41:10<1:07:58, 1.55it/s]
35%|███▌ | 3435/9770 [41:11<1:08:19, 1.55it/s]
35%|███▌ | 3436/9770 [41:12<1:08:13, 1.55it/s]
35%|███▌ | 3437/9770 [41:12<1:07:59, 1.55it/s]
35%|███▌ | 3438/9770 [41:13<1:08:21, 1.54it/s]
35%|███▌ | 3439/9770 [41:14<1:09:01, 1.53it/s]
35%|███▌ | 3440/9770 [41:14<1:09:09, 1.53it/s]
35%|███▌ | 3440/9770 [41:14<1:09:09, 1.53it/s]
35%|███▌ | 3441/9770 [41:15<1:09:47, 1.51it/s]
35%|███▌ | 3442/9770 [41:16<1:09:49, 1.51it/s]
35%|███▌ | 3443/9770 [41:16<1:09:13, 1.52it/s]
35%|███▌ | 3444/9770 [41:17<1:09:33, 1.52it/s]
35%|███▌ | 3445/9770 [41:18<1:09:3
+0: {'loss': 0.6838, 'grad_norm': 0.6146586564798304, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.35}
+0: 6, 1.51it/s]
35%|███▌ | 3446/9770 [41:18<1:09:36, 1.51it/s]
35%|███▌ | 3447/9770 [41:19<1:08:47, 1.53it/s]
35%|███▌ | 3448/9770 [41:20<1:09:19, 1.52it/s]
35%|███▌ | 3449/9770 [41:20<1:09:29, 1.52it/s]
35%|███▌ | 3450/9770 [41:21<1:09:27, 1.52it/s]
35%|███▌ | 3450/9770 [41:21<1:09:27, 1.52it/s]
35%|███▌ | 3451/9770 [41:22<1:09:25, 1.52it/s]
35%|███▌ | 3452/9770 [41:22<1:08:52, 1.53it/s]
35%|███▌ | 3453/9770 [41:23<1:08:54, 1.53it/s]
35%|███▌ | 3454/9770 [41:24<1:08:43, 1.53it/s]
35%|███▌ | 3455/9770 [41:24<1:08:54, 1.53it/s]
35%|███▌ | 3456/9770 [41:25<1:09:06, 1.52it/s]
35%|███▌ | 3457/9770 [41:26<1:08:38, 1.53it/s]
35%|███▌ | 3458/9770 [41:26<1:08:09, 1.54it/s]
35%|███▌ | 3459/9770 [41:27<1:08:39, 1.53it/s]
35%|███▌ |
+0: {'loss': 0.6918, 'grad_norm': 0.6645738122245831, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.35}
+0: {'loss': 0.6998, 'grad_norm': 0.7174373189706844, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.35}
+0: 3460/9770 [41:28<1:08:54, 1.53it/s]
35%|███▌ | 3460/9770 [41:28<1:08:54, 1.53it/s]
35%|███▌ | 3461/9770 [41:28<1:08:32, 1.53it/s]
35%|███▌ | 3462/9770 [41:29<1:08:35, 1.53it/s]
35%|███▌ | 3463/9770 [41:29<1:08:40, 1.53it/s]
35%|███▌ | 3464/9770 [41:30<1:09:11, 1.52it/s]
35%|███▌ | 3465/9770 [41:31<1:09:45, 1.51it/s]
35%|███▌ | 3466/9770 [41:32<1:10:17, 1.49it/s]
35%|███▌ | 3467/9770 [41:32<1:09:48, 1.50it/s]
35%|███▌ | 3468/9770 [41:33<1:09:33, 1.51it/s]
36%|███▌ | 3469/9770 [41:34<1:09:57, 1.50it/s]
36%|███▌ | 3470/9770 [41:34<1:09:48, 1.50it/s]
36%|███▌ | 3470/9770 [41:34<1:09:48, 1.50it/s]
36%|███▌ | 3471/9770 [41:35<1:09:39, 1.51it/s]
36%|███▌ | 3472/9770 [41:36<1:10:18, 1.49it/s]
36%|█�
+0: {'loss': 0.6625, 'grad_norm': 0.6082861578740087, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.36}
+0: ��█▌ | 3473/9770 [41:36<1:10:07, 1.50it/s]
36%|███▌ | 3474/9770 [41:37<1:09:44, 1.50it/s]
36%|███▌ | 3475/9770 [41:37<1:09:23, 1.51it/s]
36%|███▌ | 3476/9770 [41:38<1:08:30, 1.53it/s]
36%|███▌ | 3477/9770 [41:39<1:08:32, 1.53it/s]
36%|███▌ | 3478/9770 [41:39<1:08:55, 1.52it/s]
36%|███▌ | 3479/9770 [41:40<1:08:54, 1.52it/s]
36%|███▌ | 3480/9770 [41:41<1:08:34, 1.53it/s]
36%|███▌ | 3480/9770 [41:41<1:08:34, 1.53it/s]
36%|███▌ | 3481/9770 [41:41<1:08:42, 1.53it/s]
36%|███▌ | 3482/9770 [41:42<1:08:45, 1.52it/s]
36%|███▌ | 3483/9770 [41:43<1:09:44, 1.50it/s]
36%|███▌ | 3484/9770 [41:43<1:09:26, 1.51it/s]
36%|███▌ | 3485/9770 [41:44<1:09:22, 1.51it/s]
36%|███▌ | 3486/9770 [41:45<1:08:14, 1.53it/s]
36%|███▌ | 3487/9770 [41:45<1:08:3
+0: {'loss': 0.6981, 'grad_norm': 0.6853542901976013, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.36}
+0: {'loss': 0.6857, 'grad_norm': 0.6317417521320896, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.36}
+0: 1, 1.53it/s]
36%|███▌ | 3488/9770 [41:46<1:08:11, 1.54it/s]
36%|███▌ | 3489/9770 [41:47<1:07:23, 1.55it/s]
36%|███▌ | 3490/9770 [41:47<1:07:59, 1.54it/s]
36%|███▌ | 3490/9770 [41:47<1:07:59, 1.54it/s]
36%|███▌ | 3491/9770 [41:48<1:07:25, 1.55it/s]
36%|███▌ | 3492/9770 [41:49<1:08:41, 1.52it/s]
36%|███▌ | 3493/9770 [41:49<1:09:08, 1.51it/s]
36%|███▌ | 3494/9770 [41:50<1:08:13, 1.53it/s]
36%|███▌ | 3495/9770 [41:51<1:09:24, 1.51it/s]
36%|███▌ | 3496/9770 [41:51<1:09:09, 1.51it/s]
36%|███▌ | 3497/9770 [41:52<1:09:11, 1.51it/s]
36%|███▌ | 3498/9770 [41:53<1:09:04, 1.51it/s]
36%|███▌ | 3499/9770 [41:53<1:08:57, 1.52it/s]
36%|███▌ | 3500/9770 [41:54<1:08:38, 1.52it/s]
36%|███▌ | 3500/97
+0: {'loss': 0.6523, 'grad_norm': 0.6105208182727795, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.36}
+0: 70 [41:54<1:08:38, 1.52it/s]
36%|███▌ | 3501/9770 [41:55<1:09:32, 1.50it/s]
36%|███▌ | 3502/9770 [41:55<1:08:40, 1.52it/s]
36%|███▌ | 3503/9770 [41:56<1:08:11, 1.53it/s]
36%|███▌ | 3504/9770 [41:57<1:08:24, 1.53it/s]
36%|███▌ | 3505/9770 [41:57<1:08:01, 1.53it/s]
36%|███▌ | 3506/9770 [41:58<1:07:47, 1.54it/s]
36%|███▌ | 3507/9770 [41:58<1:07:53, 1.54it/s]
36%|███▌ | 3508/9770 [41:59<1:09:05, 1.51it/s]
36%|███▌ | 3509/9770 [42:00<1:08:37, 1.52it/s]
36%|███▌ | 3510/9770 [42:00<1:09:27, 1.50it/s]
36%|███▌ | 3510/9770 [42:00<1:09:27, 1.50it/s]
36%|███▌ | 3511/9770 [42:01<1:09:31, 1.50it/s]
36%|███▌ | 3512/9770 [42:02<1:09:32, 1.50it/s]
36%|███▌ | 3513/9770 [42:02<1:08:52, 1.51it/s]
36%|███▌ | 3514/9770 [42:03<1:09:05, 1.51it/s]
36%|█�
+0: {'loss': 0.6862, 'grad_norm': 0.6976614513906086, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.36}
+0: ��█▌ | 3515/9770 [42:04<1:08:31, 1.52it/s]
36%|███▌ | 3516/9770 [42:04<1:09:11, 1.51it/s]
36%|███▌ | 3517/9770 [42:05<1:09:07, 1.51it/s]
36%|███▌ | 3518/9770 [42:06<1:09:00, 1.51it/s]
36%|███▌ | 3519/9770 [42:06<1:09:01, 1.51it/s]
36%|███▌ | 3520/9770 [42:07<1:08:31, 1.52it/s]
36%|███▌ | 3520/9770 [42:07<1:08:31, 1.52it/s]
36%|███▌ | 3521/9770 [42:08<1:08:22, 1.52it/s]
36%|███▌ | 3522/9770 [42:08<1:08:12, 1.53it/s]
36%|███▌ | 3523/9770 [42:09<1:07:54, 1.53it/s]
36%|███▌ | 3524/9770 [42:10<1:07:48, 1.54it/s]
36%|███▌ | 3525/9770 [42:10<1:08:11, 1.53it/s]
36%|███▌ | 3526/9770 [42:11<1:08:02, 1.53it/s]
36%|███▌ | 3527/9770 [42:12<1:08:02, 1.53it/s]
36%|███▌ | 3528/9770 [42:12<1:08:39, 1.52it/s]
36%|███▌ | 3529/9770 [42:13<1:08:4
+0: {'loss': 0.6654, 'grad_norm': 0.600091402224677, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.36}
+0: {'loss': 0.694, 'grad_norm': 0.6270725977134045, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.36}
+0: 1, 1.51it/s]
36%|███▌ | 3530/9770 [42:14<1:08:32, 1.52it/s]
36%|███▌ | 3530/9770 [42:14<1:08:32, 1.52it/s]
36%|███▌ | 3531/9770 [42:14<1:08:48, 1.51it/s]
36%|███▌ | 3532/9770 [42:15<1:10:04, 1.48it/s]
36%|███▌ | 3533/9770 [42:16<1:10:17, 1.48it/s]
36%|███▌ | 3534/9770 [42:16<1:09:05, 1.50it/s]
36%|███▌ | 3535/9770 [42:17<1:08:26, 1.52it/s]
36%|███▌ | 3536/9770 [42:18<1:08:37, 1.51it/s]
36%|███▌ | 3537/9770 [42:18<1:08:56, 1.51it/s]
36%|███▌ | 3538/9770 [42:19<1:08:16, 1.52it/s]
36%|███▌ | 3539/9770 [42:20<1:08:22, 1.52it/s]
36%|███▌ | 3540/9770 [42:20<1:09:10, 1.50it/s]
36%|███▌ | 3540/9770 [42:20<1:09:10, 1.50it/s]
36%|███▌ | 3541/9770 [42:21<1:08:54, 1.51it/s]
36%|███▋ | 3542/97
+0: {'loss': 0.6688, 'grad_norm': 0.624146249088739, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.36}
+0: 70 [42:22<1:08:53, 1.51it/s]
36%|███▋ | 3543/9770 [42:22<1:09:47, 1.49it/s]
36%|███▋ | 3544/9770 [42:23<1:09:16, 1.50it/s]
36%|███▋ | 3545/9770 [42:24<1:08:37, 1.51it/s]
36%|███▋ | 3546/9770 [42:24<1:09:59, 1.48it/s]
36%|███▋ | 3547/9770 [42:25<1:09:10, 1.50it/s]
36%|███▋ | 3548/9770 [42:26<1:09:02, 1.50it/s]
36%|███▋ | 3549/9770 [42:26<1:09:20, 1.50it/s]
36%|███▋ | 3550/9770 [42:27<1:07:49, 1.53it/s]
36%|███▋ | 3550/9770 [42:27<1:07:49, 1.53it/s]
36%|███▋ | 3551/9770 [42:28<1:09:34, 1.49it/s]
36%|███▋ | 3552/9770 [42:28<1:09:03, 1.50it/s]
36%|███▋ | 3553/9770 [42:29<1:08:32, 1.51it/s]
36%|███▋ | 3554/9770 [42:30<1:08:05, 1.52it/s]
36%|███▋ | 3555/9770 [42:30<1:08:03, 1.52it/s]
36%|███▋ | 3556/9770 [42:31<1:09:09, 1.50it/s]
36%|█�
+0: {'loss': 0.6884, 'grad_norm': 0.6602346578907496, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.36}
+0: ��█▋ | 3557/9770 [42:32<1:08:46, 1.51it/s]
36%|███▋ | 3558/9770 [42:32<1:08:42, 1.51it/s]
36%|███▋ | 3559/9770 [42:33<1:08:41, 1.51it/s]
36%|███▋ | 3560/9770 [42:34<1:08:18, 1.52it/s]
36%|███▋ | 3560/9770 [42:34<1:08:18, 1.52it/s]
36%|███▋ | 3561/9770 [42:34<1:07:01, 1.54it/s]
36%|███▋ | 3562/9770 [42:35<1:07:38, 1.53it/s]
36%|███▋ | 3563/9770 [42:35<1:06:38, 1.55it/s]
36%|███▋ | 3564/9770 [42:36<1:07:10, 1.54it/s]
36%|███▋ | 3565/9770 [42:37<1:07:19, 1.54it/s]
36%|███▋ | 3566/9770 [42:37<1:07:36, 1.53it/s]
37%|███▋ | 3567/9770 [42:38<1:08:35, 1.51it/s]
37%|███▋ | 3568/9770 [42:39<1:08:00, 1.52it/s]
37%|███▋ | 3569/9770 [42:39<1:09:06, 1.50it/s]
37%|███▋ | 3570/9770 [42:40<1:09:58, 1.48it/s]
+0: {'loss': 0.6905, 'grad_norm': 0.6706614677701693, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.37}
+0: {'loss': 0.665, 'grad_norm': 0.638804433261994, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.37}
+0:
37%|███▋ | 3570/9770 [42:40<1:09:58, 1.48it/s]
37%|███▋ | 3571/9770 [42:41<1:08:58, 1.50it/s]
37%|███▋ | 3572/9770 [42:41<1:08:11, 1.51it/s]
37%|███▋ | 3573/9770 [42:42<1:08:24, 1.51it/s]
37%|███▋ | 3574/9770 [42:43<1:08:37, 1.50it/s]
37%|███▋ | 3575/9770 [42:43<1:07:48, 1.52it/s]
37%|███▋ | 3576/9770 [42:44<1:07:06, 1.54it/s]
37%|███▋ | 3577/9770 [42:45<1:07:24, 1.53it/s]
37%|███▋ | 3578/9770 [42:45<1:07:36, 1.53it/s]
37%|███▋ | 3579/9770 [42:46<1:07:24, 1.53it/s]
37%|███▋ | 3580/9770 [42:47<1:07:44, 1.52it/s]
37%|███▋ | 3580/9770 [42:47<1:07:44, 1.52it/s]
37%|███▋ | 3581/9770 [42:47<1:08:16, 1.51it/s]
37%|███▋ | 3582/9770 [42:48<1:07:02, 1.54it/s]
37%|███▋ | 3583/9770 [42:49<1:07:05, 1.54it/s]
37%|███▋ | 3584/97
+0: {'loss': 0.6954, 'grad_norm': 0.6753442133488154, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.37}
+0: 70 [42:49<1:07:27, 1.53it/s]
37%|███▋ | 3585/9770 [42:50<1:07:42, 1.52it/s]
37%|███▋ | 3586/9770 [42:51<1:08:04, 1.51it/s]
37%|███▋ | 3587/9770 [42:51<1:07:18, 1.53it/s]
37%|███▋ | 3588/9770 [42:52<1:07:21, 1.53it/s]
37%|███▋ | 3589/9770 [42:53<1:07:38, 1.52it/s]
37%|███▋ | 3590/9770 [42:53<1:06:54, 1.54it/s]
37%|███▋ | 3590/9770 [42:53<1:06:54, 1.54it/s]
37%|███▋ | 3591/9770 [42:54<1:07:27, 1.53it/s]
37%|███▋ | 3592/9770 [42:55<1:07:05, 1.53it/s]
37%|███▋ | 3593/9770 [42:55<1:07:18, 1.53it/s]
37%|███▋ | 3594/9770 [42:56<1:07:40, 1.52it/s]
37%|███▋ | 3595/9770 [42:57<1:07:29, 1.53it/s]
37%|███▋ | 3596/9770 [42:57<1:07:14, 1.53it/s]
37%|███▋ | 3597/9770 [42:58<1:07:28, 1.52it/s]
37%|███▋ | 3598/9770 [42:58<1:07:04, 1.53it/s]
37%|█�
+0: {'loss': 0.6908, 'grad_norm': 0.6147818933922237, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.37}
+0: {'loss': 0.6891, 'grad_norm': 0.6362226689113726, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.37}
+0: ��█▋ | 3599/9770 [42:59<1:06:45, 1.54it/s]
37%|███▋ | 3600/9770 [43:00<1:06:19, 1.55it/s]
37%|███▋ | 3600/9770 [43:00<1:06:19, 1.55it/s]
37%|███▋ | 3601/9770 [43:00<1:06:37, 1.54it/s]
37%|███▋ | 3602/9770 [43:01<1:06:49, 1.54it/s]
37%|███▋ | 3603/9770 [43:02<1:07:13, 1.53it/s]
37%|███▋ | 3604/9770 [43:02<1:07:29, 1.52it/s]
37%|███▋ | 3605/9770 [43:03<1:07:25, 1.52it/s]
37%|███▋ | 3606/9770 [43:04<1:07:27, 1.52it/s]
37%|███▋ | 3607/9770 [43:04<1:07:10, 1.53it/s]
37%|███▋ | 3608/9770 [43:05<1:06:43, 1.54it/s]
37%|███▋ | 3609/9770 [43:06<1:07:17, 1.53it/s]
37%|███▋ | 3610/9770 [43:06<1:07:43, 1.52it/s]
37%|███▋ | 3610/9770 [43:06<1:07:43, 1.52it/s]
37%|███▋ | 3611/9770 [43:07<1:07:50, 1.5
+0: {'loss': 0.662, 'grad_norm': 0.600883506632343, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.37}
+0: 1it/s]
37%|███▋ | 3612/9770 [43:08<1:06:34, 1.54it/s]
37%|███▋ | 3613/9770 [43:08<1:06:59, 1.53it/s]
37%|███▋ | 3614/9770 [43:09<1:07:50, 1.51it/s]
37%|███▋ | 3615/9770 [43:10<1:07:07, 1.53it/s]
37%|███▋ | 3616/9770 [43:10<1:07:37, 1.52it/s]
37%|███▋ | 3617/9770 [43:11<1:07:15, 1.52it/s]
37%|███▋ | 3618/9770 [43:12<1:06:32, 1.54it/s]
37%|███▋ | 3619/9770 [43:12<1:07:27, 1.52it/s]
37%|███▋ | 3620/9770 [43:13<1:07:26, 1.52it/s]
37%|███▋ | 3620/9770 [43:13<1:07:26, 1.52it/s]
37%|███▋ | 3621/9770 [43:14<1:07:14, 1.52it/s]
37%|███▋ | 3622/9770 [43:14<1:06:41, 1.54it/s]
37%|███▋ | 3623/9770 [43:15<1:06:47, 1.53it/s]
37%|███▋ | 3624/9770 [43:15<1:06:47, 1.53it/s]
37%|███▋ | 3625/9770 [43:16<1:08:14, 1.50it/s]
37%|███▋ | 3626/97
+0: {'loss': 0.6772, 'grad_norm': 0.6530779905064625, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.37}
+0: 70 [43:17<1:07:41, 1.51it/s]
37%|███▋ | 3627/9770 [43:17<1:07:11, 1.52it/s]
37%|███▋ | 3628/9770 [43:18<1:07:32, 1.52it/s]
37%|███▋ | 3629/9770 [43:19<1:06:39, 1.54it/s]
37%|███▋ | 3630/9770 [43:19<1:07:04, 1.53it/s]
37%|███▋ | 3630/9770 [43:19<1:07:04, 1.53it/s]
37%|███▋ | 3631/9770 [43:20<1:07:16, 1.52it/s]
37%|███▋ | 3632/9770 [43:21<1:06:27, 1.54it/s]
37%|███▋ | 3633/9770 [43:21<1:06:24, 1.54it/s]
37%|███▋ | 3634/9770 [43:22<1:06:11, 1.55it/s]
37%|███▋ | 3635/9770 [43:23<1:05:31, 1.56it/s]
37%|███▋ | 3636/9770 [43:23<1:06:30, 1.54it/s]
37%|███▋ | 3637/9770 [43:24<1:07:05, 1.52it/s]
37%|███▋ | 3638/9770 [43:25<1:06:28, 1.54it/s]
37%|███▋ | 3639/9770 [43:25<1:06:51, 1.53it/s]
37%|███▋ | 3640/9770 [43:26<1:07:08, 1.52it/s]
+0: {'loss': 0.6715, 'grad_norm': 0.5987265179126843, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.37}
+0: {'loss': 0.6982, 'grad_norm': 0.6085838817532605, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.37}
+0:
37%|███▋ | 3640/9770 [43:26<1:07:08, 1.52it/s]
37%|███▋ | 3641/9770 [43:27<1:07:35, 1.51it/s]
37%|███▋ | 3642/9770 [43:27<1:07:28, 1.51it/s]
37%|███▋ | 3643/9770 [43:28<1:07:21, 1.52it/s]
37%|███▋ | 3644/9770 [43:29<1:06:34, 1.53it/s]
37%|███▋ | 3645/9770 [43:29<1:06:48, 1.53it/s]
37%|███▋ | 3646/9770 [43:30<1:06:25, 1.54it/s]
37%|███▋ | 3647/9770 [43:31<1:06:47, 1.53it/s]
37%|███▋ | 3648/9770 [43:31<1:07:14, 1.52it/s]
37%|███▋ | 3649/9770 [43:32<1:06:58, 1.52it/s]
37%|███▋ | 3650/9770 [43:33<1:07:35, 1.51it/s]
37%|███▋ | 3650/9770 [43:33<1:07:35, 1.51it/s]
37%|███▋ | 3651/9770 [43:33<1:06:59, 1.52it/s]
37%|███▋ | 3652/9770 [43:34<1:06:52, 1.52it/s]
37%|███▋ | 3653/9770 [43:35<1:07:23, 1.5
+0: {'loss': 0.6938, 'grad_norm': 0.6480782617729687, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.37}
+0: 1it/s]
37%|███▋ | 3654/9770 [43:35<1:07:07, 1.52it/s]
37%|███▋ | 3655/9770 [43:36<1:06:50, 1.52it/s]
37%|███▋ | 3656/9770 [43:36<1:06:58, 1.52it/s]
37%|███▋ | 3657/9770 [43:37<1:06:33, 1.53it/s]
37%|███▋ | 3658/9770 [43:38<1:07:44, 1.50it/s]
37%|███▋ | 3659/9770 [43:38<1:07:52, 1.50it/s]
37%|███▋ | 3660/9770 [43:39<1:08:00, 1.50it/s]
37%|███▋ | 3660/9770 [43:39<1:08:00, 1.50it/s]
37%|███▋ | 3661/9770 [43:40<1:07:40, 1.50it/s]
37%|███▋ | 3662/9770 [43:40<1:07:13, 1.51it/s]
37%|███▋ | 3663/9770 [43:41<1:07:14, 1.51it/s]
38%|███▊ | 3664/9770 [43:42<1:07:05, 1.52it/s]
38%|███▊ | 3665/9770 [43:42<1:07:11, 1.51it/s]
38%|███▊ | 3666/9770 [43:43<1:06:42, 1.53it/s]
38%|███▊ | 3667/9770 [43:44<1:06:55, 1.52it/s]
38%|███▊ | 3668/97
+0: {'loss': 0.6965, 'grad_norm': 0.6308774434035178, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.38}
+0: {'loss': 0.6901, 'grad_norm': 0.6356834869732386, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.38}
+0: 70 [43:44<1:07:17, 1.51it/s]
38%|███▊ | 3669/9770 [43:45<1:07:27, 1.51it/s]
38%|███▊ | 3670/9770 [43:46<1:07:27, 1.51it/s]
38%|███▊ | 3670/9770 [43:46<1:07:27, 1.51it/s]
38%|███▊ | 3671/9770 [43:46<1:07:01, 1.52it/s]
38%|███▊ | 3672/9770 [43:47<1:06:26, 1.53it/s]
38%|███▊ | 3673/9770 [43:48<1:06:17, 1.53it/s]
38%|███▊ | 3674/9770 [43:48<1:06:48, 1.52it/s]
38%|███▊ | 3675/9770 [43:49<1:07:00, 1.52it/s]
38%|███▊ | 3676/9770 [43:50<1:06:10, 1.53it/s]
38%|███▊ | 3677/9770 [43:50<1:05:22, 1.55it/s]
38%|███▊ | 3678/9770 [43:51<1:05:11, 1.56it/s]
38%|███▊ | 3679/9770 [43:52<1:05:42, 1.54it/s]
38%|███▊ | 3680/9770 [43:52<1:05:23, 1.55it/s]
38%|███▊ | 3680/9770 [43:52<1:05:23, 1.55it/s]
38%|███�
+0: {'loss': 0.6814, 'grad_norm': 0.6490579846843991, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.38}
+0: � | 3681/9770 [43:53<1:05:53, 1.54it/s]
38%|███▊ | 3682/9770 [43:54<1:05:43, 1.54it/s]
38%|███▊ | 3683/9770 [43:54<1:05:28, 1.55it/s]
38%|███▊ | 3684/9770 [43:55<1:05:39, 1.54it/s]
38%|███▊ | 3685/9770 [43:55<1:06:01, 1.54it/s]
38%|███▊ | 3686/9770 [43:56<1:05:39, 1.54it/s]
38%|███▊ | 3687/9770 [43:57<1:07:07, 1.51it/s]
38%|███▊ | 3688/9770 [43:57<1:07:05, 1.51it/s]
38%|███▊ | 3689/9770 [43:58<1:06:02, 1.53it/s]
38%|███▊ | 3690/9770 [43:59<1:06:29, 1.52it/s]
38%|███▊ | 3690/9770 [43:59<1:06:29, 1.52it/s]
38%|███▊ | 3691/9770 [43:59<1:06:27, 1.52it/s]
38%|███▊ | 3692/9770 [44:00<1:05:12, 1.55it/s]
38%|███▊ | 3693/9770 [44:01<1:05:11, 1.55it/s]
38%|███▊ | 3694/9770 [44:01<1:05:24, 1.55it/s]
38%|███▊ | 3695/9770 [44:02<1:05:09, 1.5
+0: {'loss': 0.6676, 'grad_norm': 0.6203903723587489, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.38}
+0: 5it/s]
38%|███▊ | 3696/9770 [44:03<1:04:48, 1.56it/s]
38%|███▊ | 3697/9770 [44:03<1:05:37, 1.54it/s]
38%|███▊ | 3698/9770 [44:04<1:05:10, 1.55it/s]
38%|███▊ | 3699/9770 [44:05<1:05:26, 1.55it/s]
38%|███▊ | 3700/9770 [44:05<1:05:40, 1.54it/s]
38%|███▊ | 3700/9770 [44:05<1:05:40, 1.54it/s]
38%|███▊ | 3701/9770 [44:06<1:05:59, 1.53it/s]
38%|███▊ | 3702/9770 [44:07<1:05:36, 1.54it/s]
38%|███▊ | 3703/9770 [44:07<1:06:29, 1.52it/s]
38%|███▊ | 3704/9770 [44:08<1:07:48, 1.49it/s]
38%|███▊ | 3705/9770 [44:09<1:07:09, 1.51it/s]
38%|███▊ | 3706/9770 [44:09<1:07:54, 1.49it/s]
38%|███▊ | 3707/9770 [44:10<1:08:04, 1.48it/s]
38%|███▊ | 3708/9770 [44:11<1:07:32, 1.50it/s]
38%|███▊ | 3709/9770 [44:11<1:07:06, 1.51it/s]
38%|███▊ | 3710/97
+0: {'loss': 0.6882, 'grad_norm': 0.6567375739802029, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.38}
+0: {'loss': 0.6848, 'grad_norm': 0.6363369621308691, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.38}
+0: 70 [44:12<1:07:04, 1.51it/s]
38%|███▊ | 3710/9770 [44:12<1:07:04, 1.51it/s]
38%|███▊ | 3711/9770 [44:13<1:07:01, 1.51it/s]
38%|███▊ | 3712/9770 [44:13<1:07:01, 1.51it/s]
38%|███▊ | 3713/9770 [44:14<1:06:46, 1.51it/s]
38%|███▊ | 3714/9770 [44:15<1:06:39, 1.51it/s]
38%|███▊ | 3715/9770 [44:15<1:06:08, 1.53it/s]
38%|███▊ | 3716/9770 [44:16<1:06:44, 1.51it/s]
38%|███▊ | 3717/9770 [44:16<1:06:16, 1.52it/s]
38%|███▊ | 3718/9770 [44:17<1:05:42, 1.54it/s]
38%|███▊ | 3719/9770 [44:18<1:05:21, 1.54it/s]
38%|███▊ | 3720/9770 [44:18<1:04:56, 1.55it/s]
38%|███▊ | 3720/9770 [44:18<1:04:56, 1.55it/s]
38%|███▊ | 3721/9770 [44:19<1:05:47, 1.53it/s]
38%|███▊ | 3722/9770 [44:20<1:05:11, 1.55it/s]
38%|███�
+0: {'loss': 0.6794, 'grad_norm': 0.6574026161570821, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.38}
+0: � | 3723/9770 [44:20<1:05:40, 1.53it/s]
38%|███▊ | 3724/9770 [44:21<1:05:35, 1.54it/s]
38%|███▊ | 3725/9770 [44:22<1:05:11, 1.55it/s]
38%|███▊ | 3726/9770 [44:22<1:07:00, 1.50it/s]
38%|███▊ | 3727/9770 [44:23<1:07:04, 1.50it/s]
38%|███▊ | 3728/9770 [44:24<1:07:34, 1.49it/s]
38%|███▊ | 3729/9770 [44:24<1:08:25, 1.47it/s]
38%|███▊ | 3730/9770 [44:25<1:07:44, 1.49it/s]
38%|███▊ | 3730/9770 [44:25<1:07:44, 1.49it/s]
38%|███▊ | 3731/9770 [44:26<1:06:49, 1.51it/s]
38%|███▊ | 3732/9770 [44:26<1:06:01, 1.52it/s]
38%|███▊ | 3733/9770 [44:27<1:05:31, 1.54it/s]
38%|███▊ | 3734/9770 [44:28<1:05:21, 1.54it/s]
38%|███▊ | 3735/9770 [44:28<1:05:36, 1.53it/s]
38%|███▊ | 3736/9770 [44:29<1:05:57, 1.52it/s]
38%|███▊ | 3737/9770 [44:30<1:06:12, 1.5
+0: {'loss': 0.6808, 'grad_norm': 0.6319851533736518, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.38}
+0: {'loss': 0.6806, 'grad_norm': 0.6757901047661943, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.38}
+0: 2it/s]
38%|███▊ | 3738/9770 [44:30<1:05:30, 1.53it/s]
38%|███▊ | 3739/9770 [44:31<1:06:46, 1.51it/s]
38%|███▊ | 3740/9770 [44:32<1:07:19, 1.49it/s]
38%|███▊ | 3740/9770 [44:32<1:07:19, 1.49it/s]
38%|███▊ | 3741/9770 [44:32<1:06:39, 1.51it/s]
38%|███▊ | 3742/9770 [44:33<1:06:46, 1.50it/s]
38%|███▊ | 3743/9770 [44:34<1:07:29, 1.49it/s]
38%|███▊ | 3744/9770 [44:34<1:08:31, 1.47it/s]
38%|███▊ | 3745/9770 [44:35<1:07:44, 1.48it/s]
38%|███▊ | 3746/9770 [44:36<1:06:36, 1.51it/s]
38%|███▊ | 3747/9770 [44:36<1:06:37, 1.51it/s]
38%|███▊ | 3748/9770 [44:37<1:07:20, 1.49it/s]
38%|███▊ | 3749/9770 [44:38<1:08:22, 1.47it/s]
38%|███▊ | 3750/9770 [44:38<1:07:12, 1.49it/s]
38%|███▊ | 3750/9770 [44:
+0: {'loss': 0.6892, 'grad_norm': 0.6763012523075451, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.38}
+0: 38<1:07:12, 1.49it/s]
38%|███▊ | 3751/9770 [44:39<1:06:02, 1.52it/s]
38%|███▊ | 3752/9770 [44:40<1:06:07, 1.52it/s]
38%|███▊ | 3753/9770 [44:40<1:05:29, 1.53it/s]
38%|███▊ | 3754/9770 [44:41<1:05:20, 1.53it/s]
38%|███▊ | 3755/9770 [44:42<1:05:17, 1.54it/s]
38%|███▊ | 3756/9770 [44:42<1:05:39, 1.53it/s]
38%|███▊ | 3757/9770 [44:43<1:05:39, 1.53it/s]
38%|███▊ | 3758/9770 [44:44<1:05:30, 1.53it/s]
38%|███▊ | 3759/9770 [44:44<1:05:08, 1.54it/s]
38%|███▊ | 3760/9770 [44:45<1:06:07, 1.51it/s]
38%|███▊ | 3760/9770 [44:45<1:06:07, 1.51it/s]
38%|███▊ | 3761/9770 [44:46<1:05:41, 1.52it/s]
39%|███▊ | 3762/9770 [44:46<1:05:17, 1.53it/s]
39%|███▊ | 3763/9770 [44:47<1:05:46, 1.52it/s]
39%|███▊ | 3764/9770 [44:47<1:06:14, 1.51it/s]
39%|███�
+0: {'loss': 0.686, 'grad_norm': 0.6211816930529909, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.39}
+0: � | 3765/9770 [44:48<1:05:59, 1.52it/s]
39%|███▊ | 3766/9770 [44:49<1:06:17, 1.51it/s]
39%|███▊ | 3767/9770 [44:49<1:05:33, 1.53it/s]
39%|███▊ | 3768/9770 [44:50<1:05:17, 1.53it/s]
39%|███▊ | 3769/9770 [44:51<1:05:28, 1.53it/s]
39%|███▊ | 3770/9770 [44:51<1:05:22, 1.53it/s]
39%|███▊ | 3770/9770 [44:51<1:05:22, 1.53it/s]
39%|███▊ | 3771/9770 [44:52<1:05:58, 1.52it/s]
39%|███▊ | 3772/9770 [44:53<1:06:29, 1.50it/s]
39%|███▊ | 3773/9770 [44:53<1:05:37, 1.52it/s]
39%|███▊ | 3774/9770 [44:54<1:04:47, 1.54it/s]
39%|███▊ | 3775/9770 [44:55<1:05:34, 1.52it/s]
39%|███▊ | 3776/9770 [44:55<1:05:17, 1.53it/s]
39%|███▊ | 3777/9770 [44:56<1:05:26, 1.53it/s]
39%|███▊ | 3778/9770 [44:57<1:05:14, 1.53it/s]
39%|███▊ | 3779/9770 [44:57<1:05:45, 1.5
+0: {'loss': 0.6731, 'grad_norm': 0.6651068063163778, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.39}
+0: {'loss': 0.7022, 'grad_norm': 0.6543865116056973, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.39}
+0: 2it/s]
39%|███▊ | 3780/9770 [44:58<1:05:51, 1.52it/s]
39%|███▊ | 3780/9770 [44:58<1:05:51, 1.52it/s]
39%|███▊ | 3781/9770 [44:59<1:05:51, 1.52it/s]
39%|███▊ | 3782/9770 [44:59<1:06:12, 1.51it/s]
39%|███▊ | 3783/9770 [45:00<1:06:12, 1.51it/s]
39%|███▊ | 3784/9770 [45:01<1:06:47, 1.49it/s]
39%|███▊ | 3785/9770 [45:01<1:06:16, 1.50it/s]
39%|███▉ | 3786/9770 [45:02<1:06:56, 1.49it/s]
39%|███▉ | 3787/9770 [45:03<1:06:45, 1.49it/s]
39%|███▉ | 3788/9770 [45:03<1:06:20, 1.50it/s]
39%|███▉ | 3789/9770 [45:04<1:05:39, 1.52it/s]
39%|███▉ | 3790/9770 [45:05<1:05:28, 1.52it/s]
39%|███▉ | 3790/9770 [45:05<1:05:28, 1.52it/s]
39%|███▉ | 3791/9770 [45:05<1:05:34, 1.52it/s]
39%|███▉ | 3792/9770 [45:
+0: {'loss': 0.6895, 'grad_norm': 0.6670962613474667, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.39}
+0: 06<1:05:34, 1.52it/s]
39%|███▉ | 3793/9770 [45:07<1:04:56, 1.53it/s]
39%|███▉ | 3794/9770 [45:07<1:04:30, 1.54it/s]
39%|███▉ | 3795/9770 [45:08<1:05:06, 1.53it/s]
39%|███▉ | 3796/9770 [45:09<1:05:22, 1.52it/s]
39%|███▉ | 3797/9770 [45:09<1:05:15, 1.53it/s]
39%|███▉ | 3798/9770 [45:10<1:05:20, 1.52it/s]
39%|███▉ | 3799/9770 [45:11<1:05:14, 1.53it/s]
39%|███▉ | 3800/9770 [45:11<1:05:35, 1.52it/s]
39%|███▉ | 3800/9770 [45:11<1:05:35, 1.52it/s]
39%|███▉ | 3801/9770 [45:12<1:05:51, 1.51it/s]
39%|███▉ | 3802/9770 [45:12<1:04:58, 1.53it/s]
39%|███▉ | 3803/9770 [45:13<1:05:30, 1.52it/s]
39%|███▉ | 3804/9770 [45:14<1:05:33, 1.52it/s]
39%|███▉ | 3805/9770 [45:15<1:06:49, 1.49it/s]
39%|███▉ | 3806/9770 [45:15<1:07:03, 1.48it/s]
39%|███�
+0: {'loss': 0.6842, 'grad_norm': 0.6886142929521072, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.39}
+0: {'loss': 0.6742, 'grad_norm': 0.6128142071741325, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.39}
+0: � | 3807/9770 [45:16<1:06:54, 1.49it/s]
39%|███▉ | 3808/9770 [45:17<1:06:08, 1.50it/s]
39%|███▉ | 3809/9770 [45:17<1:05:26, 1.52it/s]
39%|███▉ | 3810/9770 [45:18<1:05:17, 1.52it/s]
39%|███▉ | 3810/9770 [45:18<1:05:17, 1.52it/s]
39%|███▉ | 3811/9770 [45:18<1:05:22, 1.52it/s]
39%|███▉ | 3812/9770 [45:19<1:04:55, 1.53it/s]
39%|███▉ | 3813/9770 [45:20<1:04:45, 1.53it/s]
39%|███▉ | 3814/9770 [45:20<1:04:01, 1.55it/s]
39%|███▉ | 3815/9770 [45:21<1:04:00, 1.55it/s]
39%|███▉ | 3816/9770 [45:22<1:03:39, 1.56it/s]
39%|███▉ | 3817/9770 [45:22<1:04:03, 1.55it/s]
39%|███▉ | 3818/9770 [45:23<1:04:25, 1.54it/s]
39%|███▉ | 3819/9770 [45:24<1:06:01, 1.50it/s]
39%|███▉ | 3820/9770 [45:24<1:05:39, 1.51it/s]
+0: {'loss': 0.6731, 'grad_norm': 0.608292936546312, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.39}
+0: 39%|███▉ | 3820/9770 [45:24<1:05:39, 1.51it/s]
39%|███▉ | 3821/9770 [45:25<1:06:06, 1.50it/s]
39%|███▉ | 3822/9770 [45:26<1:05:16, 1.52it/s]
39%|███▉ | 3823/9770 [45:26<1:05:40, 1.51it/s]
39%|███▉ | 3824/9770 [45:27<1:05:40, 1.51it/s]
39%|███▉ | 3825/9770 [45:28<1:06:06, 1.50it/s]
39%|███▉ | 3826/9770 [45:28<1:05:27, 1.51it/s]
39%|███▉ | 3827/9770 [45:29<1:06:17, 1.49it/s]
39%|███▉ | 3828/9770 [45:30<1:06:23, 1.49it/s]
39%|███▉ | 3829/9770 [45:30<1:05:38, 1.51it/s]
39%|███▉ | 3830/9770 [45:31<1:04:55, 1.52it/s]
39%|███▉ | 3830/9770 [45:31<1:04:55, 1.52it/s]
39%|███▉ | 3831/9770 [45:32<1:04:34, 1.53it/s]
39%|███▉ | 3832/9770 [45:32<1:05:09, 1.52it/s]
39%|███▉ | 3833/9770 [45:33<1:05:07, 1.52it/s]
39%|███▉ | 3834/9770 [45:
+0: {'loss': 0.6821, 'grad_norm': 0.6508755805607591, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.39}
+0: 34<1:05:39, 1.51it/s]
39%|███▉ | 3835/9770 [45:34<1:05:29, 1.51it/s]
39%|███▉ | 3836/9770 [45:35<1:04:37, 1.53it/s]
39%|███▉ | 3837/9770 [45:36<1:04:43, 1.53it/s]
39%|███▉ | 3838/9770 [45:36<1:04:13, 1.54it/s]
39%|███▉ | 3839/9770 [45:37<1:05:03, 1.52it/s]
39%|███▉ | 3840/9770 [45:38<1:04:20, 1.54it/s]
39%|███▉ | 3840/9770 [45:38<1:04:20, 1.54it/s]
39%|███▉ | 3841/9770 [45:38<1:04:48, 1.52it/s]
39%|███▉ | 3842/9770 [45:39<1:05:04, 1.52it/s]
39%|███▉ | 3843/9770 [45:40<1:06:04, 1.50it/s]
39%|███▉ | 3844/9770 [45:40<1:05:37, 1.51it/s]
39%|███▉ | 3845/9770 [45:41<1:05:30, 1.51it/s]
39%|███▉ | 3846/9770 [45:42<1:05:31, 1.51it/s]
39%|███▉ | 3847/9770 [45:42<1:06:43, 1.48it/s]
39%|███▉ | 3848/9770 [45:43<1:06:05, 1.49it/s]
39%|███�
+0: {'loss': 0.6905, 'grad_norm': 0.5984699872329943, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.39}
+0: {'loss': 0.6619, 'grad_norm': 0.6534331465162374, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.39}
+0: � | 3849/9770 [45:44<1:06:17, 1.49it/s]
39%|███▉ | 3850/9770 [45:44<1:06:36, 1.48it/s]
39%|███▉ | 3850/9770 [45:44<1:06:36, 1.48it/s]
39%|███▉ | 3851/9770 [45:45<1:06:11, 1.49it/s]
39%|███▉ | 3852/9770 [45:46<1:05:57, 1.50it/s]
39%|███▉ | 3853/9770 [45:46<1:05:42, 1.50it/s]
39%|███▉ | 3854/9770 [45:47<1:05:33, 1.50it/s]
39%|███▉ | 3855/9770 [45:48<1:05:36, 1.50it/s]
39%|███▉ | 3856/9770 [45:48<1:05:39, 1.50it/s]
39%|███▉ | 3857/9770 [45:49<1:05:22, 1.51it/s]
39%|███▉ | 3858/9770 [45:50<1:05:10, 1.51it/s]
39%|███▉ | 3859/9770 [45:50<1:04:53, 1.52it/s]
40%|███▉ | 3860/9770 [45:51<1:04:24, 1.53it/s]
40%|███▉ | 3860/9770 [45:51<1:04:24, 1.53it/s]
40%|███▉ | 3861/9770 [45:52<1:04:35, 1.52it/s]
+0: {'loss': 0.7051, 'grad_norm': 0.6617786171175576, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.4}
+0: 40%|███▉ | 3862/9770 [45:52<1:04:39, 1.52it/s]
40%|███▉ | 3863/9770 [45:53<1:05:27, 1.50it/s]
40%|███▉ | 3864/9770 [45:54<1:05:40, 1.50it/s]
40%|███▉ | 3865/9770 [45:54<1:05:21, 1.51it/s]
40%|███▉ | 3866/9770 [45:55<1:05:09, 1.51it/s]
40%|███▉ | 3867/9770 [45:55<1:05:16, 1.51it/s]
40%|███▉ | 3868/9770 [45:56<1:04:32, 1.52it/s]
40%|███▉ | 3869/9770 [45:57<1:05:45, 1.50it/s]
40%|███▉ | 3870/9770 [45:57<1:05:11, 1.51it/s]
40%|███▉ | 3870/9770 [45:57<1:05:11, 1.51it/s]
40%|███▉ | 3871/9770 [45:58<1:04:52, 1.52it/s]
40%|███▉ | 3872/9770 [45:59<1:06:08, 1.49it/s]
40%|███▉ | 3873/9770 [45:59<1:05:40, 1.50it/s]
40%|███▉ | 3874/9770 [46:00<1:05:25, 1.50it/s]
40%|███▉ | 3875/9770 [46:01<1:05:34, 1.50it/s]
40%|███▉ | 3876/9770 [46:
+0: {'loss': 0.6836, 'grad_norm': 0.6263478630584508, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.4}
+0: 01<1:05:17, 1.50it/s]
40%|███▉ | 3877/9770 [46:02<1:05:23, 1.50it/s]
40%|███▉ | 3878/9770 [46:03<1:05:02, 1.51it/s]
40%|███▉ | 3879/9770 [46:03<1:04:28, 1.52it/s]
40%|███▉ | 3880/9770 [46:04<1:05:28, 1.50it/s]
40%|███▉ | 3880/9770 [46:04<1:05:28, 1.50it/s]
40%|███▉ | 3881/9770 [46:05<1:10:01, 1.40it/s]
40%|███▉ | 3882/9770 [46:06<1:15:41, 1.30it/s]
40%|███▉ | 3883/9770 [46:07<1:17:00, 1.27it/s]
40%|███▉ | 3884/9770 [46:07<1:13:03, 1.34it/s]
40%|███▉ | 3885/9770 [46:08<1:10:37, 1.39it/s]
40%|███▉ | 3886/9770 [46:09<1:09:01, 1.42it/s]
40%|███▉ | 3887/9770 [46:09<1:07:34, 1.45it/s]
40%|███▉ | 3888/9770 [46:10<1:07:27, 1.45it/s]
40%|███▉ | 3889/9770 [46:11<1:06:44, 1.47it/s]
40%|███▉ | 3890/9770 [46:11<1:07:25, 1.45it/s]
+0: {'loss': 0.6979, 'grad_norm': 0.6182063540656019, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.4}
+0: {'loss': 0.6675, 'grad_norm': 0.6277899799839937, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.4}
+0:
40%|███▉ | 3890/9770 [46:11<1:07:25, 1.45it/s]
40%|███▉ | 3891/9770 [46:12<1:07:38, 1.45it/s]
40%|███▉ | 3892/9770 [46:13<1:06:35, 1.47it/s]
40%|███▉ | 3893/9770 [46:13<1:05:45, 1.49it/s]
40%|███▉ | 3894/9770 [46:14<1:05:35, 1.49it/s]
40%|███▉ | 3895/9770 [46:15<1:05:08, 1.50it/s]
40%|███▉ | 3896/9770 [46:15<1:04:37, 1.51it/s]
40%|███▉ | 3897/9770 [46:16<1:04:08, 1.53it/s]
40%|███▉ | 3898/9770 [46:17<1:04:27, 1.52it/s]
40%|███▉ | 3899/9770 [46:17<1:04:39, 1.51it/s]
40%|███▉ | 3900/9770 [46:18<1:04:06, 1.53it/s]
40%|███▉ | 3900/9770 [46:18<1:04:06, 1.53it/s]
40%|███▉ | 3901/9770 [46:19<1:04:31, 1.52it/s]
40%|███▉ | 3902/9770 [46:19<1:04:37, 1.51it/s]
40%|███▉ | 3903/9770 [46:20<1:05:57, 1.48it/s]
+0: [2025-09-02 20:42:25,266] [INFO] [axolotl.core.trainers.base._save:613] [PID:3622631] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1/checkpoint-3908[39m
+0: [2025-09-02 20:42:26,198] [INFO] [axolotl.core.trainers.base._save:662] [PID:3622631] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m
+0: {'loss': 0.6734, 'grad_norm': 0.6234948549415069, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.4}
+0: 40%|███▉ | 3904/9770 [46:21<1:06:04, 1.48it/s]
40%|███▉ | 3905/9770 [46:21<1:05:48, 1.49it/s]
40%|███▉ | 3906/9770 [46:22<1:05:57, 1.48it/s]
40%|███▉ | 3907/9770 [46:23<1:05:34, 1.49it/s]
40%|████ | 3908/9770 [46:23<1:05:11, 1.50it/s]
40%|████ | 3909/9770 [46:26<2:16:14, 1.39s/it]
40%|████ | 3910/9770 [46:27<1:54:11, 1.17s/it]
40%|████ | 3910/9770 [46:27<1:54:11, 1.17s/it]
40%|████ | 3911/9770 [46:28<1:38:59, 1.01s/it]
40%|████ | 3912/9770 [46:28<1:28:57, 1.10it/s]
40%|████ | 3913/9770 [46:29<1:21:33, 1.20it/s]
40%|████ | 3914/9770 [46:30<1:15:46, 1.29it/s]
40%|████ | 3915/9770 [46:30<1:12:17, 1.35it/s]
40%|████ | 3916/9770 [46:31<1:10:52, 1.38it/s]
40%|████ | 3917/9770 [46:32<1:08:17, 1.43it/s]
40%|████ | 3918/9770 [46:
+0: {'loss': 0.6716, 'grad_norm': 0.6635311663860688, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.4}
+0: {'loss': 0.6678, 'grad_norm': 0.668125380985513, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.4}
+0: 32<1:07:41, 1.44it/s]
40%|████ | 3919/9770 [46:33<1:13:07, 1.33it/s]
40%|████ | 3920/9770 [46:34<1:10:18, 1.39it/s]
40%|████ | 3920/9770 [46:34<1:10:18, 1.39it/s]
40%|████ | 3921/9770 [46:35<1:08:02, 1.43it/s]
40%|████ | 3922/9770 [46:35<1:06:21, 1.47it/s]
40%|████ | 3923/9770 [46:36<1:05:45, 1.48it/s]
40%|████ | 3924/9770 [46:37<1:05:06, 1.50it/s]
40%|████ | 3925/9770 [46:37<1:04:57, 1.50it/s]
40%|████ | 3926/9770 [46:38<1:04:50, 1.50it/s]
40%|████ | 3927/9770 [46:38<1:04:30, 1.51it/s]
40%|████ | 3928/9770 [46:39<1:03:59, 1.52it/s]
40%|████ | 3929/9770 [46:40<1:03:25, 1.53it/s]
40%|████ | 3930/9770 [46:40<1:04:06, 1.52it/s]
40%|████ | 3930/9770 [46:40<1:04:06, 1.52it/s]
40%|████
+0: {'loss': 0.6594, 'grad_norm': 0.5913230341114011, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.4}
+0: | 3931/9770 [46:41<1:03:51, 1.52it/s]
40%|████ | 3932/9770 [46:42<1:03:55, 1.52it/s]
40%|████ | 3933/9770 [46:42<1:03:59, 1.52it/s]
40%|████ | 3934/9770 [46:43<1:04:06, 1.52it/s]
40%|████ | 3935/9770 [46:44<1:05:04, 1.49it/s]
40%|████ | 3936/9770 [46:44<1:04:27, 1.51it/s]
40%|████ | 3937/9770 [46:45<1:07:00, 1.45it/s]
40%|████ | 3938/9770 [46:46<1:05:38, 1.48it/s]
40%|████ | 3939/9770 [46:46<1:05:17, 1.49it/s]
40%|████ | 3940/9770 [46:47<1:04:59, 1.50it/s]
40%|████ | 3940/9770 [46:47<1:04:59, 1.50it/s]
40%|████ | 3941/9770 [46:48<1:04:26, 1.51it/s]
40%|████ | 3942/9770 [46:48<1:03:22, 1.53it/s]
40%|████ | 3943/9770 [46:49<1:02:58, 1.54it/s]
40%|████ | 3944/9770 [46:50<1:03:15, 1.53it/s]
40%|████ | 3945/9770 [46:50<1:02:51, 1.54it/s]
+0: {'loss': 0.6752, 'grad_norm': 0.623502442130232, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.4}
+0: 40%|████ | 3946/9770 [46:51<1:02:28, 1.55it/s]
40%|████ | 3947/9770 [46:52<1:03:07, 1.54it/s]
40%|████ | 3948/9770 [46:52<1:02:59, 1.54it/s]
40%|████ | 3949/9770 [46:53<1:03:09, 1.54it/s]
40%|████ | 3950/9770 [46:54<1:03:15, 1.53it/s]
40%|████ | 3950/9770 [46:54<1:03:15, 1.53it/s]
40%|████ | 3951/9770 [46:54<1:03:01, 1.54it/s]
40%|████ | 3952/9770 [46:55<1:02:31, 1.55it/s]
40%|████ | 3953/9770 [46:56<1:03:08, 1.54it/s]
40%|████ | 3954/9770 [46:56<1:03:05, 1.54it/s]
40%|████ | 3955/9770 [46:57<1:03:27, 1.53it/s]
40%|████ | 3956/9770 [46:58<1:05:04, 1.49it/s]
41%|████ | 3957/9770 [46:58<1:04:35, 1.50it/s]
41%|████ | 3958/9770 [46:59<1:04:22, 1.50it/s]
41%|████ | 3959/9770 [47:00<1:04:09, 1.51it/s]
41%|████ | 3960/9770 [47:
+0: {'loss': 0.6619, 'grad_norm': 0.6097053603693122, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.41}
+0: {'loss': 0.6875, 'grad_norm': 0.6161818605147438, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.41}
+0: 00<1:03:56, 1.51it/s]
41%|████ | 3960/9770 [47:00<1:03:56, 1.51it/s]
41%|████ | 3961/9770 [47:01<1:03:32, 1.52it/s]
41%|████ | 3962/9770 [47:02<1:03:35, 1.52it/s]
41%|████ | 3963/9770 [47:02<1:03:33, 1.52it/s]
41%|████ | 3964/9770 [47:03<1:03:52, 1.52it/s]
41%|████ | 3965/9770 [47:04<1:04:10, 1.51it/s]
41%|████ | 3966/9770 [47:04<1:03:03, 1.53it/s]
41%|████ | 3967/9770 [47:05<1:03:41, 1.52it/s]
41%|████ | 3968/9770 [47:05<1:03:44, 1.52it/s]
41%|████ | 3969/9770 [47:06<1:03:39, 1.52it/s]
41%|████ | 3970/9770 [47:07<1:02:52, 1.54it/s]
41%|████ | 3970/9770 [47:07<1:02:52, 1.54it/s]
41%|████ | 3971/9770 [47:07<1:03:00, 1.53it/s]
41%|████ | 3972/9770 [47:08<1:03:51, 1.51it/s]
41%|████
+0: {'loss': 0.6978, 'grad_norm': 0.6657499661361843, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.41}
+0: | 3973/9770 [47:09<1:05:29, 1.48it/s]
41%|████ | 3974/9770 [47:09<1:04:57, 1.49it/s]
41%|████ | 3975/9770 [47:10<1:04:43, 1.49it/s]
41%|████ | 3976/9770 [47:11<1:04:09, 1.50it/s]
41%|████ | 3977/9770 [47:11<1:04:21, 1.50it/s]
41%|████ | 3978/9770 [47:12<1:04:00, 1.51it/s]
41%|████ | 3979/9770 [47:13<1:04:16, 1.50it/s]
41%|████ | 3980/9770 [47:13<1:04:07, 1.50it/s]
41%|████ | 3980/9770 [47:13<1:04:07, 1.50it/s]
41%|████ | 3981/9770 [47:14<1:03:28, 1.52it/s]
41%|████ | 3982/9770 [47:15<1:03:40, 1.51it/s]
41%|████ | 3983/9770 [47:15<1:04:11, 1.50it/s]
41%|████ | 3984/9770 [47:16<1:04:15, 1.50it/s]
41%|████ | 3985/9770 [47:17<1:04:10, 1.50it/s]
41%|████ | 3986/9770 [47:17<1:03:21, 1.52it/s]
41%|████ | 3987/9770 [47:18<1:03:13, 1.52it/s]
+0: {'loss': 0.6624, 'grad_norm': 0.646864287847045, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.41}
+0: {'loss': 0.6781, 'grad_norm': 0.6428436343225846, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.41}
+0: 41%|████ | 3988/9770 [47:19<1:03:20, 1.52it/s]
41%|████ | 3989/9770 [47:19<1:03:23, 1.52it/s]
41%|████ | 3990/9770 [47:20<1:03:20, 1.52it/s]
41%|████ | 3990/9770 [47:20<1:03:20, 1.52it/s]
41%|████ | 3991/9770 [47:21<1:05:08, 1.48it/s]
41%|████ | 3992/9770 [47:21<1:04:43, 1.49it/s]
41%|████ | 3993/9770 [47:22<1:04:17, 1.50it/s]
41%|████ | 3994/9770 [47:23<1:03:41, 1.51it/s]
41%|████ | 3995/9770 [47:23<1:04:32, 1.49it/s]
41%|████ | 3996/9770 [47:24<1:04:23, 1.49it/s]
41%|████ | 3997/9770 [47:25<1:04:11, 1.50it/s]
41%|████ | 3998/9770 [47:25<1:03:10, 1.52it/s]
41%|████ | 3999/9770 [47:26<1:03:44, 1.51it/s]
41%|████ | 4000/9770 [47:27<1:03:52, 1.51it/s]
41%|████ | 4000/9770 [47:27<1:03
+0: {'loss': 0.6933, 'grad_norm': 0.6890656609990352, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.41}
+0: :52, 1.51it/s]
41%|████ | 4001/9770 [47:27<1:04:57, 1.48it/s]
41%|████ | 4002/9770 [47:28<1:04:21, 1.49it/s]
41%|████ | 4003/9770 [47:29<1:04:48, 1.48it/s]
41%|████ | 4004/9770 [47:29<1:04:56, 1.48it/s]
41%|████ | 4005/9770 [47:30<1:04:23, 1.49it/s]
41%|████ | 4006/9770 [47:31<1:03:39, 1.51it/s]
41%|████ | 4007/9770 [47:31<1:04:11, 1.50it/s]
41%|████ | 4008/9770 [47:32<1:04:28, 1.49it/s]
41%|████ | 4009/9770 [47:33<1:05:34, 1.46it/s]
41%|████ | 4010/9770 [47:33<1:04:56, 1.48it/s]
41%|████ | 4010/9770 [47:33<1:04:56, 1.48it/s]
41%|████ | 4011/9770 [47:34<1:04:11, 1.50it/s]
41%|████ | 4012/9770 [47:35<1:04:28, 1.49it/s]
41%|████ | 4013/9770 [47:35<1:04:22, 1.49it/s]
41%|████ | 4014/9770 [47:36<1:04:17, 1.49it/s]
41%|████
+0: {'loss': 0.701, 'grad_norm': 0.6344444084921689, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.41}
+0: | 4015/9770 [47:37<1:04:13, 1.49it/s]
41%|████ | 4016/9770 [47:37<1:03:59, 1.50it/s]
41%|████ | 4017/9770 [47:38<1:03:44, 1.50it/s]
41%|████ | 4018/9770 [47:39<1:03:33, 1.51it/s]
41%|████ | 4019/9770 [47:39<1:02:53, 1.52it/s]
41%|████ | 4020/9770 [47:40<1:02:52, 1.52it/s]
41%|████ | 4020/9770 [47:40<1:02:52, 1.52it/s]
41%|████ | 4021/9770 [47:41<1:02:46, 1.53it/s]
41%|████ | 4022/9770 [47:41<1:02:12, 1.54it/s]
41%|████ | 4023/9770 [47:42<1:02:50, 1.52it/s]
41%|████ | 4024/9770 [47:43<1:03:34, 1.51it/s]
41%|████ | 4025/9770 [47:43<1:02:55, 1.52it/s]
41%|████ | 4026/9770 [47:44<1:03:20, 1.51it/s]
41%|████ | 4027/9770 [47:45<1:03:13, 1.51it/s]
41%|████ | 4028/9770 [47:45<1:03:05, 1.52it/s]
41%|████ | 4029/9770 [47:46<1:06:32, 1.44it/s]
+0: {'loss': 0.6512, 'grad_norm': 0.6000300993038492, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.41}
+0: {'loss': 0.6851, 'grad_norm': 0.7507050942994034, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.41}
+0: 41%|████ | 4030/9770 [47:47<1:06:35, 1.44it/s]
41%|████ | 4030/9770 [47:47<1:06:35, 1.44it/s]
41%|████▏ | 4031/9770 [47:47<1:05:44, 1.46it/s]
41%|████▏ | 4032/9770 [47:48<1:04:50, 1.47it/s]
41%|████▏ | 4033/9770 [47:49<1:05:26, 1.46it/s]
41%|████▏ | 4034/9770 [47:49<1:03:47, 1.50it/s]
41%|████▏ | 4035/9770 [47:50<1:03:49, 1.50it/s]
41%|████▏ | 4036/9770 [47:51<1:03:55, 1.50it/s]
41%|████▏ | 4037/9770 [47:51<1:03:19, 1.51it/s]
41%|████▏ | 4038/9770 [47:52<1:03:06, 1.51it/s]
41%|████▏ | 4039/9770 [47:53<1:02:35, 1.53it/s]
41%|████▏ | 4040/9770 [47:53<1:03:25, 1.51it/s]
41%|████▏ | 4040/9770 [47:53<1:03:25, 1.51it/s]
41%|████▏ | 4041/9770 [47:54<1:03:01, 1.52it/s]
41%|████▏
+0: {'loss': 0.7035, 'grad_norm': 0.6491833386356902, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.41}
+0: | 4042/9770 [47:55<1:02:07, 1.54it/s]
41%|████▏ | 4043/9770 [47:55<1:01:54, 1.54it/s]
41%|████▏ | 4044/9770 [47:56<1:02:26, 1.53it/s]
41%|████▏ | 4045/9770 [47:57<1:02:42, 1.52it/s]
41%|████▏ | 4046/9770 [47:57<1:03:30, 1.50it/s]
41%|████▏ | 4047/9770 [47:58<1:03:55, 1.49it/s]
41%|████▏ | 4048/9770 [47:59<1:03:35, 1.50it/s]
41%|████▏ | 4049/9770 [47:59<1:03:21, 1.51it/s]
41%|████▏ | 4050/9770 [48:00<1:02:49, 1.52it/s]
41%|████▏ | 4050/9770 [48:00<1:02:49, 1.52it/s]
41%|████▏ | 4051/9770 [48:01<1:03:06, 1.51it/s]
41%|████▏ | 4052/9770 [48:01<1:03:16, 1.51it/s]
41%|████▏ | 4053/9770 [48:02<1:02:31, 1.52it/s]
41%|████▏ | 4054/9770 [48:03<1:02:17, 1.53it/s]
42%|████▏ | 4055/9770 [48:03<1:02:27, 1.53it/s]
42%|████▏ | 4056
+0: {'loss': 0.6669, 'grad_norm': 0.6035692806902782, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.42}
+0: /9770 [48:04<1:01:47, 1.54it/s]
42%|████▏ | 4057/9770 [48:05<1:01:56, 1.54it/s]
42%|████▏ | 4058/9770 [48:05<1:02:13, 1.53it/s]
42%|████▏ | 4059/9770 [48:06<1:02:39, 1.52it/s]
42%|████▏ | 4060/9770 [48:07<1:02:34, 1.52it/s]
42%|████▏ | 4060/9770 [48:07<1:02:34, 1.52it/s]
42%|████▏ | 4061/9770 [48:07<1:02:40, 1.52it/s]
42%|████▏ | 4062/9770 [48:08<1:02:59, 1.51it/s]
42%|████▏ | 4063/9770 [48:09<1:02:47, 1.51it/s]
42%|████▏ | 4064/9770 [48:09<1:02:09, 1.53it/s]
42%|████▏ | 4065/9770 [48:10<1:01:43, 1.54it/s]
42%|████▏ | 4066/9770 [48:11<1:03:13, 1.50it/s]
42%|████▏ | 4067/9770 [48:11<1:02:36, 1.52it/s]
42%|████▏ | 4068/9770 [48:12<1:02:20, 1.52it/s]
42%|████▏ | 4069/9770 [48:13<1:02:20, 1.52it/s]
42%|████▏ | 4070/9770 [48
+0: {'loss': 0.6905, 'grad_norm': 0.6752122449540094, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.42}
+0: {'loss': 0.6832, 'grad_norm': 0.5972917322158103, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.42}
+0: :13<1:03:03, 1.51it/s]
42%|████▏ | 4070/9770 [48:13<1:03:03, 1.51it/s]
42%|████▏ | 4071/9770 [48:14<1:03:10, 1.50it/s]
42%|████▏ | 4072/9770 [48:15<1:03:26, 1.50it/s]
42%|████▏ | 4073/9770 [48:15<1:03:05, 1.50it/s]
42%|████▏ | 4074/9770 [48:16<1:02:59, 1.51it/s]
42%|████▏ | 4075/9770 [48:17<1:03:21, 1.50it/s]
42%|████▏ | 4076/9770 [48:17<1:02:55, 1.51it/s]
42%|████▏ | 4077/9770 [48:18<1:03:36, 1.49it/s]
42%|████▏ | 4078/9770 [48:19<1:02:49, 1.51it/s]
42%|████▏ | 4079/9770 [48:19<1:02:41, 1.51it/s]
42%|████▏ | 4080/9770 [48:20<1:03:24, 1.50it/s]
42%|████▏ | 4080/9770 [48:20<1:03:24, 1.50it/s]
42%|████▏ | 4081/9770 [48:21<1:03:31, 1.49it/s]
42%|████▏ | 4082/9770 [48:21<1:02:44, 1.51
+0: {'loss': 0.6759, 'grad_norm': 0.6177857483317616, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.42}
+0: it/s]
42%|████▏ | 4083/9770 [48:22<1:02:23, 1.52it/s]
42%|████▏ | 4084/9770 [48:22<1:02:20, 1.52it/s]
42%|████▏ | 4085/9770 [48:23<1:02:47, 1.51it/s]
42%|████▏ | 4086/9770 [48:24<1:01:56, 1.53it/s]
42%|████▏ | 4087/9770 [48:24<1:02:19, 1.52it/s]
42%|████▏ | 4088/9770 [48:25<1:02:25, 1.52it/s]
42%|████▏ | 4089/9770 [48:26<1:02:37, 1.51it/s]
42%|████▏ | 4090/9770 [48:26<1:01:55, 1.53it/s]
42%|████▏ | 4090/9770 [48:26<1:01:55, 1.53it/s]
42%|████▏ | 4091/9770 [48:27<1:03:06, 1.50it/s]
42%|████▏ | 4092/9770 [48:28<1:03:23, 1.49it/s]
42%|████▏ | 4093/9770 [48:28<1:02:27, 1.51it/s]
42%|████▏ | 4094/9770 [48:29<1:02:34, 1.51it/s]
42%|████▏ | 4095/9770 [48:30<1:01:49, 1.53it/s]
42%|████▏ | 4096/9770 [48:30<1:02:43, 1.51it/s]
42
+0: {'loss': 0.6757, 'grad_norm': 0.626741728603864, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.42}
+0: %|████▏ | 4097/9770 [48:31<1:02:35, 1.51it/s]
42%|████▏ | 4098/9770 [48:32<1:02:28, 1.51it/s]
42%|████▏ | 4099/9770 [48:32<1:01:23, 1.54it/s]
42%|████▏ | 4100/9770 [48:33<1:02:04, 1.52it/s]
42%|████▏ | 4100/9770 [48:33<1:02:04, 1.52it/s]
42%|████▏ | 4101/9770 [48:34<1:02:11, 1.52it/s]
42%|████▏ | 4102/9770 [48:34<1:02:03, 1.52it/s]
42%|████▏ | 4103/9770 [48:35<1:01:32, 1.53it/s]
42%|████▏ | 4104/9770 [48:36<1:01:37, 1.53it/s]
42%|████▏ | 4105/9770 [48:36<1:02:03, 1.52it/s]
42%|████▏ | 4106/9770 [48:37<1:01:09, 1.54it/s]
42%|████▏ | 4107/9770 [48:38<1:01:31, 1.53it/s]
42%|████▏ | 4108/9770 [48:38<1:02:53, 1.50it/s]
42%|████▏ | 4109/9770 [48:39<1:02:57, 1.50it/s]
42%|████▏ | 4110/9770 [48:40<1:03:10, 1.49it/s]
+0: {'loss': 0.6875, 'grad_norm': 0.629559077414286, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.42}
+0: {'loss': 0.6652, 'grad_norm': 0.5871601900952612, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.42}
+0:
42%|████▏ | 4110/9770 [48:40<1:03:10, 1.49it/s]
42%|████▏ | 4111/9770 [48:40<1:02:29, 1.51it/s]
42%|████▏ | 4112/9770 [48:41<1:02:34, 1.51it/s]
42%|████▏ | 4113/9770 [48:42<1:02:34, 1.51it/s]
42%|████▏ | 4114/9770 [48:42<1:02:16, 1.51it/s]
42%|████▏ | 4115/9770 [48:43<1:02:13, 1.51it/s]
42%|████▏ | 4116/9770 [48:44<1:01:50, 1.52it/s]
42%|████▏ | 4117/9770 [48:44<1:02:02, 1.52it/s]
42%|████▏ | 4118/9770 [48:45<1:01:42, 1.53it/s]
42%|████▏ | 4119/9770 [48:46<1:02:00, 1.52it/s]
42%|████▏ | 4120/9770 [48:46<1:02:13, 1.51it/s]
42%|████▏ | 4120/9770 [48:46<1:02:13, 1.51it/s]
42%|████▏ | 4121/9770 [48:47<1:01:48, 1.52it/s]
42%|████▏ | 4122/9770 [48:48<1:01:31, 1.53it/s]
42%|████▏ | 412
+0: {'loss': 0.6991, 'grad_norm': 0.6190659956535826, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.42}
+0: 3/9770 [48:48<1:02:02, 1.52it/s]
42%|████▏ | 4124/9770 [48:49<1:02:21, 1.51it/s]
42%|████▏ | 4125/9770 [48:50<1:01:54, 1.52it/s]
42%|████▏ | 4126/9770 [48:50<1:02:12, 1.51it/s]
42%|████▏ | 4127/9770 [48:51<1:02:43, 1.50it/s]
42%|████▏ | 4128/9770 [48:52<1:02:14, 1.51it/s]
42%|████▏ | 4129/9770 [48:52<1:02:24, 1.51it/s]
42%|████▏ | 4130/9770 [48:53<1:02:35, 1.50it/s]
42%|████▏ | 4130/9770 [48:53<1:02:35, 1.50it/s]
42%|████▏ | 4131/9770 [48:53<1:02:00, 1.52it/s]
42%|████▏ | 4132/9770 [48:54<1:02:50, 1.50it/s]
42%|████▏ | 4133/9770 [48:55<1:02:40, 1.50it/s]
42%|████▏ | 4134/9770 [48:55<1:02:10, 1.51it/s]
42%|████▏ | 4135/9770 [48:56<1:01:41, 1.52it/s]
42%|████▏ | 4136/9770 [48:57<1:02:32, 1.50it/s]
42%|████▏ | 4137/9770 [4
+0: {'loss': 0.6714, 'grad_norm': 0.5957275873796352, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.42}
+0: 8:57<1:02:10, 1.51it/s]
42%|████▏ | 4138/9770 [48:58<1:01:43, 1.52it/s]
42%|████▏ | 4139/9770 [48:59<1:01:46, 1.52it/s]
42%|████▏ | 4140/9770 [48:59<1:02:06, 1.51it/s]
42%|████▏ | 4140/9770 [48:59<1:02:06, 1.51it/s]
42%|████▏ | 4141/9770 [49:00<1:01:53, 1.52it/s]
42%|████▏ | 4142/9770 [49:01<1:01:07, 1.53it/s]
42%|████▏ | 4143/9770 [49:01<1:01:07, 1.53it/s]
42%|████▏ | 4144/9770 [49:02<1:01:10, 1.53it/s]
42%|████▏ | 4145/9770 [49:03<1:01:30, 1.52it/s]
42%|████▏ | 4146/9770 [49:03<1:01:28, 1.52it/s]
42%|████▏ | 4147/9770 [49:04<1:00:58, 1.54it/s]
42%|████▏ | 4148/9770 [49:05<1:01:03, 1.53it/s]
42%|████▏ | 4149/9770 [49:05<1:01:04, 1.53it/s]
42%|████▏ | 4150/9770 [49:06<1:00:37, 1.55it/s]
+0: {'loss': 0.6486, 'grad_norm': 0.6680324829603406, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.42}
+0: {'loss': 0.6961, 'grad_norm': 0.6027922397960519, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.43}
+0:
42%|████▏ | 4150/9770 [49:06<1:00:37, 1.55it/s]
42%|████▏ | 4151/9770 [49:07<1:00:43, 1.54it/s]
42%|████▏ | 4152/9770 [49:07<1:00:46, 1.54it/s]
43%|████▎ | 4153/9770 [49:08<1:01:17, 1.53it/s]
43%|████▎ | 4154/9770 [49:09<1:01:29, 1.52it/s]
43%|████▎ | 4155/9770 [49:09<1:07:29, 1.39it/s]
43%|████▎ | 4156/9770 [49:10<1:06:07, 1.41it/s]
43%|████▎ | 4157/9770 [49:11<1:05:04, 1.44it/s]
43%|████▎ | 4158/9770 [49:11<1:04:19, 1.45it/s]
43%|████▎ | 4159/9770 [49:12<1:03:20, 1.48it/s]
43%|████▎ | 4160/9770 [49:13<1:03:25, 1.47it/s]
43%|████▎ | 4160/9770 [49:13<1:03:25, 1.47it/s]
43%|████▎ | 4161/9770 [49:13<1:02:04, 1.51it/s]
43%|████▎ | 4162/9770 [49:14<1:02:07, 1.50it/s]
43%|████▎ | 4163/9770 [49:15<1:02:03, 1.51it/s]
4
+0: {'loss': 0.6844, 'grad_norm': 0.6293728208813729, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.43}
+0: 3%|████▎ | 4164/9770 [49:15<1:02:15, 1.50it/s]
43%|████▎ | 4165/9770 [49:16<1:01:51, 1.51it/s]
43%|████▎ | 4166/9770 [49:17<1:00:57, 1.53it/s]
43%|████▎ | 4167/9770 [49:17<1:01:12, 1.53it/s]
43%|████▎ | 4168/9770 [49:18<1:01:58, 1.51it/s]
43%|████▎ | 4169/9770 [49:19<1:01:02, 1.53it/s]
43%|████▎ | 4170/9770 [49:19<1:01:07, 1.53it/s]
43%|████▎ | 4170/9770 [49:19<1:01:07, 1.53it/s]
43%|████▎ | 4171/9770 [49:20<1:00:39, 1.54it/s]
43%|████▎ | 4172/9770 [49:21<1:01:07, 1.53it/s]
43%|████▎ | 4173/9770 [49:21<1:00:41, 1.54it/s]
43%|████▎ | 4174/9770 [49:22<1:00:39, 1.54it/s]
43%|████▎ | 4175/9770 [49:23<1:00:39, 1.54it/s]
43%|████▎ | 4176/9770 [49:23<1:00:32, 1.54it/s]
43%|████▎ | 4177/9770 [49:24<1:00:19, 1.55it/s]
43%|██
+0: {'loss': 0.6832, 'grad_norm': 0.6980571092517832, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.43}
+0: {'loss': 0.6894, 'grad_norm': 0.6906019584574479, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.43}
+0: ██▎ | 4178/9770 [49:25<1:00:47, 1.53it/s]
43%|████▎ | 4179/9770 [49:25<1:00:42, 1.54it/s]
43%|████▎ | 4180/9770 [49:26<1:01:16, 1.52it/s]
43%|████▎ | 4180/9770 [49:26<1:01:16, 1.52it/s]
43%|████▎ | 4181/9770 [49:27<1:01:48, 1.51it/s]
43%|████▎ | 4182/9770 [49:27<1:01:23, 1.52it/s]
43%|████▎ | 4183/9770 [49:28<1:01:30, 1.51it/s]
43%|████▎ | 4184/9770 [49:29<1:01:56, 1.50it/s]
43%|████▎ | 4185/9770 [49:29<1:02:54, 1.48it/s]
43%|████▎ | 4186/9770 [49:30<1:02:11, 1.50it/s]
43%|████▎ | 4187/9770 [49:31<1:01:12, 1.52it/s]
43%|████▎ | 4188/9770 [49:31<1:02:05, 1.50it/s]
43%|████▎ | 4189/9770 [49:32<1:00:42, 1.53it/s]
43%|████▎ | 4190/9770 [49:33<1:02:01, 1.50it/s]
43%|████▎ | 41
+0: {'loss': 0.6495, 'grad_norm': 0.6284243646925342, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.43}
+0: 90/9770 [49:33<1:02:01, 1.50it/s]
43%|██��█▎ | 4191/9770 [49:33<1:01:55, 1.50it/s]
43%|████▎ | 4192/9770 [49:34<1:02:04, 1.50it/s]
43%|████▎ | 4193/9770 [49:35<1:01:36, 1.51it/s]
43%|████▎ | 4194/9770 [49:35<1:01:00, 1.52it/s]
43%|████▎ | 4195/9770 [49:36<1:01:08, 1.52it/s]
43%|████▎ | 4196/9770 [49:36<1:01:36, 1.51it/s]
43%|████▎ | 4197/9770 [49:37<1:01:25, 1.51it/s]
43%|████▎ | 4198/9770 [49:38<1:01:38, 1.51it/s]
43%|████▎ | 4199/9770 [49:38<1:01:42, 1.50it/s]
43%|████▎ | 4200/9770 [49:39<1:00:51, 1.53it/s]
43%|████▎ | 4200/9770 [49:39<1:00:51, 1.53it/s]
43%|████▎ | 4201/9770 [49:40<1:01:31, 1.51it/s]
43%|████▎ | 4202/9770 [49:40<1:01:03, 1.52it/s]
43%|████▎ | 4203/9770 [49:41<1:01:18, 1.51it/s]
43%|████▎ | 4204/9770 [
+0: {'loss': 0.6508, 'grad_norm': 0.6186331595930812, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.43}
+0: 49:42<1:00:38, 1.53it/s]
43%|████▎ | 4205/9770 [49:42<1:01:01, 1.52it/s]
43%|████▎ | 4206/9770 [49:43<1:00:32, 1.53it/s]
43%|████▎ | 4207/9770 [49:44<1:00:09, 1.54it/s]
43%|████▎ | 4208/9770 [49:44<1:00:55, 1.52it/s]
43%|████▎ | 4209/9770 [49:45<1:01:27, 1.51it/s]
43%|████▎ | 4210/9770 [49:46<1:00:46, 1.52it/s]
43%|████▎ | 4210/9770 [49:46<1:00:46, 1.52it/s]
43%|████▎ | 4211/9770 [49:46<1:02:29, 1.48it/s]
43%|████▎ | 4212/9770 [49:47<1:02:07, 1.49it/s]
43%|████▎ | 4213/9770 [49:48<1:02:00, 1.49it/s]
43%|████▎ | 4214/9770 [49:48<1:01:22, 1.51it/s]
43%|████▎ | 4215/9770 [49:49<1:01:18, 1.51it/s]
43%|████▎ | 4216/9770 [49:50<1:00:57, 1.52it/s]
43%|████▎ | 4217/9770 [49:50<1:01:02, 1.52it/s]
43%|████▎ | 4218/9770 [49:51<1:0
+0: {'loss': 0.6691, 'grad_norm': 0.675031932798919, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.43}
+0: {'loss': 0.6946, 'grad_norm': 0.6179850664736041, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.43}
+0: 0:34, 1.53it/s]
43%|████▎ | 4219/9770 [49:52<1:00:29, 1.53it/s]
43%|████▎ | 4220/9770 [49:52<1:01:43, 1.50it/s]
43%|████▎ | 4220/9770 [49:52<1:01:43, 1.50it/s]
43%|████▎ | 4221/9770 [49:53<1:01:23, 1.51it/s]
43%|████▎ | 4222/9770 [49:54<1:01:59, 1.49it/s]
43%|████▎ | 4223/9770 [49:54<1:02:06, 1.49it/s]
43%|████▎ | 4224/9770 [49:55<1:01:36, 1.50it/s]
43%|████▎ | 4225/9770 [49:56<1:01:48, 1.50it/s]
43%|████▎ | 4226/9770 [49:56<1:02:37, 1.48it/s]
43%|████▎ | 4227/9770 [49:57<1:02:35, 1.48it/s]
43%|████▎ | 4228/9770 [49:58<1:03:23, 1.46it/s]
43%|████▎ | 4229/9770 [49:58<1:02:16, 1.48it/s]
43%|████▎ | 4230/9770 [49:59<1:01:49, 1.49it/s]
43%|████▎ | 4230/9770 [49:59<1:01:49, 1.49it/s]
+0: {'loss': 0.6883, 'grad_norm': 0.682126263648815, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.43}
+0: 43%|████▎ | 4231/9770 [50:00<1:02:09, 1.49it/s]
43%|████▎ | 4232/9770 [50:00<1:01:15, 1.51it/s]
43%|████▎ | 4233/9770 [50:01<1:02:34, 1.47it/s]
43%|████▎ | 4234/9770 [50:02<1:02:14, 1.48it/s]
43%|████▎ | 4235/9770 [50:02<1:01:47, 1.49it/s]
43%|████▎ | 4236/9770 [50:03<1:00:54, 1.51it/s]
43%|████▎ | 4237/9770 [50:04<1:00:55, 1.51it/s]
43%|████▎ | 4238/9770 [50:04<1:01:05, 1.51it/s]
43%|████▎ | 4239/9770 [50:05<1:01:04, 1.51it/s]
43%|████▎ | 4240/9770 [50:06<1:01:23, 1.50it/s]
43%|████▎ | 4240/9770 [50:06<1:01:23, 1.50it/s]
43%|████▎ | 4241/9770 [50:06<1:02:35, 1.47it/s]
43%|████▎ | 4242/9770 [50:07<1:03:06, 1.46it/s]
43%|████▎ | 4243/9770 [50:08<1:03:41, 1.45it/s]
43%|████▎ | 4244/9770 [50:09<1:02:38, 1.47it/s]
43%|█�
+0: {'loss': 0.6824, 'grad_norm': 0.6093098914641012, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.43}
+0: �██▎ | 4245/9770 [50:09<1:02:47, 1.47it/s]
43%|████▎ | 4246/9770 [50:10<1:04:38, 1.42it/s]
43%|████▎ | 4247/9770 [50:11<1:03:00, 1.46it/s]
43%|████▎ | 4248/9770 [50:11<1:01:39, 1.49it/s]
43%|████▎ | 4249/9770 [50:12<1:01:55, 1.49it/s]
44%|████▎ | 4250/9770 [50:13<1:02:42, 1.47it/s]
44%|████▎ | 4250/9770 [50:13<1:02:42, 1.47it/s]
44%|████▎ | 4251/9770 [50:14<1:11:11, 1.29it/s]
44%|████▎ | 4252/9770 [50:14<1:13:34, 1.25it/s]
44%|████▎ | 4253/9770 [50:16<1:24:33, 1.09it/s]
44%|████▎ | 4254/9770 [50:17<1:28:16, 1.04it/s]
44%|████▎ | 4255/9770 [50:18<1:25:39, 1.07it/s]
44%|████▎ | 4256/9770 [50:18<1:17:51, 1.18it/s]
44%|████▎ | 4257/9770 [50:20<1:33:23, 1.02s/it]
44%|████▎ | 4258/9770 [50:21<1:30:04, 1.02it/s]
44%|████�
+0: {'loss': 0.6668, 'grad_norm': 0.6636370947418444, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.44}
+0: {'loss': 0.6579, 'grad_norm': 0.6270054501030375, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.44}
+0: � | 4259/9770 [50:22<1:33:33, 1.02s/it]
44%|████▎ | 4260/9770 [50:23<1:42:56, 1.12s/it]
44%|████▎ | 4260/9770 [50:23<1:42:56, 1.12s/it]
44%|████▎ | 4261/9770 [50:24<1:31:14, 1.01it/s]
44%|████▎ | 4262/9770 [50:24<1:21:35, 1.13it/s]
44%|████▎ | 4263/9770 [50:25<1:15:38, 1.21it/s]
44%|████▎ | 4264/9770 [50:26<1:11:15, 1.29it/s]
44%|████▎ | 4265/9770 [50:26<1:07:11, 1.37it/s]
44%|████▎ | 4266/9770 [50:27<1:04:41, 1.42it/s]
44%|████▎ | 4267/9770 [50:28<1:04:14, 1.43it/s]
44%|████▎ | 4268/9770 [50:28<1:03:59, 1.43it/s]
44%|████▎ | 4269/9770 [50:29<1:02:19, 1.47it/s]
44%|████▎ | 4270/9770 [50:30<1:01:47, 1.48it/s]
44%|████▎ | 4270/9770 [50:30<1:01:47, 1.48it/s]
44%|████▎ | 4271/9770
+0: {'loss': 0.6578, 'grad_norm': 0.6622512603119264, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.16, 'epoch': 0.44}
+0: [50:30<1:01:05, 1.50it/s]
44%|████▎ | 4272/9770 [50:31<1:01:01, 1.50it/s]
44%|████▎ | 4273/9770 [50:32<59:49, 1.53it/s]
44%|████▎ | 4274/9770 [50:32<59:29, 1.54it/s]
44%|████▍ | 4275/9770 [50:33<59:45, 1.53it/s]
44%|████▍ | 4276/9770 [50:34<1:00:30, 1.51it/s]
44%|████▍ | 4277/9770 [50:35<1:08:33, 1.34it/s]
44%|████▍ | 4278/9770 [50:35<1:12:41, 1.26it/s]
44%|████▍ | 4279/9770 [50:36<1:11:49, 1.27it/s]
44%|████▍ | 4280/9770 [50:37<1:12:32, 1.26it/s]
44%|████▍ | 4280/9770 [50:37<1:12:32, 1.26it/s]
44%|████▍ | 4281/9770 [50:38<1:14:02, 1.24it/s]
44%|████▍ | 4282/9770 [50:39<1:15:37, 1.21it/s]
44%|████▍ | 4283/9770 [50:40<1:19:40, 1.15it/s]
44%|████▍ | 4284/9770 [50:41<1:22:35, 1.11it/s]
44%|████▍ | 4285/9770 [50:42<1:22:4
+0: {'loss': 0.6813, 'grad_norm': 0.6334268703476365, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.44}
+0: 5, 1.10it/s]
44%|████▍ | 4286/9770 [50:42<1:17:38, 1.18it/s]
44%|████▍ | 4287/9770 [50:43<1:20:53, 1.13it/s]
44%|████▍ | 4288/9770 [50:44<1:19:21, 1.15it/s]
44%|████▍ | 4289/9770 [50:47<2:04:44, 1.37s/it]
44%|████▍ | 4290/9770 [50:49<2:23:13, 1.57s/it]
44%|████▍ | 4290/9770 [50:49<2:23:13, 1.57s/it]
44%|████▍ | 4291/9770 [50:49<2:02:33, 1.34s/it]
44%|████▍ | 4292/9770 [50:50<1:50:55, 1.21s/it]
44%|████▍ | 4293/9770 [50:52<1:49:47, 1.20s/it]
44%|████▍ | 4294/9770 [50:53<1:42:54, 1.13s/it]
44%|████▍ | 4295/9770 [50:54<1:43:31, 1.13s/it]
44%|████▍ | 4296/9770 [50:54<1:35:27, 1.05s/it]
44%|████▍ | 4297/9770 [50:56<2:00:06, 1.32s/it]
44%|████▍ | 4298/9770 [50:58<1:58:32, 1.30s/it]
44%|████▍ | 4299/9770 [50:59<1:47:05, 1.17s
+0: {'loss': 0.6973, 'grad_norm': 0.6113929859192986, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.44}
+0: {'loss': 0.648, 'grad_norm': 0.6182785212815289, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.44}
+0: /it]
44%|████▍ | 4300/9770 [50:59<1:36:38, 1.06s/it]
44%|████▍ | 4300/9770 [50:59<1:36:38, 1.06s/it]
44%|████▍ | 4301/9770 [51:00<1:27:39, 1.04it/s]
44%|████▍ | 4302/9770 [51:01<1:26:00, 1.06it/s]
44%|████▍ | 4303/9770 [51:02<1:22:33, 1.10it/s]
44%|████▍ | 4304/9770 [51:03<1:17:47, 1.17it/s]
44%|████▍ | 4305/9770 [51:03<1:15:30, 1.21it/s]
44%|████▍ | 4306/9770 [51:04<1:20:09, 1.14it/s]
44%|████▍ | 4307/9770 [51:06<1:43:55, 1.14s/it]
44%|████▍ | 4308/9770 [51:07<1:39:20, 1.09s/it]
44%|████▍ | 4309/9770 [51:08<1:32:15, 1.01s/it]
44%|████▍ | 4310/9770 [51:09<1:27:33, 1.04it/s]
44%|████▍ | 4310/9770 [51:09<1:27:33, 1.04it/s]
44%|████▍ | 4311/9770 [51:10<1:24:25, 1.08it/s]
44%|██�
+0: {'loss': 0.6631, 'grad_norm': 0.5824695037322758, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.44}
+0: �█▍ | 4312/9770 [51:10<1:17:26, 1.17it/s]
44%|████▍ | 4313/9770 [51:11<1:20:43, 1.13it/s]
44%|████▍ | 4314/9770 [51:12<1:17:32, 1.17it/s]
44%|████▍ | 4315/9770 [51:13<1:15:38, 1.20it/s]
44%|████▍ | 4316/9770 [51:14<1:16:22, 1.19it/s]
44%|████▍ | 4317/9770 [51:15<1:28:32, 1.03it/s]
44%|████▍ | 4318/9770 [51:16<1:33:06, 1.02s/it]
44%|████▍ | 4319/9770 [51:17<1:29:53, 1.01it/s]
44%|████▍ | 4320/9770 [51:18<1:22:25, 1.10it/s]
44%|████▍ | 4320/9770 [51:18<1:22:25, 1.10it/s]
44%|████▍ | 4321/9770 [51:19<1:24:26, 1.08it/s]
44%|████▍ | 4322/9770 [51:20<1:21:47, 1.11it/s]
44%|████▍ | 4323/9770 [51:20<1:18:27, 1.16it/s]
44%|████▍ | 4324/9770 [51:21<1:15:26, 1.20it/s]
44%|████▍ | 4325/9770 [51:22<1:16:29, 1.19it/s]
44%|████▍
+0: {'loss': 0.6894, 'grad_norm': 0.6206395447552205, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.44}
+0: | 4326/9770 [51:23<1:16:19, 1.19it/s]
44%|████▍ | 4327/9770 [51:24<1:16:28, 1.19it/s]
44%|████▍ | 4328/9770 [51:24<1:16:55, 1.18it/s]
44%|████▍ | 4329/9770 [51:25<1:16:41, 1.18it/s]
44%|████▍ | 4330/9770 [51:26<1:14:21, 1.22it/s]
44%|████▍ | 4330/9770 [51:26<1:14:21, 1.22it/s]
44%|████▍ | 4331/9770 [51:27<1:10:22, 1.29it/s]
44%|████▍ | 4332/9770 [51:28<1:10:54, 1.28it/s]
44%|████▍ | 4333/9770 [51:28<1:09:21, 1.31it/s]
44%|████▍ | 4334/9770 [51:29<1:08:14, 1.33it/s]
44%|████▍ | 4335/9770 [51:30<1:05:40, 1.38it/s]
44%|████▍ | 4336/9770 [51:30<1:03:59, 1.42it/s]
44%|████▍ | 4337/9770 [51:31<1:02:42, 1.44it/s]
44%|████▍ | 4338/9770 [51:32<1:03:40, 1.42it/s]
44%|████▍ | 4339/9770 [51:32<1:04:00, 1.41it/s]
44%|████▍ | 4340
+0: {'loss': 0.6739, 'grad_norm': 0.6925600692794549, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.44}
+0: {'loss': 0.6981, 'grad_norm': 0.6618428618174739, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.45}
+0: /9770 [51:33<1:03:51, 1.42it/s]
44%|████▍ | 4340/9770 [51:33<1:03:51, 1.42it/s]
44%|████▍ | 4341/9770 [51:34<1:04:28, 1.40it/s]
44%|████▍ | 4342/9770 [51:35<1:03:38, 1.42it/s]
44%|████▍ | 4343/9770 [51:35<1:03:41, 1.42it/s]
44%|████▍ | 4344/9770 [51:36<1:02:45, 1.44it/s]
44%|████▍ | 4345/9770 [51:37<1:01:03, 1.48it/s]
44%|████▍ | 4346/9770 [51:37<1:01:26, 1.47it/s]
44%|████▍ | 4347/9770 [51:38<1:00:29, 1.49it/s]
45%|████▍ | 4348/9770 [51:39<1:02:42, 1.44it/s]
45%|████▍ | 4349/9770 [51:39<1:03:39, 1.42it/s]
45%|████▍ | 4350/9770 [51:40<1:01:55, 1.46it/s]
45%|████▍ | 4350/9770 [51:40<1:01:55, 1.46it/s]
45%|████▍ | 4351/9770 [51:41<1:01:13, 1.48it/s]
45%|████▍ | 4352/9770 [51:41<1:01:
+0: {'loss': 0.6851, 'grad_norm': 0.5816948319590163, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.45}
+0: 53, 1.46it/s]
45%|████▍ | 4353/9770 [51:42<1:01:21, 1.47it/s]
45%|████▍ | 4354/9770 [51:43<1:00:57, 1.48it/s]
45%|████▍ | 4355/9770 [51:43<1:00:09, 1.50it/s]
45%|████▍ | 4356/9770 [51:44<1:00:20, 1.50it/s]
45%|████▍ | 4357/9770 [51:45<1:02:10, 1.45it/s]
45%|████▍ | 4358/9770 [51:46<1:05:54, 1.37it/s]
45%|████▍ | 4359/9770 [51:46<1:09:35, 1.30it/s]
45%|████▍ | 4360/9770 [51:47<1:06:59, 1.35it/s]
45%|████▍ | 4360/9770 [51:47<1:06:59, 1.35it/s]
45%|████▍ | 4361/9770 [51:48<1:06:44, 1.35it/s]
45%|████▍ | 4362/9770 [51:49<1:06:10, 1.36it/s]
45%|████▍ | 4363/9770 [51:49<1:04:29, 1.40it/s]
45%|████▍ | 4364/9770 [51:50<1:06:34, 1.35it/s]
45%|████▍ | 4365/9770 [51:51<1:14:02, 1.22it/s]
45%|████▍ | 4366/9770 [51:52<1:14:34, 1.21
+0: {'loss': 0.6655, 'grad_norm': 0.6508653574816292, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.45}
+0: it/s]
45%|████▍ | 4367/9770 [51:53<1:12:07, 1.25it/s]
45%|████▍ | 4368/9770 [51:53<1:07:56, 1.33it/s]
45%|████▍ | 4369/9770 [51:54<1:04:46, 1.39it/s]
45%|████▍ | 4370/9770 [51:55<1:03:24, 1.42it/s]
45%|████▍ | 4370/9770 [51:55<1:03:24, 1.42it/s]
45%|████▍ | 4371/9770 [51:55<1:03:37, 1.41it/s]
45%|████▍ | 4372/9770 [51:56<1:02:47, 1.43it/s]
45%|████▍ | 4373/9770 [51:57<1:02:21, 1.44it/s]
45%|████▍ | 4374/9770 [51:57<1:01:08, 1.47it/s]
45%|████▍ | 4375/9770 [51:58<1:00:40, 1.48it/s]
45%|████▍ | 4376/9770 [51:59<1:00:22, 1.49it/s]
45%|████▍ | 4377/9770 [51:59<1:00:00, 1.50it/s]
45%|████▍ | 4378/9770 [52:00<59:33, 1.51it/s]
45%|████▍ | 4379/9770 [52:01<59:25, 1.51it/s]
45%|████▍ | 4380/9770 [52:01<58:47, 1.53it/s]
+0: {'loss': 0.6619, 'grad_norm': 0.5965585914970666, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.45}
+0: {'loss': 0.6798, 'grad_norm': 0.6020787042821834, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.45}
+0:
45%|████▍ | 4380/9770 [52:01<58:47, 1.53it/s]
45%|████▍ | 4381/9770 [52:02<58:37, 1.53it/s]
45%|████▍ | 4382/9770 [52:03<58:40, 1.53it/s]
45%|████▍ | 4383/9770 [52:03<1:01:25, 1.46it/s]
45%|████▍ | 4384/9770 [52:04<1:02:18, 1.44it/s]
45%|████▍ | 4385/9770 [52:05<1:01:57, 1.45it/s]
45%|████▍ | 4386/9770 [52:05<1:01:20, 1.46it/s]
45%|████▍ | 4387/9770 [52:06<1:00:40, 1.48it/s]
45%|████▍ | 4388/9770 [52:07<1:00:16, 1.49it/s]
45%|████▍ | 4389/9770 [52:07<59:26, 1.51it/s]
45%|████▍ | 4390/9770 [52:08<59:46, 1.50it/s]
45%|████▍ | 4390/9770 [52:08<59:46, 1.50it/s]
45%|████▍ | 4391/9770 [52:09<1:00:10, 1.49it/s]
45%|████▍ | 4392/9770 [52:09<59:27, 1.51it/s]
45%|████▍ | 4393/9770 [5
+0: {'loss': 0.6616, 'grad_norm': 0.6641873635664651, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.45}
+0: 2:10<59:18, 1.51it/s]
45%|████▍ | 4394/9770 [52:11<59:12, 1.51it/s]
45%|████▍ | 4395/9770 [52:11<1:00:09, 1.49it/s]
45%|████▍ | 4396/9770 [52:12<1:00:37, 1.48it/s]
45%|████▌ | 4397/9770 [52:13<1:00:18, 1.48it/s]
45%|████▌ | 4398/9770 [52:13<1:00:11, 1.49it/s]
45%|████▌ | 4399/9770 [52:14<1:00:00, 1.49it/s]
45%|████▌ | 4400/9770 [52:15<58:44, 1.52it/s]
45%|████▌ | 4400/9770 [52:15<58:44, 1.52it/s]
45%|████▌ | 4401/9770 [52:15<59:03, 1.52it/s]
45%|████▌ | 4402/9770 [52:16<1:00:15, 1.48it/s]
45%|████▌ | 4403/9770 [52:17<1:00:33, 1.48it/s]
45%|████▌ | 4404/9770 [52:17<59:59, 1.49it/s]
45%|████��� | 4405/9770 [52:18<59:41, 1.50it/s]
45%|████▌ | 4406/9770 [52:19<1:00:18, 1.48it/s]
45%|████▌ | 4407/9770 [52:19<1:00:19, 1.48it
+0: {'loss': 0.6962, 'grad_norm': 0.6255868311306785, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.45}
+0: {'loss': 0.6611, 'grad_norm': 0.6474186015364907, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.45}
+0: /s]
45%|████▌ | 4408/9770 [52:20<59:28, 1.50it/s]
45%|████▌ | 4409/9770 [52:21<59:28, 1.50it/s]
45%|████▌ | 4410/9770 [52:21<59:01, 1.51it/s]
45%|████▌ | 4410/9770 [52:21<59:01, 1.51it/s]
45%|████▌ | 4411/9770 [52:22<58:59, 1.51it/s]
45%|████▌ | 4412/9770 [52:23<59:16, 1.51it/s]
45%|████▌ | 4413/9770 [52:23<59:14, 1.51it/s]
45%|████▌ | 4414/9770 [52:24<59:35, 1.50it/s]
45%|████▌ | 4415/9770 [52:25<59:04, 1.51it/s]
45%|████▌ | 4416/9770 [52:25<58:23, 1.53it/s]
45%|████▌ | 4417/9770 [52:26<58:58, 1.51it/s]
45%|████▌ | 4418/9770 [52:27<58:28, 1.53it/s]
45%|████▌ | 4419/9770 [52:27<58:36, 1.52it/s]
45%|████▌ | 4420/9770 [52:28<58:31, 1.52it/s]
45%|████▌ | 4420/9770 [52:28<
+0: {'loss': 0.6715, 'grad_norm': 0.6228236916669737, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.45}
+0: 58:31, 1.52it/s]
45%|████▌ | 4421/9770 [52:29<58:17, 1.53it/s]
45%|████▌ | 4422/9770 [52:29<58:32, 1.52it/s]
45%|████▌ | 4423/9770 [52:30<59:04, 1.51it/s]
45%|████▌ | 4424/9770 [52:31<58:31, 1.52it/s]
45%|████▌ | 4425/9770 [52:31<57:58, 1.54it/s]
45%|████▌ | 4426/9770 [52:32<57:32, 1.55it/s]
45%|████▌ | 4427/9770 [52:33<58:57, 1.51it/s]
45%|████▌ | 4428/9770 [52:33<58:50, 1.51it/s]
45%|████▌ | 4429/9770 [52:34<58:31, 1.52it/s]
45%|████▌ | 4430/9770 [52:34<58:26, 1.52it/s]
45%|████▌ | 4430/9770 [52:35<58:26, 1.52it/s]
45%|████▌ | 4431/9770 [52:35<59:01, 1.51it/s]
45%|████▌ | 4432/9770 [52:36<58:48, 1.51it/s]
45%|████▌ | 4433/9770 [52:37<59:01, 1.51it/s]
45%|████▌ | 4434/9770 [52:37<59:30, 1.49it/s]
45%|████▌
+0: {'loss': 0.6592, 'grad_norm': 0.6441811924703615, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.45}
+0: | 4435/9770 [52:38<59:36, 1.49it/s]
45%|████▌ | 4436/9770 [52:39<1:00:22, 1.47it/s]
45%|████▌ | 4437/9770 [52:39<1:00:39, 1.47it/s]
45%|████▌ | 4438/9770 [52:40<1:00:09, 1.48it/s]
45%|████▌ | 4439/9770 [52:41<59:53, 1.48it/s]
45%|████▌ | 4440/9770 [52:41<59:14, 1.50it/s]
45%|████▌ | 4440/9770 [52:41<59:14, 1.50it/s]
45%|████▌ | 4441/9770 [52:42<59:05, 1.50it/s]
45%|████▌ | 4442/9770 [52:43<58:14, 1.52it/s]
45%|████▌ | 4443/9770 [52:43<58:57, 1.51it/s]
45%|████▌ | 4444/9770 [52:44<58:31, 1.52it/s]
45%|████▌ | 4445/9770 [52:45<59:46, 1.48it/s]
46%|████▌ | 4446/9770 [52:45<59:58, 1.48it/s]
46%|████▌ | 4447/9770 [52:46<59:48, 1.48it/s]
46%|████▌ | 4448/9770 [52:47<58:30, 1.52it/s]
46%|████▌ | 4449/9770 [52:47<58:38, 1.51
+0: {'loss': 0.6899, 'grad_norm': 0.6290440248726958, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.46}
+0: {'loss': 0.6848, 'grad_norm': 0.6456153704043983, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.46}
+0: it/s]
46%|████▌ | 4450/9770 [52:48<58:40, 1.51it/s]
46%|████▌ | 4450/9770 [52:48<58:40, 1.51it/s]
46%|████▌ | 4451/9770 [52:49<58:38, 1.51it/s]
46%|████▌ | 4452/9770 [52:49<58:50, 1.51it/s]
46%|████▌ | 4453/9770 [52:50<58:17, 1.52it/s]
46%|████▌ | 4454/9770 [52:50<57:58, 1.53it/s]
46%|████▌ | 4455/9770 [52:51<58:03, 1.53it/s]
46%|████▌ | 4456/9770 [52:52<57:58, 1.53it/s]
46%|████▌ | 4457/9770 [52:52<58:11, 1.52it/s]
46%|████▌ | 4458/9770 [52:53<57:39, 1.54it/s]
46%|████▌ | 4459/9770 [52:54<57:38, 1.54it/s]
46%|████▌ | 4460/9770 [52:54<57:55, 1.53it/s]
46%|████▌ | 4460/9770 [52:54<57:55, 1.53it/s]
46%|████▌ | 4461/9770 [52:55<58:21, 1.52it/s]
46%|████▌ | 4462/9770 [52:56<
+0: {'loss': 0.6663, 'grad_norm': 0.6714969665725109, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.46}
+0: 58:34, 1.51it/s]
46%|████▌ | 4463/9770 [52:56<58:56, 1.50it/s]
46%|████▌ | 4464/9770 [52:57<58:20, 1.52it/s]
46%|████▌ | 4465/9770 [52:58<57:51, 1.53it/s]
46%|████▌ | 4466/9770 [52:58<57:24, 1.54it/s]
46%|████▌ | 4467/9770 [52:59<57:43, 1.53it/s]
46%|████▌ | 4468/9770 [53:00<57:45, 1.53it/s]
46%|████▌ | 4469/9770 [53:00<57:02, 1.55it/s]
46%|████▌ | 4470/9770 [53:01<57:09, 1.55it/s]
46%|████▌ | 4470/9770 [53:01<57:09, 1.55it/s]
46%|████▌ | 4471/9770 [53:02<57:23, 1.54it/s]
46%|████▌ | 4472/9770 [53:02<57:36, 1.53it/s]
46%|████▌ | 4473/9770 [53:03<57:34, 1.53it/s]
46%|████▌ | 4474/9770 [53:04<57:26, 1.54it/s]
46%|████▌ | 4475/9770 [53:04<57:24, 1.54it/s]
46%|████▌ | 4476/9770 [53:05<58:20, 1.51it/s]
46%|████▌
+0: {'loss': 0.6888, 'grad_norm': 0.648316454573057, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.46}
+0: {'loss': 0.6643, 'grad_norm': 0.7099196805058802, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.46}
+0: | 4477/9770 [53:06<58:30, 1.51it/s]
46%|████▌ | 4478/9770 [53:06<58:56, 1.50it/s]
46%|████▌ | 4479/9770 [53:07<58:34, 1.51it/s]
46%|████▌ | 4480/9770 [53:08<58:18, 1.51it/s]
46%|████▌ | 4480/9770 [53:08<58:18, 1.51it/s]
46%|████▌ | 4481/9770 [53:08<58:19, 1.51it/s]
46%|████▌ | 4482/9770 [53:09<58:21, 1.51it/s]
46%|████▌ | 4483/9770 [53:10<58:35, 1.50it/s]
46%|████▌ | 4484/9770 [53:10<59:29, 1.48it/s]
46%|████▌ | 4485/9770 [53:11<58:45, 1.50it/s]
46%|████▌ | 4486/9770 [53:12<59:03, 1.49it/s]
46%|████▌ | 4487/9770 [53:12<58:28, 1.51it/s]
46%|████▌ | 4488/9770 [53:13<58:27, 1.51it/s]
46%|████▌ | 4489/9770 [53:14<58:29, 1.50it/s]
46%|████▌ | 4490/9770 [53:14<58:22, 1.51it/s]
46%|██
+0: {'loss': 0.6844, 'grad_norm': 0.6431119880934836, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.46}
+0: ██▌ | 4490/9770 [53:14<58:22, 1.51it/s]
46%|████▌ | 4491/9770 [53:15<57:59, 1.52it/s]
46%|████▌ | 4492/9770 [53:16<58:21, 1.51it/s]
46%|████▌ | 4493/9770 [53:16<58:21, 1.51it/s]
46%|████▌ | 4494/9770 [53:17<58:32, 1.50it/s]
46%|████▌ | 4495/9770 [53:18<58:25, 1.50it/s]
46%|████▌ | 4496/9770 [53:18<58:08, 1.51it/s]
46%|████▌ | 4497/9770 [53:19<58:13, 1.51it/s]
46%|████▌ | 4498/9770 [53:20<58:34, 1.50it/s]
46%|████▌ | 4499/9770 [53:20<58:30, 1.50it/s]
46%|████▌ | 4500/9770 [53:21<57:59, 1.51it/s]
46%|████▌ | 4500/9770 [53:21<57:59, 1.51it/s]
46%|████▌ | 4501/9770 [53:22<1:01:13, 1.43it/s]
46%|████▌ | 4502/9770 [53:22<1:00:12, 1.46it/s]
46%|████▌ | 4503/9770 [53:23<59:36, 1.47it/s]
46%|████▌ | 4504/9770 [53:24<59
+0: {'loss': 0.6672, 'grad_norm': 0.6170072839522381, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.46}
+0: :31, 1.47it/s]
46%|████▌ | 4505/9770 [53:24<58:24, 1.50it/s]
46%|████▌ | 4506/9770 [53:25<58:13, 1.51it/s]
46%|████▌ | 4507/9770 [53:26<58:12, 1.51it/s]
46%|████▌ | 4508/9770 [53:26<58:05, 1.51it/s]
46%|████▌ | 4509/9770 [53:27<58:18, 1.50it/s]
46%|████▌ | 4510/9770 [53:28<57:41, 1.52it/s]
46%|████▌ | 4510/9770 [53:28<57:41, 1.52it/s]
46%|████▌ | 4511/9770 [53:28<58:00, 1.51it/s]
46%|████▌ | 4512/9770 [53:29<57:40, 1.52it/s]
46%|████▌ | 4513/9770 [53:30<58:11, 1.51it/s]
46%|████▌ | 4514/9770 [53:30<57:59, 1.51it/s]
46%|████▌ | 4515/9770 [53:31<57:20, 1.53it/s]
46%|████▌ | 4516/9770 [53:32<58:47, 1.49it/s]
46%|████▌ | 4517/9770 [53:32<58:06, 1.51it/s]
46%|████▌ | 4518/9770 [53:33<58:20, 1.50it/s]
46%|████▋
+0: {'loss': 0.6677, 'grad_norm': 0.6134101429861566, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.46}
+0: {'loss': 0.6689, 'grad_norm': 0.627968441743086, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.46}
+0: | 4519/9770 [53:34<58:21, 1.50it/s]
46%|████▋ | 4520/9770 [53:34<58:22, 1.50it/s]
46%|████▋ | 4520/9770 [53:34<58:22, 1.50it/s]
46%|████▋ | 4521/9770 [53:35<57:36, 1.52it/s]
46%|████▋ | 4522/9770 [53:36<57:36, 1.52it/s]
46%|████▋ | 4523/9770 [53:36<57:18, 1.53it/s]
46%|████▋ | 4524/9770 [53:37<57:39, 1.52it/s]
46%|████▋ | 4525/9770 [53:37<57:50, 1.51it/s]
46%|████▋ | 4526/9770 [53:38<57:35, 1.52it/s]
46%|████▋ | 4527/9770 [53:39<57:34, 1.52it/s]
46%|████▋ | 4528/9770 [53:39<57:28, 1.52it/s]
46%|████▋ | 4529/9770 [53:40<56:43, 1.54it/s]
46%|████▋ | 4530/9770 [53:41<57:05, 1.53it/s]
46%|████▋ | 4530/9770 [53:41<57:05, 1.53it/s]
46%|████▋ | 4531/9770 [53:41<57:22, 1.52it/s]
46%|██�
+0: {'loss': 0.6532, 'grad_norm': 0.6286045234738764, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.46}
+0: �█▋ | 4532/9770 [53:42<57:19, 1.52it/s]
46%|████▋ | 4533/9770 [53:43<56:47, 1.54it/s]
46%|████▋ | 4534/9770 [53:43<57:04, 1.53it/s]
46%|████▋ | 4535/9770 [53:44<57:28, 1.52it/s]
46%|████▋ | 4536/9770 [53:45<57:37, 1.51it/s]
46%|████▋ | 4537/9770 [53:45<57:00, 1.53it/s]
46%|████▋ | 4538/9770 [53:46<57:16, 1.52it/s]
46%|████▋ | 4539/9770 [53:47<57:13, 1.52it/s]
46%|████▋ | 4540/9770 [53:47<57:22, 1.52it/s]
46%|████▋ | 4540/9770 [53:47<57:22, 1.52it/s]
46%|████▋ | 4541/9770 [53:48<57:34, 1.51it/s]
46%|████▋ | 4542/9770 [53:49<57:53, 1.51it/s]
46%|████▋ | 4543/9770 [53:49<57:42, 1.51it/s]
47%|████▋ | 4544/9770 [53:50<57:12, 1.52it/s]
47%|████▋ | 4545/9770 [53:51<57:07, 1.52it/s]
47%|████▋ | 4546/9770 [53:51<57:32, 1.
+0: {'loss': 0.6694, 'grad_norm': 0.6455909292966766, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.47}
+0: 51it/s]
47%|████▋ | 4547/9770 [53:52<57:46, 1.51it/s]
47%|████▋ | 4548/9770 [53:53<58:02, 1.50it/s]
47%|████▋ | 4549/9770 [53:53<57:38, 1.51it/s]
47%|████▋ | 4550/9770 [53:54<57:47, 1.51it/s]
47%|████▋ | 4550/9770 [53:54<57:47, 1.51it/s]
47%|████▋ | 4551/9770 [53:55<57:41, 1.51it/s]
47%|████▋ | 4552/9770 [53:55<57:35, 1.51it/s]
47%|████▋ | 4553/9770 [53:56<57:10, 1.52it/s]
47%|████▋ | 4554/9770 [53:57<57:25, 1.51it/s]
47%|████▋ | 4555/9770 [53:57<56:42, 1.53it/s]
47%|████▋ | 4556/9770 [53:58<57:06, 1.52it/s]
47%|████▋ | 4557/9770 [53:59<57:51, 1.50it/s]
47%|████▋ | 4558/9770 [53:59<57:45, 1.50it/s]
47%|████▋ | 4559/9770 [54:00<57:21, 1.51it/s]
47%|████▋ | 4560/9770 [54:01<57:44, 1.50it/s]
+0: {'loss': 0.6683, 'grad_norm': 0.6539358209451853, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.47}
+0: {'loss': 0.6696, 'grad_norm': 0.6053304847937633, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.47}
+0:
47%|████▋ | 4560/9770 [54:01<57:44, 1.50it/s]
47%|████▋ | 4561/9770 [54:01<57:41, 1.50it/s]
47%|████▋ | 4562/9770 [54:02<57:42, 1.50it/s]
47%|████▋ | 4563/9770 [54:03<58:20, 1.49it/s]
47%|████▋ | 4564/9770 [54:03<58:17, 1.49it/s]
47%|████▋ | 4565/9770 [54:04<58:04, 1.49it/s]
47%|████▋ | 4566/9770 [54:05<58:20, 1.49it/s]
47%|████▋ | 4567/9770 [54:05<58:26, 1.48it/s]
47%|████▋ | 4568/9770 [54:06<58:50, 1.47it/s]
47%|████▋ | 4569/9770 [54:07<58:09, 1.49it/s]
47%|████▋ | 4570/9770 [54:07<57:26, 1.51it/s]
47%|████▋ | 4570/9770 [54:07<57:26, 1.51it/s]
47%|████▋ | 4571/9770 [54:08<56:33, 1.53it/s]
47%|████▋ | 4572/9770 [54:09<56:24, 1.54it/s]
47%|████▋ | 4573/9770 [54:09<56:56, 1.52it/s]
47%|████▋
+0: {'loss': 0.6602, 'grad_norm': 0.6293399451600545, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.47}
+0: | 4574/9770 [54:10<57:41, 1.50it/s]
47%|████▋ | 4575/9770 [54:11<57:30, 1.51it/s]
47%|████▋ | 4576/9770 [54:11<57:20, 1.51it/s]
47%|████▋ | 4577/9770 [54:12<58:24, 1.48it/s]
47%|████▋ | 4578/9770 [54:13<58:01, 1.49it/s]
47%|████▋ | 4579/9770 [54:13<57:41, 1.50it/s]
47%|████▋ | 4580/9770 [54:14<57:02, 1.52it/s]
47%|████▋ | 4580/9770 [54:14<57:02, 1.52it/s]
47%|████▋ | 4581/9770 [54:15<57:08, 1.51it/s]
47%|████▋ | 4582/9770 [54:15<57:27, 1.50it/s]
47%|████▋ | 4583/9770 [54:16<57:15, 1.51it/s]
47%|████▋ | 4584/9770 [54:17<56:39, 1.53it/s]
47%|████▋ | 4585/9770 [54:17<56:16, 1.54it/s]
47%|████▋ | 4586/9770 [54:18<55:57, 1.54it/s]
47%|████▋ | 4587/9770 [54:18<56:28, 1.53it/s]
47%|████▋ | 4588/9770 [54:19<57:42, 1.50it/s]
+0: {'loss': 0.665, 'grad_norm': 0.6855893782011705, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.47}
+0: {'loss': 0.6849, 'grad_norm': 0.6086835789368573, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.47}
+0: 47%|████▋ | 4589/9770 [54:20<57:28, 1.50it/s]
47%|████▋ | 4590/9770 [54:21<57:46, 1.49it/s]
47%|████▋ | 4590/9770 [54:21<57:46, 1.49it/s]
47%|████▋ | 4591/9770 [54:21<57:25, 1.50it/s]
47%|████▋ | 4592/9770 [54:22<57:26, 1.50it/s]
47%|████▋ | 4593/9770 [54:22<57:20, 1.50it/s]
47%|████▋ | 4594/9770 [54:23<57:05, 1.51it/s]
47%|████▋ | 4595/9770 [54:24<57:00, 1.51it/s]
47%|████▋ | 4596/9770 [54:24<57:17, 1.51it/s]
47%|████▋ | 4597/9770 [54:25<56:55, 1.51it/s]
47%|████▋ | 4598/9770 [54:26<56:43, 1.52it/s]
47%|████▋ | 4599/9770 [54:26<57:05, 1.51it/s]
47%|████▋ | 4600/9770 [54:27<56:44, 1.52it/s]
47%|████▋ | 4600/9770 [54:27<56:44, 1.52it/s]
47%|████▋ | 4601/9770 [54:28<57:10,
+0: {'loss': 0.6674, 'grad_norm': 0.6158777590905761, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.47}
+0: 1.51it/s]
47%|████▋ | 4602/9770 [54:28<57:27, 1.50it/s]
47%|████▋ | 4603/9770 [54:29<56:28, 1.52it/s]
47%|████▋ | 4604/9770 [54:30<56:09, 1.53it/s]
47%|████▋ | 4605/9770 [54:30<56:39, 1.52it/s]
47%|████▋ | 4606/9770 [54:31<56:49, 1.51it/s]
47%|████▋ | 4607/9770 [54:32<56:53, 1.51it/s]
47%|████▋ | 4608/9770 [54:32<57:06, 1.51it/s]
47%|████▋ | 4609/9770 [54:33<56:18, 1.53it/s]
47%|████▋ | 4610/9770 [54:34<56:49, 1.51it/s]
47%|████▋ | 4610/9770 [54:34<56:49, 1.51it/s]
47%|████▋ | 4611/9770 [54:34<57:00, 1.51it/s]
47%|████▋ | 4612/9770 [54:35<58:24, 1.47it/s]
47%|████▋ | 4613/9770 [54:36<58:25, 1.47it/s]
47%|████▋ | 4614/9770 [54:36<57:50, 1.49it/s]
47%|████▋ | 4615/9770 [54:37<57:53, 1.48it/s]
47%|████▋ | 46
+0: {'loss': 0.6723, 'grad_norm': 0.6327481164496808, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.47}
+0: 16/9770 [54:38<57:24, 1.50it/s]
47%|████▋ | 4617/9770 [54:38<57:10, 1.50it/s]
47%|████▋ | 4618/9770 [54:39<56:55, 1.51it/s]
47%|████▋ | 4619/9770 [54:40<56:39, 1.52it/s]
47%|████▋ | 4620/9770 [54:40<57:33, 1.49it/s]
47%|████▋ | 4620/9770 [54:40<57:33, 1.49it/s]
47%|████▋ | 4621/9770 [54:41<57:02, 1.50it/s]
47%|████▋ | 4622/9770 [54:42<57:38, 1.49it/s]
47%|████▋ | 4623/9770 [54:42<57:33, 1.49it/s]
47%|████▋ | 4624/9770 [54:43<57:13, 1.50it/s]
47%|████▋ | 4625/9770 [54:44<56:52, 1.51it/s]
47%|████▋ | 4626/9770 [54:44<57:14, 1.50it/s]
47%|████▋ | 4627/9770 [54:45<57:23, 1.49it/s]
47%|████▋ | 4628/9770 [54:46<57:12, 1.50it/s]
47%|████▋ | 4629/9770 [54:46<56:19, 1.52it/s]
47%|████▋ | 4630/9770 [54:47<56:40, 1.51it/s]
+0: {'loss': 0.676, 'grad_norm': 0.6293183743531868, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.47}
+0: {'loss': 0.6857, 'grad_norm': 0.5956922279829752, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.47}
+0:
47%|████▋ | 4630/9770 [54:47<56:40, 1.51it/s]
47%|████▋ | 4631/9770 [54:48<57:16, 1.50it/s]
47%|████▋ | 4632/9770 [54:48<56:45, 1.51it/s]
47%|████▋ | 4633/9770 [54:49<56:14, 1.52it/s]
47%|████▋ | 4634/9770 [54:50<55:56, 1.53it/s]
47%|████▋ | 4635/9770 [54:50<56:02, 1.53it/s]
47%|████▋ | 4636/9770 [54:51<56:07, 1.52it/s]
47%|████▋ | 4637/9770 [54:52<56:22, 1.52it/s]
47%|████▋ | 4638/9770 [54:52<56:58, 1.50it/s]
47%|████▋ | 4639/9770 [54:53<56:36, 1.51it/s]
47%|████▋ | 4640/9770 [54:54<55:52, 1.53it/s]
47%|████▋ | 4640/9770 [54:54<55:52, 1.53it/s]
48%|████▊ | 4641/9770 [54:54<57:23, 1.49it/s]
48%|████▊ | 4642/9770 [54:55<56:11, 1.52it/s]
48%|████▊ | 4643/9770 [54:56<56:44, 1.51it
+0: {'loss': 0.6889, 'grad_norm': 0.5980974658015646, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.48}
+0: /s]
48%|████▊ | 4644/9770 [54:56<56:15, 1.52it/s]
48%|████▊ | 4645/9770 [54:57<56:26, 1.51it/s]
48%|████▊ | 4646/9770 [54:58<56:13, 1.52it/s]
48%|████▊ | 4647/9770 [54:58<56:36, 1.51it/s]
48%|████▊ | 4648/9770 [54:59<56:38, 1.51it/s]
48%|████▊ | 4649/9770 [55:00<55:49, 1.53it/s]
48%|████▊ | 4650/9770 [55:00<56:39, 1.51it/s]
48%|████▊ | 4650/9770 [55:00<56:39, 1.51it/s]
48%|████▊ | 4651/9770 [55:01<56:27, 1.51it/s]
48%|████▊ | 4652/9770 [55:02<56:29, 1.51it/s]
48%|████▊ | 4653/9770 [55:02<55:45, 1.53it/s]
48%|████▊ | 4654/9770 [55:03<56:18, 1.51it/s]
48%|████▊ | 4655/9770 [55:04<56:18, 1.51it/s]
48%|████▊ | 4656/9770 [55:04<55:45, 1.53it/s]
48%|████▊ | 4657/9770 [55:05<55:32, 1.53it/s]
48%|████▊ | 4658/9770
+0: {'loss': 0.6861, 'grad_norm': 0.6463123215303295, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.48}
+0: {'loss': 0.6542, 'grad_norm': 0.6520847112408649, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.48}
+0: [55:06<55:54, 1.52it/s]
48%|████▊ | 4659/9770 [55:06<56:24, 1.51it/s]
48%|████▊ | 4660/9770 [55:07<56:33, 1.51it/s]
48%|████▊ | 4660/9770 [55:07<56:33, 1.51it/s]
48%|████▊ | 4661/9770 [55:08<56:47, 1.50it/s]
48%|████▊ | 4662/9770 [55:08<56:23, 1.51it/s]
48%|████▊ | 4663/9770 [55:09<55:46, 1.53it/s]
48%|████▊ | 4664/9770 [55:09<55:28, 1.53it/s]
48%|████▊ | 4665/9770 [55:10<55:20, 1.54it/s]
48%|████▊ | 4666/9770 [55:11<55:05, 1.54it/s]
48%|████▊ | 4667/9770 [55:11<55:14, 1.54it/s]
48%|████▊ | 4668/9770 [55:12<55:35, 1.53it/s]
48%|████▊ | 4669/9770 [55:13<55:57, 1.52it/s]
48%|████▊ | 4670/9770 [55:13<55:45, 1.52it/s]
48%|████▊ | 4670/9770 [55:13<55:45, 1.52it/s]
48%|████▊
+0: {'loss': 0.6608, 'grad_norm': 0.5965875493466005, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.48}
+0: | 4671/9770 [55:14<56:11, 1.51it/s]
48%|████▊ | 4672/9770 [55:15<56:16, 1.51it/s]
48%|████▊ | 4673/9770 [55:15<56:16, 1.51it/s]
48%|████▊ | 4674/9770 [55:16<56:44, 1.50it/s]
48%|████▊ | 4675/9770 [55:17<56:41, 1.50it/s]
48%|████▊ | 4676/9770 [55:17<56:13, 1.51it/s]
48%|████▊ | 4677/9770 [55:18<55:59, 1.52it/s]
48%|████▊ | 4678/9770 [55:19<55:55, 1.52it/s]
48%|████▊ | 4679/9770 [55:19<55:56, 1.52it/s]
48%|████▊ | 4680/9770 [55:20<56:16, 1.51it/s]
48%|████▊ | 4680/9770 [55:20<56:16, 1.51it/s]
48%|████▊ | 4681/9770 [55:21<56:16, 1.51it/s]
48%|████▊ | 4682/9770 [55:21<55:47, 1.52it/s]
48%|████▊ | 4683/9770 [55:22<55:54, 1.52it/s]
48%|████▊ | 4684/9770 [55:23<56:01, 1.51it/s]
48%|████▊ | 4685/9770 [55:23<56:13, 1.51it/s]
48%
+0: {'loss': 0.6523, 'grad_norm': 0.6215592034303498, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.48}
+0: |████▊ | 4686/9770 [55:24<56:18, 1.50it/s]
48%|████▊ | 4687/9770 [55:25<56:12, 1.51it/s]
48%|████▊ | 4688/9770 [55:25<55:50, 1.52it/s]
48%|████▊ | 4689/9770 [55:26<55:52, 1.52it/s]
48%|████▊ | 4690/9770 [55:27<55:36, 1.52it/s]
48%|████▊ | 4690/9770 [55:27<55:36, 1.52it/s]
48%|████▊ | 4691/9770 [55:27<55:58, 1.51it/s]
48%|████▊ | 4692/9770 [55:28<55:39, 1.52it/s]
48%|████▊ | 4693/9770 [55:29<55:26, 1.53it/s]
48%|████▊ | 4694/9770 [55:29<55:48, 1.52it/s]
48%|████▊ | 4695/9770 [55:30<55:58, 1.51it/s]
48%|████▊ | 4696/9770 [55:31<56:34, 1.49it/s]
48%|████▊ | 4697/9770 [55:31<56:06, 1.51it/s]
48%|████▊ | 4698/9770 [55:32<57:08, 1.48it/s]
48%|████▊ | 4699/9770 [55:33<57:09, 1.48it/s]
48%|████▊ | 4700/9770 [55:33<5
+0: {'loss': 0.6868, 'grad_norm': 0.6496241447214529, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.48}
+0: {'loss': 0.6662, 'grad_norm': 0.6689949941729074, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.48}
+0: 7:09, 1.48it/s]
48%|████▊ | 4700/9770 [55:33<57:09, 1.48it/s]
48%|████▊ | 4701/9770 [55:34<57:53, 1.46it/s]
48%|████▊ | 4702/9770 [55:35<58:33, 1.44it/s]
48%|████▊ | 4703/9770 [55:35<57:20, 1.47it/s]
48%|████▊ | 4704/9770 [55:36<57:15, 1.47it/s]
48%|████▊ | 4705/9770 [55:37<56:38, 1.49it/s]
48%|████▊ | 4706/9770 [55:37<56:23, 1.50it/s]
48%|████▊ | 4707/9770 [55:38<56:17, 1.50it/s]
48%|████▊ | 4708/9770 [55:39<56:13, 1.50it/s]
48%|████▊ | 4709/9770 [55:39<56:00, 1.51it/s]
48%|████▊ | 4710/9770 [55:40<56:17, 1.50it/s]
48%|████▊ | 4710/9770 [55:40<56:17, 1.50it/s]
48%|████▊ | 4711/9770 [55:41<57:09, 1.48it/s]
48%|████▊ | 4712/9770 [55:41<56:15, 1.50it/s]
48%|████▊ | 4713/9
+0: {'loss': 0.6561, 'grad_norm': 0.6230667745183371, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.48}
+0: 770 [55:42<56:13, 1.50it/s]
48%|████▊ | 4714/9770 [55:43<55:23, 1.52it/s]
48%|████▊ | 4715/9770 [55:43<55:32, 1.52it/s]
48%|████▊ | 4716/9770 [55:44<54:55, 1.53it/s]
48%|████▊ | 4717/9770 [55:45<56:14, 1.50it/s]
48%|████▊ | 4718/9770 [55:45<56:09, 1.50it/s]
48%|████▊ | 4719/9770 [55:46<56:02, 1.50it/s]
48%|████▊ | 4720/9770 [55:47<55:10, 1.53it/s]
48%|████▊ | 4720/9770 [55:47<55:10, 1.53it/s]
48%|████▊ | 4721/9770 [55:47<55:15, 1.52it/s]
48%|████▊ | 4722/9770 [55:48<54:59, 1.53it/s]
48%|████▊ | 4723/9770 [55:49<55:24, 1.52it/s]
48%|████▊ | 4724/9770 [55:49<55:30, 1.52it/s]
48%|████▊ | 4725/9770 [55:50<55:53, 1.50it/s]
48%|████▊ | 4726/9770 [55:51<56:54, 1.48it/s]
48%|████▊ | 4727/9770 [55:51<56:05, 1.50it/s]
48%|██�
+0: {'loss': 0.6839, 'grad_norm': 0.6560895732592161, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.48}
+0: {'loss': 0.6696, 'grad_norm': 0.6276750270818932, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.48}
+0: ��█▊ | 4728/9770 [55:52<55:51, 1.50it/s]
48%|████▊ | 4729/9770 [55:53<55:11, 1.52it/s]
48%|████▊ | 4730/9770 [55:53<55:45, 1.51it/s]
48%|████▊ | 4730/9770 [55:53<55:45, 1.51it/s]
48%|████▊ | 4731/9770 [55:54<55:31, 1.51it/s]
48%|████▊ | 4732/9770 [55:55<56:28, 1.49it/s]
48%|████▊ | 4733/9770 [55:55<57:17, 1.47it/s]
48%|████▊ | 4734/9770 [55:56<57:55, 1.45it/s]
48%|████▊ | 4735/9770 [55:57<57:28, 1.46it/s]
48%|████▊ | 4736/9770 [55:57<57:58, 1.45it/s]
48%|████▊ | 4737/9770 [55:58<56:56, 1.47it/s]
48%|████▊ | 4738/9770 [55:59<56:43, 1.48it/s]
49%|████▊ | 4739/9770 [55:59<56:17, 1.49it/s]
49%|████▊ | 4740/9770 [56:00<55:55, 1.50it/s]
49%|████▊ | 4740/9770 [56:00<55:55, 1.50it/s]
+0: {'loss': 0.6439, 'grad_norm': 0.621410727524644, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.49}
+0: 49%|████▊ | 4741/9770 [56:01<55:43, 1.50it/s]
49%|████▊ | 4742/9770 [56:01<56:04, 1.49it/s]
49%|████▊ | 4743/9770 [56:02<55:34, 1.51it/s]
49%|████▊ | 4744/9770 [56:03<55:11, 1.52it/s]
49%|████▊ | 4745/9770 [56:03<54:44, 1.53it/s]
49%|████▊ | 4746/9770 [56:04<54:49, 1.53it/s]
49%|████▊ | 4747/9770 [56:05<55:03, 1.52it/s]
49%|████▊ | 4748/9770 [56:05<54:59, 1.52it/s]
49%|████▊ | 4749/9770 [56:06<55:01, 1.52it/s]
49%|████▊ | 4750/9770 [56:07<55:00, 1.52it/s]
49%|████▊ | 4750/9770 [56:07<55:00, 1.52it/s]
49%|████▊ | 4751/9770 [56:07<54:59, 1.52it/s]
49%|████▊ | 4752/9770 [56:08<54:51, 1.52it/s]
49%|████▊ | 4753/9770 [56:09<55:04, 1.52it/s]
49%|████▊ | 4754/9770 [56:09<55:34, 1.50it/s]
49%|████▊ | 4755/9770 [56:
+0: {'loss': 0.6599, 'grad_norm': 0.6095403300108553, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.49}
+0: 10<54:57, 1.52it/s]
49%|████▊ | 4756/9770 [56:11<54:52, 1.52it/s]
49%|████▊ | 4757/9770 [56:11<55:05, 1.52it/s]
49%|████▊ | 4758/9770 [56:12<55:30, 1.51it/s]
49%|████▊ | 4759/9770 [56:13<56:33, 1.48it/s]
49%|████▊ | 4760/9770 [56:13<56:11, 1.49it/s]
49%|████▊ | 4760/9770 [56:13<56:11, 1.49it/s]
49%|████▊ | 4761/9770 [56:14<56:13, 1.48it/s]
49%|████▊ | 4762/9770 [56:15<55:51, 1.49it/s]
49%|████▉ | 4763/9770 [56:15<55:41, 1.50it/s]
49%|████▉ | 4764/9770 [56:16<56:28, 1.48it/s]
49%|████▉ | 4765/9770 [56:17<55:53, 1.49it/s]
49%|████▉ | 4766/9770 [56:17<55:15, 1.51it/s]
49%|████▉ | 4767/9770 [56:18<55:05, 1.51it/s]
49%|████▉ | 4768/9770 [56:19<54:23, 1.53it/s]
49%|████▉ | 4769/9770 [56:19<54:01, 1.54it/s]
49%|████▉
+0: {'loss': 0.6858, 'grad_norm': 0.6462969335376962, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.49}
+0: {'loss': 0.7043, 'grad_norm': 0.6474250293118189, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.49}
+0: | 4770/9770 [56:20<54:23, 1.53it/s]
49%|████▉ | 4770/9770 [56:20<54:23, 1.53it/s]
49%|████▉ | 4771/9770 [56:21<54:31, 1.53it/s]
49%|████▉ | 4772/9770 [56:21<55:10, 1.51it/s]
49%|████▉ | 4773/9770 [56:22<54:58, 1.51it/s]
49%|████▉ | 4774/9770 [56:23<55:00, 1.51it/s]
49%|████▉ | 4775/9770 [56:23<54:52, 1.52it/s]
49%|████▉ | 4776/9770 [56:24<54:56, 1.52it/s]
49%|████▉ | 4777/9770 [56:25<54:23, 1.53it/s]
49%|████▉ | 4778/9770 [56:25<54:45, 1.52it/s]
49%|████▉ | 4779/9770 [56:26<54:35, 1.52it/s]
49%|████▉ | 4780/9770 [56:27<54:49, 1.52it/s]
49%|████▉ | 4780/9770 [56:27<54:49, 1.52it/s]
49%|████▉ | 4781/9770 [56:27<54:21, 1.53it/s]
49%|████▉ | 4782/9770 [56:28<54:26, 1.53it/s]
49%|█
+0: {'loss': 0.6722, 'grad_norm': 0.6352876260360985, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.49}
+0: ███▉ | 4783/9770 [56:28<54:06, 1.54it/s]
49%|████▉ | 4784/9770 [56:29<55:13, 1.50it/s]
49%|████▉ | 4785/9770 [56:30<55:13, 1.50it/s]
49%|████▉ | 4786/9770 [56:30<55:30, 1.50it/s]
49%|████▉ | 4787/9770 [56:31<55:14, 1.50it/s]
49%|████▉ | 4788/9770 [56:32<55:00, 1.51it/s]
49%|████▉ | 4789/9770 [56:32<54:19, 1.53it/s]
49%|████▉ | 4790/9770 [56:33<54:25, 1.53it/s]
49%|████▉ | 4790/9770 [56:33<54:25, 1.53it/s]
49%|████▉ | 4791/9770 [56:34<54:25, 1.52it/s]
49%|████▉ | 4792/9770 [56:34<54:05, 1.53it/s]
49%|████▉ | 4793/9770 [56:35<54:21, 1.53it/s]
49%|████▉ | 4794/9770 [56:36<54:04, 1.53it/s]
49%|████▉ | 4795/9770 [56:36<54:02, 1.53it/s]
49%|████▉ | 4796/9770 [56:37<54:21, 1.53it/s]
49%|████▉ | 4797/9770 [56:38<54:37
+0: {'loss': 0.66, 'grad_norm': 0.646113174427689, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.49}
+0: {'loss': 0.6703, 'grad_norm': 0.5939444943463341, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.49}
+0: , 1.52it/s]
49%|████▉ | 4798/9770 [56:38<54:30, 1.52it/s]
49%|████▉ | 4799/9770 [56:39<55:06, 1.50it/s]
49%|████▉ | 4800/9770 [56:40<55:05, 1.50it/s]
49%|████▉ | 4800/9770 [56:40<55:05, 1.50it/s]
49%|████▉ | 4801/9770 [56:40<54:39, 1.52it/s]
49%|████▉ | 4802/9770 [56:41<54:47, 1.51it/s]
49%|████▉ | 4803/9770 [56:42<54:44, 1.51it/s]
49%|████▉ | 4804/9770 [56:42<55:00, 1.50it/s]
49%|████▉ | 4805/9770 [56:43<54:59, 1.50it/s]
49%|████▉ | 4806/9770 [56:44<54:39, 1.51it/s]
49%|████▉ | 4807/9770 [56:44<54:03, 1.53it/s]
49%|████▉ | 4808/9770 [56:45<54:11, 1.53it/s]
49%|████▉ | 4809/9770 [56:46<53:53, 1.53it/s]
49%|████▉ | 4810/9770 [56:46<54:03, 1.53it/s]
49%|████▉ | 4810/9770
+0: {'loss': 0.6833, 'grad_norm': 0.6105768527945395, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.49}
+0: [56:46<54:03, 1.53it/s]
49%|████▉ | 4811/9770 [56:47<54:14, 1.52it/s]
49%|████▉ | 4812/9770 [56:48<53:50, 1.53it/s]
49%|████▉ | 4813/9770 [56:48<53:41, 1.54it/s]
49%|████▉ | 4814/9770 [56:49<53:46, 1.54it/s]
49%|████▉ | 4815/9770 [56:50<54:02, 1.53it/s]
49%|████▉ | 4816/9770 [56:50<54:17, 1.52it/s]
49%|████▉ | 4817/9770 [56:51<55:03, 1.50it/s]
49%|████▉ | 4818/9770 [56:52<55:33, 1.49it/s]
49%|████▉ | 4819/9770 [56:52<55:03, 1.50it/s]
49%|████▉ | 4820/9770 [56:53<55:42, 1.48it/s]
49%|████▉ | 4820/9770 [56:53<55:42, 1.48it/s]
49%|████▉ | 4821/9770 [56:54<55:10, 1.49it/s]
49%|████▉ | 4822/9770 [56:54<55:23, 1.49it/s]
49%|████▉ | 4823/9770 [56:55<54:47, 1.50it/s]
49%|████▉ | 4824/9770 [56:56<55:34, 1.48it/s]
49%|███�
+0: {'loss': 0.667, 'grad_norm': 0.6034095860421688, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.49}
+0: �▉ | 4825/9770 [56:56<55:03, 1.50it/s]
49%|████▉ | 4826/9770 [56:57<54:16, 1.52it/s]
49%|████▉ | 4827/9770 [56:58<53:37, 1.54it/s]
49%|████▉ | 4828/9770 [56:58<54:08, 1.52it/s]
49%|████▉ | 4829/9770 [56:59<54:18, 1.52it/s]
49%|████▉ | 4830/9770 [56:59<53:31, 1.54it/s]
49%|████▉ | 4830/9770 [56:59<53:31, 1.54it/s]
49%|████▉ | 4831/9770 [57:00<53:53, 1.53it/s]
49%|████▉ | 4832/9770 [57:01<54:16, 1.52it/s]
49%|████▉ | 4833/9770 [57:01<53:40, 1.53it/s]
49%|████▉ | 4834/9770 [57:02<53:37, 1.53it/s]
49%|████▉ | 4835/9770 [57:03<54:07, 1.52it/s]
49%|████▉ | 4836/9770 [57:03<55:15, 1.49it/s]
50%|████▉ | 4837/9770 [57:04<54:30, 1.51it/s]
50%|████▉ | 4838/9770 [57:05<54:48, 1.50it/s]
50%|████▉ | 4839/9770 [57:05<54:44, 1.50i
+0: {'loss': 0.6601, 'grad_norm': 0.6103518557766079, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.5}
+0: {'loss': 0.671, 'grad_norm': 0.603213918217476, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.5}
+0: t/s]
50%|████▉ | 4840/9770 [57:06<54:30, 1.51it/s]
50%|████▉ | 4840/9770 [57:06<54:30, 1.51it/s]
50%|████▉ | 4841/9770 [57:07<53:46, 1.53it/s]
50%|████▉ | 4842/9770 [57:07<53:18, 1.54it/s]
50%|████▉ | 4843/9770 [57:08<53:03, 1.55it/s]
50%|████▉ | 4844/9770 [57:09<54:05, 1.52it/s]
50%|████▉ | 4845/9770 [57:09<53:45, 1.53it/s]
50%|████▉ | 4846/9770 [57:10<53:39, 1.53it/s]
50%|████▉ | 4847/9770 [57:11<53:23, 1.54it/s]
50%|████▉ | 4848/9770 [57:11<53:22, 1.54it/s]
50%|████▉ | 4849/9770 [57:12<52:56, 1.55it/s]
50%|████▉ | 4850/9770 [57:13<53:33, 1.53it/s]
50%|████▉ | 4850/9770 [57:13<53:33, 1.53it/s]
50%|████▉ | 4851/9770 [57:13<54:06, 1.52it/s]
50%|████▉ | 4852/9770 [57:14<5
+0: {'loss': 0.6818, 'grad_norm': 0.6165028283423052, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.5}
+0: 4:06, 1.51it/s]
50%|████▉ | 4853/9770 [57:15<54:45, 1.50it/s]
50%|████▉ | 4854/9770 [57:15<54:22, 1.51it/s]
50%|████▉ | 4855/9770 [57:16<54:09, 1.51it/s]
50%|████▉ | 4856/9770 [57:17<53:44, 1.52it/s]
50%|████▉ | 4857/9770 [57:17<53:22, 1.53it/s]
50%|████▉ | 4858/9770 [57:18<53:47, 1.52it/s]
50%|████▉ | 4859/9770 [57:19<53:07, 1.54it/s]
50%|████▉ | 4860/9770 [57:19<53:23, 1.53it/s]
50%|████▉ | 4860/9770 [57:19<53:23, 1.53it/s]
50%|████▉ | 4861/9770 [57:20<53:47, 1.52it/s]
50%|████▉ | 4862/9770 [57:21<54:33, 1.50it/s]
50%|████▉ | 4863/9770 [57:21<53:41, 1.52it/s]
50%|████▉ | 4864/9770 [57:22<53:28, 1.53it/s]
50%|████▉ | 4865/9770 [57:22<53:26, 1.53it/s]
50%|████▉ | 4866/9770 [57:23<53:57, 1.51it/s]
50%|████▉
+0: {'loss': 0.6806, 'grad_norm': 0.6207253006825786, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.5}
+0: {'loss': 0.6792, 'grad_norm': 0.6250894090780208, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.5}
+0: | 4867/9770 [57:24<53:39, 1.52it/s]
50%|████▉ | 4868/9770 [57:24<53:47, 1.52it/s]
50%|████▉ | 4869/9770 [57:25<54:28, 1.50it/s]
50%|████▉ | 4870/9770 [57:26<53:35, 1.52it/s]
50%|████▉ | 4870/9770 [57:26<53:35, 1.52it/s]
50%|████▉ | 4871/9770 [57:26<54:02, 1.51it/s]
50%|████▉ | 4872/9770 [57:27<53:53, 1.51it/s]
50%|████▉ | 4873/9770 [57:28<53:18, 1.53it/s]
50%|████▉ | 4874/9770 [57:28<52:52, 1.54it/s]
50%|████▉ | 4875/9770 [57:29<52:55, 1.54it/s]
50%|████▉ | 4876/9770 [57:30<52:55, 1.54it/s]
50%|████▉ | 4877/9770 [57:30<54:04, 1.51it/s]
50%|████▉ | 4878/9770 [57:31<53:32, 1.52it/s]
50%|████▉ | 4879/9770 [57:32<53:34, 1.52it/s]
50%|████▉ | 4880/9770 [57:32<54:06, 1.51it/s]
50%|██�
+0: {'loss': 0.6811, 'grad_norm': 0.596448713420271, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.5}
+0: ��█▉ | 4880/9770 [57:32<54:06, 1.51it/s]
50%|████▉ | 4881/9770 [57:33<54:12, 1.50it/s]
50%|████▉ | 4882/9770 [57:34<53:48, 1.51it/s]
50%|████▉ | 4883/9770 [57:34<53:53, 1.51it/s]
50%|████▉ | 4884/9770 [57:35<53:35, 1.52it/s]
50%|█████ | 4885/9770 [57:36<54:34, 1.49it/s]
50%|█████ | 4886/9770 [57:36<54:23, 1.50it/s]
50%|█████ | 4887/9770 [57:37<54:21, 1.50it/s]
50%|█████ | 4888/9770 [57:38<54:08, 1.50it/s]
50%|█████ | 4889/9770 [57:38<54:55, 1.48it/s]
50%|█████ | 4890/9770 [57:39<54:21, 1.50it/s]
50%|█████ | 4890/9770 [57:39<54:21, 1.50it/s]
50%|█████ | 4891/9770 [57:40<54:13, 1.50it/s]
50%|█████ | 4892/9770 [57:40<53:35, 1.52it/s]
50%|█████ | 4893/9770 [57:41<53:23, 1.52it/s]
50%|█████ | 4894/9770 [57:42<53:17, 1
+0: {'loss': 0.6585, 'grad_norm': 0.652195539241238, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.5}
+0: .53it/s]
50%|█████ | 4895/9770 [57:42<53:15, 1.53it/s]
50%|█████ | 4896/9770 [57:43<53:40, 1.51it/s]
50%|█████ | 4897/9770 [57:44<53:50, 1.51it/s]
50%|█████ | 4898/9770 [57:44<53:53, 1.51it/s]
50%|█████ | 4899/9770 [57:45<53:54, 1.51it/s]
50%|█████ | 4900/9770 [57:46<54:02, 1.50it/s]
50%|█████ | 4900/9770 [57:46<54:02, 1.50it/s]
50%|█████ | 4901/9770 [57:46<54:10, 1.50it/s]
50%|█████ | 4902/9770 [57:47<54:08, 1.50it/s]
50%|█████ | 4903/9770 [57:48<53:48, 1.51it/s]
50%|█████ | 4904/9770 [57:48<53:43, 1.51it/s]
50%|█████ | 4905/9770 [57:49<53:07, 1.53it/s]
50%|█████ | 4906/9770 [57:50<53:14, 1.52it/s]
50%|█████ | 4907/9770 [57:50<53:23, 1.52it/s]
50%|█████ | 4908/9770 [57:51<54:29, 1.49it/s]
50%|█████ | 4909/
+0: {'loss': 0.6786, 'grad_norm': 0.6268866892127483, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.5}
+0: {'loss': 0.6671, 'grad_norm': 0.6134672595493911, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.5}
+0: 9770 [57:52<54:31, 1.49it/s]
50%|█████ | 4910/9770 [57:52<53:48, 1.51it/s]
50%|█████ | 4910/9770 [57:52<53:48, 1.51it/s]
50%|█████ | 4911/9770 [57:53<53:54, 1.50it/s]
50%|█████ | 4912/9770 [57:54<54:53, 1.48it/s]
50%|█████ | 4913/9770 [57:54<55:05, 1.47it/s]
50%|█████ | 4914/9770 [57:55<55:16, 1.46it/s]
50%|█████ | 4915/9770 [57:56<54:15, 1.49it/s]
50%|█████ | 4916/9770 [57:56<53:52, 1.50it/s]
50%|█████ | 4917/9770 [57:57<53:42, 1.51it/s]
50%|█████ | 4918/9770 [57:58<53:55, 1.50it/s]
50%|█████ | 4919/9770 [57:58<53:43, 1.50it/s]
50%|█████ | 4920/9770 [57:59<53:33, 1.51it/s]
50%|█████ | 4920/9770 [57:59<53:33, 1.51it/s]
50%|█████ | 4921/9770 [58:00<52:59, 1.53it/s]
50%|█████
+0: {'loss': 0.6594, 'grad_norm': 0.600250727870295, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.5}
+0: | 4922/9770 [58:00<53:10, 1.52it/s]
50%|█████ | 4923/9770 [58:01<53:11, 1.52it/s]
50%|█████ | 4924/9770 [58:02<52:45, 1.53it/s]
50%|█████ | 4925/9770 [58:02<53:14, 1.52it/s]
50%|█████ | 4926/9770 [58:03<52:34, 1.54it/s]
50%|█████ | 4927/9770 [58:04<53:01, 1.52it/s]
50%|█████ | 4928/9770 [58:04<52:50, 1.53it/s]
50%|█████ | 4929/9770 [58:05<52:29, 1.54it/s]
50%|█████ | 4930/9770 [58:06<53:15, 1.51it/s]
50%|█████ | 4930/9770 [58:06<53:15, 1.51it/s]
50%|█████ | 4931/9770 [58:06<53:13, 1.52it/s]
50%|█████ | 4932/9770 [58:07<54:16, 1.49it/s]
50%|█████ | 4933/9770 [58:08<53:47, 1.50it/s]
51%|█████ | 4934/9770 [58:08<53:18, 1.51it/s]
51%|█████ | 4935/9770 [58:09<53:09, 1.52it/s]
51%|█████ | 4936/9770 [58:10<53:06, 1.52it/s]
+0: {'loss': 0.6639, 'grad_norm': 0.6984372897619658, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.51}
+0:
51%|█████ | 4937/9770 [58:10<52:36, 1.53it/s]
51%|█████ | 4938/9770 [58:11<53:58, 1.49it/s]
51%|█████ | 4939/9770 [58:12<53:29, 1.51it/s]
51%|█████ | 4940/9770 [58:12<53:53, 1.49it/s]
51%|█████ | 4940/9770 [58:12<53:53, 1.49it/s]
51%|█████ | 4941/9770 [58:13<53:25, 1.51it/s]
51%|█████ | 4942/9770 [58:13<53:07, 1.51it/s]
51%|██��██ | 4943/9770 [58:14<53:15, 1.51it/s]
51%|█████ | 4944/9770 [58:15<53:01, 1.52it/s]
51%|█████ | 4945/9770 [58:15<52:31, 1.53it/s]
51%|█████ | 4946/9770 [58:16<53:10, 1.51it/s]
51%|█████ | 4947/9770 [58:17<53:12, 1.51it/s]
51%|█████ | 4948/9770 [58:17<52:56, 1.52it/s]
51%|█████ | 4949/9770 [58:18<52:58, 1.52it/s]
51%|█████ | 4950/9770 [58:19<52:40, 1.53it/s]
+0: {'loss': 0.6789, 'grad_norm': 0.6889980569853584, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.51}
+0: {'loss': 0.6559, 'grad_norm': 0.6653437753539656, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.51}
+0:
51%|█████ | 4950/9770 [58:19<52:40, 1.53it/s]
51%|█████ | 4951/9770 [58:19<52:39, 1.53it/s]
51%|█████ | 4952/9770 [58:20<51:51, 1.55it/s]
51%|█████ | 4953/9770 [58:21<52:32, 1.53it/s]
51%|█████ | 4954/9770 [58:21<52:20, 1.53it/s]
51%|█████ | 4955/9770 [58:22<52:20, 1.53it/s]
51%|█████ | 4956/9770 [58:23<52:16, 1.53it/s]
51%|█████ | 4957/9770 [58:23<52:16, 1.53it/s]
51%|█████ | 4958/9770 [58:24<52:20, 1.53it/s]
51%|█████ | 4959/9770 [58:25<52:27, 1.53it/s]
51%|█████ | 4960/9770 [58:25<53:44, 1.49it/s]
51%|█████ | 4960/9770 [58:25<53:44, 1.49it/s]
51%|█████ | 4961/9770 [58:26<52:56, 1.51it/s]
51%|█████ | 4962/9770 [58:27<52:36, 1.52it/s]
51%|█████ | 4963/9770 [58:27<52:02, 1.54it/s]
51%|█████ | 4
+0: {'loss': 0.666, 'grad_norm': 0.6380552114207497, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.51}
+0: 964/9770 [58:28<52:43, 1.52it/s]
51%|█████ | 4965/9770 [58:29<52:32, 1.52it/s]
51%|█████ | 4966/9770 [58:29<52:38, 1.52it/s]
51%|█████ | 4967/9770 [58:30<52:51, 1.51it/s]
51%|█████ | 4968/9770 [58:31<52:56, 1.51it/s]
51%|█████ | 4969/9770 [58:31<53:05, 1.51it/s]
51%|█████ | 4970/9770 [58:32<52:13, 1.53it/s]
51%|█████ | 4970/9770 [58:32<52:13, 1.53it/s]
51%|█████ | 4971/9770 [58:33<52:20, 1.53it/s]
51%|█████ | 4972/9770 [58:33<52:22, 1.53it/s]
51%|█████ | 4973/9770 [58:34<52:31, 1.52it/s]
51%|█████ | 4974/9770 [58:34<52:28, 1.52it/s]
51%|█████ | 4975/9770 [58:35<52:16, 1.53it/s]
51%|█████ | 4976/9770 [58:36<51:46, 1.54it/s]
51%|█████ | 4977/9770 [58:36<53:01, 1.51it/s]
51%|█████ | 4978/9770 [58:37<53:29, 1.49it/s]
51%|�
+0: {'loss': 0.6616, 'grad_norm': 0.6149642780972093, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.51}
+0: {'loss': 0.6607, 'grad_norm': 0.6259600119306887, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.51}
+0: �████ | 4979/9770 [58:38<52:33, 1.52it/s]
51%|█████ | 4980/9770 [58:38<52:43, 1.51it/s]
51%|█████ | 4980/9770 [58:38<52:43, 1.51it/s]
51%|█████ | 4981/9770 [58:39<52:16, 1.53it/s]
51%|█████ | 4982/9770 [58:40<52:27, 1.52it/s]
51%|█████ | 4983/9770 [58:40<52:31, 1.52it/s]
51%|█████ | 4984/9770 [58:41<52:53, 1.51it/s]
51%|█████ | 4985/9770 [58:42<53:00, 1.50it/s]
51%|█████ | 4986/9770 [58:42<53:05, 1.50it/s]
51%|█████ | 4987/9770 [58:43<52:38, 1.51it/s]
51%|█████ | 4988/9770 [58:44<52:30, 1.52it/s]
51%|█████ | 4989/9770 [58:44<52:17, 1.52it/s]
51%|█████ | 4990/9770 [58:45<52:10, 1.53it/s]
51%|█████ | 4990/9770 [58:45<52:10, 1.53it/s]
51%|█████ | 4991/9770 [58:46<52:03, 1.53i
+0: {'loss': 0.6552, 'grad_norm': 0.6039468761627335, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.51}
+0: t/s]
51%|█████ | 4992/9770 [58:46<52:01, 1.53it/s]
51%|█████ | 4993/9770 [58:47<51:55, 1.53it/s]
51%|█████ | 4994/9770 [58:48<52:30, 1.52it/s]
51%|█████ | 4995/9770 [58:48<51:57, 1.53it/s]
51%|█████ | 4996/9770 [58:49<52:11, 1.52it/s]
51%|█████ | 4997/9770 [58:50<51:34, 1.54it/s]
51%|█████ | 4998/9770 [58:50<51:58, 1.53it/s]
51%|█████ | 4999/9770 [58:51<52:51, 1.50it/s]
51%|█████ | 5000/9770 [58:52<52:23, 1.52it/s]
51%|█████ | 5000/9770 [58:52<52:23, 1.52it/s]
51%|█████ | 5001/9770 [58:52<51:48, 1.53it/s]
51%|█████ | 5002/9770 [58:53<51:46, 1.53it/s]
51%|█████ | 5003/9770 [58:54<51:31, 1.54it/s]
51%|█████ | 5004/9770 [58:54<51:33, 1.54it/s]
51%|█████ | 5005/9770 [58:55<51:36, 1.54it/s]
51%|█████ | 5006/9770
+0: {'loss': 0.6698, 'grad_norm': 0.6050012098233605, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.51}
+0: [58:55<51:21, 1.55it/s]
51%|█████ | 5007/9770 [58:56<52:14, 1.52it/s]
51%|█████▏ | 5008/9770 [58:57<52:14, 1.52it/s]
51%|█████▏ | 5009/9770 [58:57<51:51, 1.53it/s]
51%|█████▏ | 5010/9770 [58:58<52:13, 1.52it/s]
51%|█████▏ | 5010/9770 [58:58<52:13, 1.52it/s]
51%|█████▏ | 5011/9770 [58:59<52:27, 1.51it/s]
51%|█████▏ | 5012/9770 [58:59<52:31, 1.51it/s]
51%|█████▏ | 5013/9770 [59:00<52:27, 1.51it/s]
51%|█████▏ | 5014/9770 [59:01<53:07, 1.49it/s]
51%|█████▏ | 5015/9770 [59:01<53:35, 1.48it/s]
51%|█████▏ | 5016/9770 [59:02<52:58, 1.50it/s]
51%|█████▏ | 5017/9770 [59:03<52:55, 1.50it/s]
51%|█████▏ | 5018/9770 [59:03<52:37, 1.50it/s]
51%|█████▏ | 5019/9770 [59:04<52:34, 1.51it/s]
51%|█████▏ | 5020/9770 [59:05<52:15
+0: {'loss': 0.6814, 'grad_norm': 0.6387138190564051, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.51}
+0: {'loss': 0.6672, 'grad_norm': 0.6161556190000104, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.51}
+0: , 1.52it/s]
51%|█████▏ | 5020/9770 [59:05<52:15, 1.52it/s]
51%|█████▏ | 5021/9770 [59:05<52:33, 1.51it/s]
51%|█████▏ | 5022/9770 [59:06<51:46, 1.53it/s]
51%|█████▏ | 5023/9770 [59:07<51:33, 1.53it/s]
51%|█████▏ | 5024/9770 [59:07<51:24, 1.54it/s]
51%|█████▏ | 5025/9770 [59:08<51:00, 1.55it/s]
51%|█████▏ | 5026/9770 [59:09<51:05, 1.55it/s]
51%|█████▏ | 5027/9770 [59:09<51:15, 1.54it/s]
51%|█████▏ | 5028/9770 [59:10<50:59, 1.55it/s]
51%|█████▏ | 5029/9770 [59:11<51:33, 1.53it/s]
51%|█████▏ | 5030/9770 [59:11<51:37, 1.53it/s]
51%|█████▏ | 5030/9770 [59:11<51:37, 1.53it/s]
51%|█████▏ | 5031/9770 [59:12<51:46, 1.53it/s]
52%|█████▏ | 5032/9770 [59:13<52:01, 1.52it/s]
52%|█�
+0: {'loss': 0.6501, 'grad_norm': 0.6103714082201293, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.52}
+0: ��███▏ | 5033/9770 [59:13<51:14, 1.54it/s]
52%|█████▏ | 5034/9770 [59:14<51:15, 1.54it/s]
52%|█████▏ | 5035/9770 [59:15<51:25, 1.53it/s]
52%|█████▏ | 5036/9770 [59:15<56:20, 1.40it/s]
52%|█████▏ | 5037/9770 [59:16<54:56, 1.44it/s]
52%|█████▏ | 5038/9770 [59:17<54:02, 1.46it/s]
52%|█████▏ | 5039/9770 [59:17<53:17, 1.48it/s]
52%|█████▏ | 5040/9770 [59:18<52:28, 1.50it/s]
52%|█████▏ | 5040/9770 [59:18<52:28, 1.50it/s]
52%|█████▏ | 5041/9770 [59:19<52:02, 1.51it/s]
52%|█████▏ | 5042/9770 [59:19<52:20, 1.51it/s]
52%|█████▏ | 5043/9770 [59:20<51:47, 1.52it/s]
52%|█████▏ | 5044/9770 [59:21<52:25, 1.50it/s]
52%|█████▏ | 5045/9770 [59:21<52:00, 1.51it/s]
52%|█████▏ | 5046/9770 [59:22<51:38, 1.52it/s]
52%|█████
+0: {'loss': 0.6636, 'grad_norm': 0.6142671087698502, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.52}
+0: ▏ | 5047/9770 [59:23<51:59, 1.51it/s]
52%|█████▏ | 5048/9770 [59:23<51:43, 1.52it/s]
52%|█████▏ | 5049/9770 [59:24<52:21, 1.50it/s]
52%|���████▏ | 5050/9770 [59:25<51:59, 1.51it/s]
52%|█████▏ | 5050/9770 [59:25<51:59, 1.51it/s]
52%|█████▏ | 5051/9770 [59:25<51:48, 1.52it/s]
52%|█████▏ | 5052/9770 [59:26<52:04, 1.51it/s]
52%|█████▏ | 5053/9770 [59:27<51:48, 1.52it/s]
52%|█████▏ | 5054/9770 [59:27<51:30, 1.53it/s]
52%|█████▏ | 5055/9770 [59:28<51:38, 1.52it/s]
52%|█████▏ | 5056/9770 [59:29<50:42, 1.55it/s]
52%|█████▏ | 5057/9770 [59:29<50:38, 1.55it/s]
52%|█████▏ | 5058/9770 [59:30<51:16, 1.53it/s]
52%|█████▏ | 5059/9770 [59:30<51:14, 1.53it/s]
52%|█████▏ | 5060/9770 [59:31<51:45, 1.52it/s]
+0: {'loss': 0.662, 'grad_norm': 0.6304596907220404, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.52}
+0: {'loss': 0.6619, 'grad_norm': 0.6165093955269434, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.52}
+0:
52%|█████▏ | 5060/9770 [59:31<51:45, 1.52it/s]
52%|█████▏ | 5061/9770 [59:32<51:49, 1.51it/s]
52%|█████▏ | 5062/9770 [59:32<51:28, 1.52it/s]
52%|█████▏ | 5063/9770 [59:33<51:36, 1.52it/s]
52%|█████▏ | 5064/9770 [59:34<51:31, 1.52it/s]
52%|█████▏ | 5065/9770 [59:34<51:06, 1.53it/s]
52%|█████▏ | 5066/9770 [59:35<51:55, 1.51it/s]
52%|█████▏ | 5067/9770 [59:36<52:06, 1.50it/s]
52%|█████▏ | 5068/9770 [59:36<52:06, 1.50it/s]
52%|█████▏ | 5069/9770 [59:37<51:16, 1.53it/s]
52%|█████▏ | 5070/9770 [59:38<51:15, 1.53it/s]
52%|█████▏ | 5070/9770 [59:38<51:15, 1.53it/s]
52%|█████▏ | 5071/9770 [59:38<51:11, 1.53it/s]
52%|█████▏ | 5072/9770 [59:39<51:30, 1.52it/s]
52%|█████▏ | 5073/9770 [59:40<50:57,
+0: {'loss': 0.6767, 'grad_norm': 0.6046949541372585, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.52}
+0: 1.54it/s]
52%|█████▏ | 5074/9770 [59:40<51:41, 1.51it/s]
52%|█████▏ | 5075/9770 [59:41<51:50, 1.51it/s]
52%|█████▏ | 5076/9770 [59:42<51:39, 1.51it/s]
52%|█████▏ | 5077/9770 [59:42<51:23, 1.52it/s]
52%|█████▏ | 5078/9770 [59:43<51:20, 1.52it/s]
52%|█████▏ | 5079/9770 [59:44<51:46, 1.51it/s]
52%|█████▏ | 5080/9770 [59:44<51:55, 1.51it/s]
52%|█████▏ | 5080/9770 [59:44<51:55, 1.51it/s]
52%|█████▏ | 5081/9770 [59:45<51:58, 1.50it/s]
52%|█████▏ | 5082/9770 [59:46<51:46, 1.51it/s]
52%|█████▏ | 5083/9770 [59:46<52:26, 1.49it/s]
52%|█████▏ | 5084/9770 [59:47<52:46, 1.48it/s]
52%|█████▏ | 5085/9770 [59:48<53:17, 1.47it/s]
52%|█████▏ | 5086/9770 [59:48<53:52, 1.45it/s]
52%|█████▏ | 5087/9770 [59:49<52:59, 1.47it/s]
+0: {'loss': 0.6785, 'grad_norm': 0.6266865570543698, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.52}
+0: {'loss': 0.6767, 'grad_norm': 0.6342178104993592, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.52}
+0: 52%|█████▏ | 5088/9770 [59:50<52:29, 1.49it/s]
52%|█████▏ | 5089/9770 [59:50<51:32, 1.51it/s]
52%|█████▏ | 5090/9770 [59:51<52:03, 1.50it/s]
52%|█████▏ | 5090/9770 [59:51<52:03, 1.50it/s]
52%|█████▏ | 5091/9770 [59:52<52:06, 1.50it/s]
52%|█████▏ | 5092/9770 [59:52<52:16, 1.49it/s]
52%|█████▏ | 5093/9770 [59:53<51:14, 1.52it/s]
52%|█████▏ | 5094/9770 [59:54<51:24, 1.52it/s]
52%|█████▏ | 5095/9770 [59:54<51:29, 1.51it/s]
52%|█████▏ | 5096/9770 [59:55<51:21, 1.52it/s]
52%|█████▏ | 5097/9770 [59:56<50:42, 1.54it/s]
52%|█████▏ | 5098/9770 [59:56<50:50, 1.53it/s]
52%|█████▏ | 5099/9770 [59:57<51:20, 1.52it/s]
52%|█████▏ | 5100/9770 [59:58<50:57, 1.53it/s]
52%|█████�
+0: {'loss': 0.6796, 'grad_norm': 0.6464930117233029, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.52}
+0: � | 5100/9770 [59:58<50:57, 1.53it/s]
52%|█████▏ | 5101/9770 [59:58<51:26, 1.51it/s]
52%|█████▏ | 5102/9770 [59:59<50:59, 1.53it/s]
52%|█████▏ | 5103/9770 [1:00:00<50:36, 1.54it/s]
52%|█████▏ | 5104/9770 [1:00:00<50:55, 1.53it/s]
52%|█████▏ | 5105/9770 [1:00:01<50:50, 1.53it/s]
52%|█████▏ | 5106/9770 [1:00:02<51:45, 1.50it/s]
52%|█████▏ | 5107/9770 [1:00:02<51:30, 1.51it/s]
52%|█████▏ | 5108/9770 [1:00:03<51:58, 1.50it/s]
52%|█████▏ | 5109/9770 [1:00:04<51:32, 1.51it/s]
52%|█████▏ | 5110/9770 [1:00:04<51:23, 1.51it/s]
52%|█████▏ | 5110/9770 [1:00:04<51:23, 1.51it/s]
52%|█████▏ | 5111/9770 [1:00:05<51:12, 1.52it/s]
52%|█████▏ | 5112/9770 [1:00:06<50:35, 1.53it/s]
52%|█████▏ | 5113/9770 [1:00:06<50:21, 1.54it/s]
52%|�
+0: {'loss': 0.6708, 'grad_norm': 0.6656520912260043, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.52}
+0: �████▏ | 5114/9770 [1:00:07<50:54, 1.52it/s]
52%|█████▏ | 5115/9770 [1:00:07<50:35, 1.53it/s]
52%|█████▏ | 5116/9770 [1:00:08<50:26, 1.54it/s]
52%|█████▏ | 5117/9770 [1:00:09<50:50, 1.53it/s]
52%|█████▏ | 5118/9770 [1:00:09<50:43, 1.53it/s]
52%|█████▏ | 5119/9770 [1:00:10<50:12, 1.54it/s]
52%|█████▏ | 5120/9770 [1:00:11<50:33, 1.53it/s]
52%|█████▏ | 5120/9770 [1:00:11<50:33, 1.53it/s]
52%|█████▏ | 5121/9770 [1:00:11<50:45, 1.53it/s]
52%|█████▏ | 5122/9770 [1:00:12<50:51, 1.52it/s]
52%|█████▏ | 5123/9770 [1:00:13<50:28, 1.53it/s]
52%|█████▏ | 5124/9770 [1:00:13<50:54, 1.52it/s]
52%|█████▏ | 5125/9770 [1:00:14<51:10, 1.51it/s]
52%|█████▏ | 5126/9770 [1:00:15<50:49, 1.52it/s]
52%|█████▏ | 5127/9770 [1:00:15<50:5
+0: {'loss': 0.6657, 'grad_norm': 0.6129089321466914, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.52}
+0: 6, 1.52it/s]
52%|█████▏ | 5128/9770 [1:00:16<50:44, 1.52it/s]
52%|█████▏ | 5129/9770 [1:00:17<50:54, 1.52it/s]
53%|█████▎ | 5130/9770 [1:00:17<51:04, 1.51it/s]
53%|█████▎ | 5130/9770 [1:00:17<51:04, 1.51it/s]
53%|█████▎ | 5131/9770 [1:00:18<50:52, 1.52it/s]
53%|█████▎ | 5132/9770 [1:00:19<51:01, 1.51it/s]
53%|█████▎ | 5133/9770 [1:00:19<51:40, 1.50it/s]
53%|█████▎ | 5134/9770 [1:00:20<50:59, 1.52it/s]
53%|█████▎ | 5135/9770 [1:00:21<50:53, 1.52it/s]
53%|█████▎ | 5136/9770 [1:00:21<50:26, 1.53it/s]
53%|█████▎ | 5137/9770 [1:00:22<50:55, 1.52it/s]
53%|█████▎ | 5138/9770 [1:00:23<51:10, 1.51it/s]
53%|█████▎ | 5139/9770 [1:00:23<50:23, 1.53it/s]
53%|█████▎ | 5140/9770 [1:00:24<50:57, 1.51it/s]
+0: {'loss': 0.664, 'grad_norm': 0.6331759177336027, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.53}
+0: {'loss': 0.6643, 'grad_norm': 0.5925651718111511, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.53}
+0:
53%|█████▎ | 5140/9770 [1:00:24<50:57, 1.51it/s]
53%|█████▎ | 5141/9770 [1:00:25<51:21, 1.50it/s]
53%|█████▎ | 5142/9770 [1:00:25<51:13, 1.51it/s]
53%|█████▎ | 5143/9770 [1:00:26<50:59, 1.51it/s]
53%|█████▎ | 5144/9770 [1:00:27<51:46, 1.49it/s]
53%|█████▎ | 5145/9770 [1:00:27<51:16, 1.50it/s]
53%|█████▎ | 5146/9770 [1:00:28<50:43, 1.52it/s]
53%|█████▎ | 5147/9770 [1:00:29<50:29, 1.53it/s]
53%|█████▎ | 5148/9770 [1:00:29<49:55, 1.54it/s]
53%|█████▎ | 5149/9770 [1:00:30<50:11, 1.53it/s]
53%|█████▎ | 5150/9770 [1:00:30<50:00, 1.54it/s]
53%|█████▎ | 5150/9770 [1:00:31<50:00, 1.54it/s]
53%|█████▎ | 5151/9770 [1:00:31<50:28, 1.52it/s]
53%|█████▎ | 5152/9770 [1:00:32<50:36, 1.52it/s]
53%|█████�
+0: {'loss': 0.6604, 'grad_norm': 0.6180175090034707, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.53}
+0: �� | 5153/9770 [1:00:33<51:03, 1.51it/s]
53%|█████▎ | 5154/9770 [1:00:33<51:44, 1.49it/s]
53%|█████▎ | 5155/9770 [1:00:34<51:20, 1.50it/s]
53%|█████▎ | 5156/9770 [1:00:35<51:07, 1.50it/s]
53%|█████▎ | 5157/9770 [1:00:35<50:50, 1.51it/s]
53%|█████▎ | 5158/9770 [1:00:36<50:51, 1.51it/s]
53%|█████▎ | 5159/9770 [1:00:36<50:45, 1.51it/s]
53%|█████▎ | 5160/9770 [1:00:37<50:41, 1.52it/s]
53%|█████▎ | 5160/9770 [1:00:37<50:41, 1.52it/s]
53%|█████▎ | 5161/9770 [1:00:38<50:28, 1.52it/s]
53%|█████▎ | 5162/9770 [1:00:38<50:39, 1.52it/s]
53%|█████▎ | 5163/9770 [1:00:39<50:12, 1.53it/s]
53%|█████▎ | 5164/9770 [1:00:40<49:24, 1.55it/s]
53%|█████▎ | 5165/9770 [1:00:40<50:03, 1.53it/s]
53%|█████▎ | 5166/9770 [1:00:41<50:10, 1.53it/s]
+0: {'loss': 0.6589, 'grad_norm': 0.6013981899067725, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.53}
+0: 53%|█████▎ | 5167/9770 [1:00:42<50:22, 1.52it/s]
53%|█████▎ | 5168/9770 [1:00:42<51:31, 1.49it/s]
53%|█████▎ | 5169/9770 [1:00:43<52:01, 1.47it/s]
53%|█████▎ | 5170/9770 [1:00:44<51:40, 1.48it/s]
53%|█████▎ | 5170/9770 [1:00:44<51:40, 1.48it/s]
53%|█████▎ | 5171/9770 [1:00:44<51:31, 1.49it/s]
53%|█████▎ | 5172/9770 [1:00:45<51:25, 1.49it/s]
53%|█████▎ | 5173/9770 [1:00:46<51:53, 1.48it/s]
53%|█████▎ | 5174/9770 [1:00:46<51:39, 1.48it/s]
53%|█████▎ | 5175/9770 [1:00:47<52:33, 1.46it/s]
53%|█████▎ | 5176/9770 [1:00:48<51:37, 1.48it/s]
53%|█████▎ | 5177/9770 [1:00:48<51:14, 1.49it/s]
53%|█████▎ | 5178/9770 [1:00:49<51:19, 1.49it/s]
53%|█████▎ | 5179/9770 [1:00:50<50:44, 1.51it/s]
53%|█████▎ | 5180/9770 [1:00:
+0: {'loss': 0.6451, 'grad_norm': 0.6063826526139969, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.53}
+0: {'loss': 0.6684, 'grad_norm': 0.6126606239033013, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.53}
+0: 50<50:36, 1.51it/s]
53%|█████▎ | 5180/9770 [1:00:50<50:36, 1.51it/s]
53%|█████▎ | 5181/9770 [1:00:51<50:54, 1.50it/s]
53%|█████▎ | 5182/9770 [1:00:52<51:35, 1.48it/s]
53%|█████▎ | 5183/9770 [1:00:53<51:10, 1.49it/s]
53%|█████▎ | 5184/9770 [1:00:53<50:12, 1.52it/s]
53%|█████▎ | 5185/9770 [1:00:54<50:00, 1.53it/s]
53%|█████▎ | 5186/9770 [1:00:54<50:18, 1.52it/s]
53%|█████▎ | 5187/9770 [1:00:55<50:16, 1.52it/s]
53%|█████▎ | 5188/9770 [1:00:56<50:06, 1.52it/s]
53%|█████▎ | 5189/9770 [1:00:56<49:46, 1.53it/s]
53%|█████▎ | 5190/9770 [1:00:57<50:09, 1.52it/s]
53%|█████▎ | 5190/9770 [1:00:57<50:09, 1.52it/s]
53%|█████▎ | 5191/9770 [1:00:58<50:41, 1.51it/s]
53%|█████▎ | 5192/9
+0: {'loss': 0.6329, 'grad_norm': 0.6224836483508644, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.53}
+0: 770 [1:00:58<50:23, 1.51it/s]
53%|█████▎ | 5193/9770 [1:00:59<50:12, 1.52it/s]
53%|█████▎ | 5194/9770 [1:01:00<51:17, 1.49it/s]
53%|█████▎ | 5195/9770 [1:01:00<51:10, 1.49it/s]
53%|█████▎ | 5196/9770 [1:01:01<51:04, 1.49it/s]
53%|█████▎ | 5197/9770 [1:01:02<51:08, 1.49it/s]
53%|█████▎ | 5198/9770 [1:01:02<50:27, 1.51it/s]
53%|█████▎ | 5199/9770 [1:01:03<50:01, 1.52it/s]
53%|█████▎ | 5200/9770 [1:01:04<49:43, 1.53it/s]
53%|█████▎ | 5200/9770 [1:01:04<49:43, 1.53it/s]
53%|█████▎ | 5201/9770 [1:01:04<49:40, 1.53it/s]
53%|█████▎ | 5202/9770 [1:01:05<50:00, 1.52it/s]
53%|█████▎ | 5203/9770 [1:01:06<50:39, 1.50it/s]
53%|█████▎ | 5204/9770 [1:01:06<50:15, 1.51it/s]
53%|█████▎ | 5205/9770 [1:01:07<50:04, 1.52it/s]
53%|███
+0: {'loss': 0.6659, 'grad_norm': 0.6046861240025698, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.53}
+0: ██▎ | 5206/9770 [1:01:08<49:30, 1.54it/s]
53%|█████▎ | 5207/9770 [1:01:08<50:47, 1.50it/s]
53%|█████▎ | 5208/9770 [1:01:09<50:34, 1.50it/s]
53%|█████▎ | 5209/9770 [1:01:10<51:24, 1.48it/s]
53%|█████▎ | 5210/9770 [1:01:10<51:40, 1.47it/s]
53%|█████▎ | 5210/9770 [1:01:10<51:40, 1.47it/s]
53%|█████▎ | 5211/9770 [1:01:11<51:45, 1.47it/s]
53%|█████▎ | 5212/9770 [1:01:12<52:07, 1.46it/s]
53%|█████▎ | 5213/9770 [1:01:12<51:58, 1.46it/s]
53%|█████▎ | 5214/9770 [1:01:13<52:26, 1.45it/s]
53%|█████▎ | 5215/9770 [1:01:14<52:08, 1.46it/s]
53%|█████▎ | 5216/9770 [1:01:15<52:15, 1.45it/s]
53%|█████▎ | 5217/9770 [1:01:15<52:06, 1.46it/s]
53%|█████▎ | 5218/9770 [1:01:16<51:27, 1.47it/s]
53%|█████▎ | 5219/9770 [1:01:17<51:00, 1.4
+0: {'loss': 0.6744, 'grad_norm': 0.6061564935373156, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.53}
+0: {'loss': 0.6673, 'grad_norm': 0.612465793179078, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.54}
+0: 9it/s]
53%|█████▎ | 5220/9770 [1:01:17<51:58, 1.46it/s]
53%|█████▎ | 5220/9770 [1:01:17<51:58, 1.46it/s]
53%|█████▎ | 5221/9770 [1:01:18<51:35, 1.47it/s]
53%|█████▎ | 5222/9770 [1:01:19<50:54, 1.49it/s]
53%|█████▎ | 5223/9770 [1:01:19<50:23, 1.50it/s]
53%|█████▎ | 5224/9770 [1:01:20<50:44, 1.49it/s]
53%|█████▎ | 5225/9770 [1:01:21<51:25, 1.47it/s]
53%|█████▎ | 5226/9770 [1:01:21<52:01, 1.46it/s]
54%|█████▎ | 5227/9770 [1:01:22<51:47, 1.46it/s]
54%|█████▎ | 5228/9770 [1:01:23<51:44, 1.46it/s]
54%|█████▎ | 5229/9770 [1:01:23<51:48, 1.46it/s]
54%|█████▎ | 5230/9770 [1:01:24<51:56, 1.46it/s]
54%|█████▎ | 5230/9770 [1:01:24<51:56, 1.46it/s]
54%|█████▎ | 5231/9770 [1:01:25<5
+0: {'loss': 0.682, 'grad_norm': 0.6577209596441055, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.54}
+0: 1:47, 1.46it/s]
54%|█████▎ | 5232/9770 [1:01:25<51:58, 1.46it/s]
54%|█████▎ | 5233/9770 [1:01:26<52:50, 1.43it/s]
54%|█████▎ | 5234/9770 [1:01:27<51:36, 1.47it/s]
54%|█████▎ | 5235/9770 [1:01:27<50:44, 1.49it/s]
54%|█████▎ | 5236/9770 [1:01:28<50:35, 1.49it/s]
54%|█████▎ | 5237/9770 [1:01:29<50:42, 1.49it/s]
54%|█████▎ | 5238/9770 [1:01:29<50:18, 1.50it/s]
54%|█████▎ | 5239/9770 [1:01:30<50:05, 1.51it/s]
54%|█████▎ | 5240/9770 [1:01:31<49:56, 1.51it/s]
54%|█████▎ | 5240/9770 [1:01:31<49:56, 1.51it/s]
54%|█████▎ | 5241/9770 [1:01:31<50:04, 1.51it/s]
54%|█████▎ | 5242/9770 [1:01:32<50:24, 1.50it/s]
54%|█████▎ | 5243/9770 [1:01:33<49:57, 1.51it/s]
54%|█████▎ | 5244/9770 [1:01:33<49:59, 1.51it/s]
54%|█████▎ |
+0: {'loss': 0.6653, 'grad_norm': 0.5962466507500274, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.54}
+0: 5245/9770 [1:01:34<50:25, 1.50it/s]
54%|█████▎ | 5246/9770 [1:01:35<49:44, 1.52it/s]
54%|█████▎ | 5247/9770 [1:01:35<50:32, 1.49it/s]
54%|█████▎ | 5248/9770 [1:01:36<50:07, 1.50it/s]
54%|█████▎ | 5249/9770 [1:01:37<50:01, 1.51it/s]
54%|█████▎ | 5250/9770 [1:01:37<50:25, 1.49it/s]
54%|█████▎ | 5250/9770 [1:01:37<50:25, 1.49it/s]
54%|█████▎ | 5251/9770 [1:01:38<50:15, 1.50it/s]
54%|█████▍ | 5252/9770 [1:01:39<49:53, 1.51it/s]
54%|█████▍ | 5253/9770 [1:01:39<50:46, 1.48it/s]
54%|█████▍ | 5254/9770 [1:01:40<50:42, 1.48it/s]
54%|█████▍ | 5255/9770 [1:01:41<51:09, 1.47it/s]
54%|█████▍ | 5256/9770 [1:01:41<51:15, 1.47it/s]
54%|█████▍ | 5257/9770 [1:01:42<50:57, 1.48it/s]
54%|█████▍ | 5258/9770 [1:01:43<50:13, 1.50it/s]
54%|�
+0: {'loss': 0.6665, 'grad_norm': 0.6335504374501189, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.54}
+0: {'loss': 0.6731, 'grad_norm': 0.6099947883842839, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.54}
+0: �████▍ | 5259/9770 [1:01:43<49:50, 1.51it/s]
54%|█████▍ | 5260/9770 [1:01:44<50:02, 1.50it/s]
54%|█████▍ | 5260/9770 [1:01:44<50:02, 1.50it/s]
54%|█████▍ | 5261/9770 [1:01:45<50:16, 1.49it/s]
54%|█████▍ | 5262/9770 [1:01:45<49:54, 1.51it/s]
54%|█████▍ | 5263/9770 [1:01:46<49:59, 1.50it/s]
54%|█████▍ | 5264/9770 [1:01:47<50:10, 1.50it/s]
54%|█████▍ | 5265/9770 [1:01:47<50:06, 1.50it/s]
54%|█████▍ | 5266/9770 [1:01:48<50:56, 1.47it/s]
54%|█████▍ | 5267/9770 [1:01:49<50:45, 1.48it/s]
54%|█████▍ | 5268/9770 [1:01:49<50:14, 1.49it/s]
54%|█████▍ | 5269/9770 [1:01:50<49:58, 1.50it/s]
54%|█████▍ | 5270/9770 [1:01:51<49:29, 1.52it/s]
54%|█████▍ | 5270/9770 [1:01:51<49:29, 1.52it/
+0: {'loss': 0.6527, 'grad_norm': 0.6081425355020218, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.54}
+0: s]
54%|█████▍ | 5271/9770 [1:01:51<49:45, 1.51it/s]
54%|█████▍ | 5272/9770 [1:01:52<49:35, 1.51it/s]
54%|█████▍ | 5273/9770 [1:01:53<49:34, 1.51it/s]
54%|█████▍ | 5274/9770 [1:01:53<49:09, 1.52it/s]
54%|█████▍ | 5275/9770 [1:01:54<48:19, 1.55it/s]
54%|█████▍ | 5276/9770 [1:01:55<48:38, 1.54it/s]
54%|█████▍ | 5277/9770 [1:01:55<48:52, 1.53it/s]
54%|█████▍ | 5278/9770 [1:01:56<49:05, 1.53it/s]
54%|█████▍ | 5279/9770 [1:01:57<49:17, 1.52it/s]
54%|█████▍ | 5280/9770 [1:01:57<49:07, 1.52it/s]
54%|█████▍ | 5280/9770 [1:01:57<49:07, 1.52it/s]
54%|█████▍ | 5281/9770 [1:01:58<49:14, 1.52it/s]
54%|█████▍ | 5282/9770 [1:01:59<49:25, 1.51it/s]
54%|█████▍ | 5283/9770 [1:01:59<49:13, 1.52it/s]
54%|█████▍ | 5284/9770 [1:
+0: {'loss': 0.6567, 'grad_norm': 0.6164764821651015, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.54}
+0: 02:00<48:50, 1.53it/s]
54%|█████▍ | 5285/9770 [1:02:01<48:23, 1.54it/s]
54%|█████▍ | 5286/9770 [1:02:01<48:49, 1.53it/s]
54%|█████▍ | 5287/9770 [1:02:02<48:50, 1.53it/s]
54%|█████▍ | 5288/9770 [1:02:03<49:09, 1.52it/s]
54%|█████▍ | 5289/9770 [1:02:03<49:20, 1.51it/s]
54%|█████▍ | 5290/9770 [1:02:04<48:37, 1.54it/s]
54%|█████▍ | 5290/9770 [1:02:04<48:37, 1.54it/s]
54%|█████▍ | 5291/9770 [1:02:05<49:49, 1.50it/s]
54%|█████▍ | 5292/9770 [1:02:05<49:10, 1.52it/s]
54%|█████▍ | 5293/9770 [1:02:06<49:22, 1.51it/s]
54%|█████▍ | 5294/9770 [1:02:07<49:14, 1.51it/s]
54%|█████▍ | 5295/9770 [1:02:07<49:18, 1.51it/s]
54%|█████▍ | 5296/9770 [1:02:08<49:10, 1.52it/s]
54%|█████▍ | 5297/9770 [1:02:09<49:03, 1.52it/s]
54%|█████�
+0: {'loss': 0.6656, 'grad_norm': 0.6557561791126008, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.54}
+0: {'loss': 0.6654, 'grad_norm': 0.6200148141126642, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.54}
+0: �� | 5298/9770 [1:02:09<49:34, 1.50it/s]
54%|█████▍ | 5299/9770 [1:02:10<50:02, 1.49it/s]
54%|█████▍ | 5300/9770 [1:02:11<49:51, 1.49it/s]
54%|█████▍ | 5300/9770 [1:02:11<49:51, 1.49it/s]
54%|█████▍ | 5301/9770 [1:02:11<49:18, 1.51it/s]
54%|█████▍ | 5302/9770 [1:02:12<49:34, 1.50it/s]
54%|█████▍ | 5303/9770 [1:02:13<49:33, 1.50it/s]
54%|█████▍ | 5304/9770 [1:02:13<48:58, 1.52it/s]
54%|█████▍ | 5305/9770 [1:02:14<48:43, 1.53it/s]
54%|█████▍ | 5306/9770 [1:02:15<49:32, 1.50it/s]
54%|█████▍ | 5307/9770 [1:02:15<50:08, 1.48it/s]
54%|█████▍ | 5308/9770 [1:02:16<50:10, 1.48it/s]
54%|█████▍ | 5309/9770 [1:02:17<49:45, 1.49it/s]
54%|█████▍ | 5310/9770 [1:02:17<49:23, 1.51it/s]
54%|██
+0: {'loss': 0.6832, 'grad_norm': 0.6455936589980646, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.54}
+0: ███▍ | 5310/9770 [1:02:17<49:23, 1.51it/s]
54%|█████▍ | 5311/9770 [1:02:18<48:37, 1.53it/s]
54%|█████▍ | 5312/9770 [1:02:18<48:07, 1.54it/s]
54%|█████▍ | 5313/9770 [1:02:19<48:19, 1.54it/s]
54%|█████▍ | 5314/9770 [1:02:20<48:32, 1.53it/s]
54%|█████▍ | 5315/9770 [1:02:20<48:32, 1.53it/s]
54%|█████▍ | 5316/9770 [1:02:21<48:31, 1.53it/s]
54%|█████▍ | 5317/9770 [1:02:22<49:28, 1.50it/s]
54%|█████▍ | 5318/9770 [1:02:22<49:07, 1.51it/s]
54%|█████▍ | 5319/9770 [1:02:23<49:13, 1.51it/s]
54%|█████▍ | 5320/9770 [1:02:24<48:46, 1.52it/s]
54%|█████▍ | 5320/9770 [1:02:24<48:46, 1.52it/s]
54%|█████▍ | 5321/9770 [1:02:24<48:51, 1.52it/s]
54%|█████▍ | 5322/9770 [1:02:25<49:03, 1.51it/s]
54%|█████▍ | 5323/9770 [1:02:26<48:29,
+0: {'loss': 0.6815, 'grad_norm': 0.634737508337699, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.55}
+0: 1.53it/s]
54%|█████▍ | 5324/9770 [1:02:26<48:59, 1.51it/s]
55%|█████▍ | 5325/9770 [1:02:27<48:59, 1.51it/s]
55%|█████▍ | 5326/9770 [1:02:28<48:47, 1.52it/s]
55%|█████▍ | 5327/9770 [1:02:28<48:13, 1.54it/s]
55%|█████▍ | 5328/9770 [1:02:29<47:59, 1.54it/s]
55%|█████▍ | 5329/9770 [1:02:30<48:03, 1.54it/s]
55%|█████▍ | 5330/9770 [1:02:30<48:27, 1.53it/s]
55%|█████▍ | 5330/9770 [1:02:30<48:27, 1.53it/s]
55%|█████▍ | 5331/9770 [1:02:31<48:38, 1.52it/s]
55%|█████▍ | 5332/9770 [1:02:32<48:26, 1.53it/s]
55%|█████▍ | 5333/9770 [1:02:32<47:49, 1.55it/s]
55%|█████▍ | 5334/9770 [1:02:33<47:39, 1.55it/s]
55%|█████▍ | 5335/9770 [1:02:34<48:23, 1.53it/s]
55%|█████▍ | 5336/9770 [1:02:34<48:42, 1.52it/s]
55%|█████▍ | 5337/9
+0: {'loss': 0.6518, 'grad_norm': 0.617815014780144, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.55}
+0: 770 [1:02:35<49:51, 1.48it/s]
55%|█████▍ | 5338/9770 [1:02:36<48:35, 1.52it/s]
55%|█████▍ | 5339/9770 [1:02:36<48:19, 1.53it/s]
55%|█████▍ | 5340/9770 [1:02:37<47:59, 1.54it/s]
55%|█████▍ | 5340/9770 [1:02:37<47:59, 1.54it/s]
55%|█████▍ | 5341/9770 [1:02:38<48:45, 1.51it/s]
55%|█████▍ | 5342/9770 [1:02:38<48:22, 1.53it/s]
55%|█████▍ | 5343/9770 [1:02:39<48:07, 1.53it/s]
55%|█████▍ | 5344/9770 [1:02:39<47:40, 1.55it/s]
55%|█████▍ | 5345/9770 [1:02:40<47:53, 1.54it/s]
55%|█████▍ | 5346/9770 [1:02:41<48:04, 1.53it/s]
55%|█████▍ | 5347/9770 [1:02:41<48:49, 1.51it/s]
55%|█████▍ | 5348/9770 [1:02:42<47:44, 1.54it/s]
55%|█████▍ | 5349/9770 [1:02:43<47:52, 1.54it/s]
55%|█████▍ | 5350/9770 [1:02:43<48:03, 1.53it/s]
+0: {'loss': 0.6466, 'grad_norm': 0.6161852209158847, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.55}
+0: {'loss': 0.6639, 'grad_norm': 0.6175638914997587, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.55}
+0:
55%|█████▍ | 5350/9770 [1:02:43<48:03, 1.53it/s]
55%|█████▍ | 5351/9770 [1:02:44<49:19, 1.49it/s]
55%|█████▍ | 5352/9770 [1:02:45<49:02, 1.50it/s]
55%|█████▍ | 5353/9770 [1:02:45<48:54, 1.51it/s]
55%|█████▍ | 5354/9770 [1:02:46<48:43, 1.51it/s]
55%|█████▍ | 5355/9770 [1:02:47<48:32, 1.52it/s]
55%|█████▍ | 5356/9770 [1:02:47<48:09, 1.53it/s]
55%|█████▍ | 5357/9770 [1:02:48<48:05, 1.53it/s]
55%|█████▍ | 5358/9770 [1:02:49<48:15, 1.52it/s]
55%|█████▍ | 5359/9770 [1:02:49<48:23, 1.52it/s]
55%|█████▍ | 5360/9770 [1:02:50<48:17, 1.52it/s]
55%|█████▍ | 5360/9770 [1:02:50<48:17, 1.52it/s]
55%|█████▍ | 5361/9770 [1:02:51<48:44, 1.51it/s]
55%|█████▍ | 5362/9770 [1:02:51<47:59, 1.53it/s]
55%
+0: {'loss': 0.6785, 'grad_norm': 0.6340827133531325, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.55}
+0: |█████▍ | 5363/9770 [1:02:52<48:04, 1.53it/s]
55%|█████▍ | 5364/9770 [1:02:53<49:06, 1.50it/s]
55%|█████▍ | 5365/9770 [1:02:53<48:34, 1.51it/s]
55%|█████▍ | 5366/9770 [1:02:54<48:20, 1.52it/s]
55%|█████▍ | 5367/9770 [1:02:55<48:03, 1.53it/s]
55%|█████▍ | 5368/9770 [1:02:55<48:14, 1.52it/s]
55%|█████▍ | 5369/9770 [1:02:56<47:47, 1.53it/s]
55%|█████▍ | 5370/9770 [1:02:57<47:47, 1.53it/s]
55%|█████▍ | 5370/9770 [1:02:57<47:47, 1.53it/s]
55%|█████▍ | 5371/9770 [1:02:57<47:43, 1.54it/s]
55%|█████▍ | 5372/9770 [1:02:58<48:02, 1.53it/s]
55%|█████▍ | 5373/9770 [1:02:59<48:42, 1.50it/s]
55%|█████▌ | 5374/9770 [1:02:59<48:37, 1.51it/s]
55%|█████▌ | 5375/9770 [1:03:00<48:35, 1.51it/s]
55%|█████▌ | 5376/9770 [1:03:01<4
+0: {'loss': 0.6621, 'grad_norm': 0.604460760658246, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.55}
+0: 8:18, 1.52it/s]
55%|█████▌ | 5377/9770 [1:03:01<48:12, 1.52it/s]
55%|█████▌ | 5378/9770 [1:03:02<48:27, 1.51it/s]
55%|█████▌ | 5379/9770 [1:03:03<48:24, 1.51it/s]
55%|█████▌ | 5380/9770 [1:03:03<47:48, 1.53it/s]
55%|█████▌ | 5380/9770 [1:03:03<47:48, 1.53it/s]
55%|█████▌ | 5381/9770 [1:03:04<48:01, 1.52it/s]
55%|█████▌ | 5382/9770 [1:03:05<48:56, 1.49it/s]
55%|█████▌ | 5383/9770 [1:03:05<49:00, 1.49it/s]
55%|█████▌ | 5384/9770 [1:03:06<48:56, 1.49it/s]
55%|█████▌ | 5385/9770 [1:03:07<49:01, 1.49it/s]
55%|█████▌ | 5386/9770 [1:03:07<48:35, 1.50it/s]
55%|█████▌ | 5387/9770 [1:03:08<49:15, 1.48it/s]
55%|█████▌ | 5388/9770 [1:03:09<48:52, 1.49it/s]
55%|█████▌ | 5389/9770 [1:03:09<48:54, 1.49it/s]
55%|█████▌ |
+0: {'loss': 0.6735, 'grad_norm': 0.6677450244913148, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.55}
+0: {'loss': 0.661, 'grad_norm': 0.6251935515347155, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.55}
+0: 5390/9770 [1:03:10<48:25, 1.51it/s]
55%|█████▌ | 5390/9770 [1:03:10<48:25, 1.51it/s]
55%|█████▌ | 5391/9770 [1:03:11<48:24, 1.51it/s]
55%|█████▌ | 5392/9770 [1:03:11<47:51, 1.52it/s]
55%|█████▌ | 5393/9770 [1:03:12<47:45, 1.53it/s]
55%|█████▌ | 5394/9770 [1:03:12<48:12, 1.51it/s]
55%|█████▌ | 5395/9770 [1:03:13<48:14, 1.51it/s]
55%|█████▌ | 5396/9770 [1:03:14<48:21, 1.51it/s]
55%|█████▌ | 5397/9770 [1:03:14<48:18, 1.51it/s]
55%|█████▌ | 5398/9770 [1:03:15<48:07, 1.51it/s]
55%|█████▌ | 5399/9770 [1:03:16<48:12, 1.51it/s]
55%|█████▌ | 5400/9770 [1:03:16<47:42, 1.53it/s]
55%|█████▌ | 5400/9770 [1:03:16<47:42, 1.53it/s]
55%|█████▌ | 5401/9770 [1:03:17<47:28, 1.53it/s]
55%|████�
+0: {'loss': 0.6622, 'grad_norm': 0.5964295386334724, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.55}
+0: ��▌ | 5402/9770 [1:03:18<47:49, 1.52it/s]
55%|█████▌ | 5403/9770 [1:03:18<47:50, 1.52it/s]
55%|█████▌ | 5404/9770 [1:03:19<47:53, 1.52it/s]
55%|█████▌ | 5405/9770 [1:03:20<47:36, 1.53it/s]
55%|█████▌ | 5406/9770 [1:03:20<47:41, 1.53it/s]
55%|█████▌ | 5407/9770 [1:03:21<47:31, 1.53it/s]
55%|█████▌ | 5408/9770 [1:03:22<47:30, 1.53it/s]
55%|█████▌ | 5409/9770 [1:03:22<47:02, 1.54it/s]
55%|█████▌ | 5410/9770 [1:03:23<47:19, 1.54it/s]
55%|█████▌ | 5410/9770 [1:03:23<47:19, 1.54it/s]
55%|█████▌ | 5411/9770 [1:03:24<47:17, 1.54it/s]
55%|█████▌ | 5412/9770 [1:03:24<47:10, 1.54it/s]
55%|█████▌ | 5413/9770 [1:03:25<46:51, 1.55it/s]
55%|█████▌ | 5414/9770 [1:03:26<47:05, 1.54it/s]
55%|█████▌ | 5415/9770 [1:03:26<47:14, 1.54it/
+0: {'loss': 0.6531, 'grad_norm': 0.6073224849247759, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.55}
+0: s]
55%|█████▌ | 5416/9770 [1:03:27<47:06, 1.54it/s]
55%|█████▌ | 5417/9770 [1:03:28<47:17, 1.53it/s]
55%|█████▌ | 5418/9770 [1:03:28<47:56, 1.51it/s]
55%|█████▌ | 5419/9770 [1:03:29<47:24, 1.53it/s]
55%|█████▌ | 5420/9770 [1:03:29<47:46, 1.52it/s]
55%|█████▌ | 5420/9770 [1:03:30<47:46, 1.52it/s]
55%|█████▌ | 5421/9770 [1:03:30<47:58, 1.51it/s]
55%|█████▌ | 5422/9770 [1:03:31<48:06, 1.51it/s]
56%|█████▌ | 5423/9770 [1:03:32<48:30, 1.49it/s]
56%|█████▌ | 5424/9770 [1:03:32<48:21, 1.50it/s]
56%|█████▌ | 5425/9770 [1:03:33<48:48, 1.48it/s]
56%|█████▌ | 5426/9770 [1:03:34<48:50, 1.48it/s]
56%|█████▌ | 5427/9770 [1:03:34<48:26, 1.49it/s]
56%|█████▌ | 5428/9770 [1:03:35<48:19, 1.50it/s]
56%|█████▌ | 5429/9770 [1:
+0: {'loss': 0.6837, 'grad_norm': 0.6364591932416096, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.56}
+0: {'loss': 0.6881, 'grad_norm': 0.5988762548913954, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.56}
+0: 03:36<47:43, 1.52it/s]
56%|█████▌ | 5430/9770 [1:03:36<47:41, 1.52it/s]
56%|█████▌ | 5430/9770 [1:03:36<47:41, 1.52it/s]
56%|█████▌ | 5431/9770 [1:03:37<48:19, 1.50it/s]
56%|█████▌ | 5432/9770 [1:03:38<48:05, 1.50it/s]
56%|█████▌ | 5433/9770 [1:03:38<47:41, 1.52it/s]
56%|█████▌ | 5434/9770 [1:03:39<47:24, 1.52it/s]
56%|█████▌ | 5435/9770 [1:03:39<47:39, 1.52it/s]
56%|█████▌ | 5436/9770 [1:03:40<47:36, 1.52it/s]
56%|█████▌ | 5437/9770 [1:03:41<47:48, 1.51it/s]
56%|█████▌ | 5438/9770 [1:03:41<47:36, 1.52it/s]
56%|█████▌ | 5439/9770 [1:03:42<48:29, 1.49it/s]
56%|█████▌ | 5440/9770 [1:03:43<48:02, 1.50it/s]
56%|█████▌ | 5440/9770 [1:03:43<48:02, 1.50it/s]
56%|█████▌ | 544
+0: {'loss': 0.658, 'grad_norm': 0.6173900873759363, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.56}
+0: 1/9770 [1:03:43<47:59, 1.50it/s]
56%|█████▌ | 5442/9770 [1:03:44<48:10, 1.50it/s]
56%|█████▌ | 5443/9770 [1:03:45<47:29, 1.52it/s]
56%|█████▌ | 5444/9770 [1:03:45<47:28, 1.52it/s]
56%|█████▌ | 5445/9770 [1:03:46<46:50, 1.54it/s]
56%|█████▌ | 5446/9770 [1:03:47<47:27, 1.52it/s]
56%|█████▌ | 5447/9770 [1:03:47<47:35, 1.51it/s]
56%|█████▌ | 5448/9770 [1:03:48<47:30, 1.52it/s]
56%|█████▌ | 5449/9770 [1:03:49<47:02, 1.53it/s]
56%|█████▌ | 5450/9770 [1:03:49<46:29, 1.55it/s]
56%|█████▌ | 5450/9770 [1:03:49<46:29, 1.55it/s]
56%|█████▌ | 5451/9770 [1:03:50<46:53, 1.54it/s]
56%|█████▌ | 5452/9770 [1:03:51<47:02, 1.53it/s]
56%|█████▌ | 5453/9770 [1:03:51<47:05, 1.53it/s]
56%|█████▌ | 5454/9770 [1:03:52<47:50, 1.50it/s]
56%|██
+0: {'loss': 0.6651, 'grad_norm': 0.606547513460977, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.56}
+0: ███▌ | 5455/9770 [1:03:53<47:21, 1.52it/s]
56%|█████▌ | 5456/9770 [1:03:53<46:57, 1.53it/s]
56%|█████▌ | 5457/9770 [1:03:54<46:44, 1.54it/s]
56%|█████▌ | 5458/9770 [1:03:55<47:13, 1.52it/s]
56%|█████▌ | 5459/9770 [1:03:55<47:34, 1.51it/s]
56%|█████▌ | 5460/9770 [1:03:56<47:48, 1.50it/s]
56%|█████▌ | 5460/9770 [1:03:56<47:48, 1.50it/s]
56%|█████▌ | 5461/9770 [1:03:57<47:24, 1.51it/s]
56%|█████▌ | 5462/9770 [1:03:57<47:25, 1.51it/s]
56%|█████▌ | 5463/9770 [1:03:58<47:11, 1.52it/s]
56%|█████▌ | 5464/9770 [1:03:59<46:30, 1.54it/s]
56%|█████▌ | 5465/9770 [1:03:59<46:13, 1.55it/s]
56%|█████▌ | 5466/9770 [1:04:00<46:29, 1.54it/s]
56%|█████▌ | 5467/9770 [1:04:01<46:59, 1.53it/s]
56%|█████▌ | 5468/9770 [1:04:01<46:56,
+0: {'loss': 0.6603, 'grad_norm': 0.7025307382685239, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.56}
+0: {'loss': 0.6713, 'grad_norm': 0.6496061793260294, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.56}
+0: 1.53it/s]
56%|█████▌ | 5469/9770 [1:04:02<47:04, 1.52it/s]
56%|█████▌ | 5470/9770 [1:04:02<46:56, 1.53it/s]
56%|█████▌ | 5470/9770 [1:04:02<46:56, 1.53it/s]
56%|█████▌ | 5471/9770 [1:04:03<47:00, 1.52it/s]
56%|█████▌ | 5472/9770 [1:04:04<47:08, 1.52it/s]
56%|█████▌ | 5473/9770 [1:04:04<47:12, 1.52it/s]
56%|█████▌ | 5474/9770 [1:04:05<47:25, 1.51it/s]
56%|█████▌ | 5475/9770 [1:04:06<47:30, 1.51it/s]
56%|█████▌ | 5476/9770 [1:04:06<47:38, 1.50it/s]
56%|█████▌ | 5477/9770 [1:04:07<47:23, 1.51it/s]
56%|█████▌ | 5478/9770 [1:04:08<47:38, 1.50it/s]
56%|█████▌ | 5479/9770 [1:04:08<47:37, 1.50it/s]
56%|█████▌ | 5480/9770 [1:04:09<47:25, 1.51it/s]
56%|█████▌ | 5480/9770 [1:04:0
+0: {'loss': 0.654, 'grad_norm': 0.5842679073675652, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.56}
+0: 9<47:25, 1.51it/s]
56%|█████▌ | 5481/9770 [1:04:10<47:04, 1.52it/s]
56%|█████▌ | 5482/9770 [1:04:10<47:03, 1.52it/s]
56%|█████▌ | 5483/9770 [1:04:11<46:45, 1.53it/s]
56%|█████▌ | 5484/9770 [1:04:12<46:57, 1.52it/s]
56%|█████▌ | 5485/9770 [1:04:12<47:05, 1.52it/s]
56%|█████▌ | 5486/9770 [1:04:13<46:32, 1.53it/s]
56%|█████▌ | 5487/9770 [1:04:14<46:25, 1.54it/s]
56%|█████▌ | 5488/9770 [1:04:14<47:44, 1.49it/s]
56%|█████▌ | 5489/9770 [1:04:15<46:37, 1.53it/s]
56%|█████▌ | 5490/9770 [1:04:16<46:42, 1.53it/s]
56%|█████▌ | 5490/9770 [1:04:16<46:42, 1.53it/s]
56%|█████▌ | 5491/9770 [1:04:16<47:05, 1.51it/s]
56%|█████▌ | 5492/9770 [1:04:17<47:47, 1.49it/s]
56%|█████▌ | 5493/9770 [1:04:18<47:18, 1.51it/s]
56%|█████▌
+0: {'loss': 0.6859, 'grad_norm': 0.6692935819569421, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.56}
+0: | 5494/9770 [1:04:18<46:58, 1.52it/s]
56%|█████▌ | 5495/9770 [1:04:19<47:23, 1.50it/s]
56%|█████▋ | 5496/9770 [1:04:20<47:13, 1.51it/s]
56%|█████▋ | 5497/9770 [1:04:20<47:18, 1.51it/s]
56%|█████▋ | 5498/9770 [1:04:21<47:30, 1.50it/s]
56%|█████▋ | 5499/9770 [1:04:22<47:31, 1.50it/s]
56%|█████▋ | 5500/9770 [1:04:22<47:01, 1.51it/s]
56%|█████▋ | 5500/9770 [1:04:22<47:01, 1.51it/s]
56%|█████▋ | 5501/9770 [1:04:23<47:14, 1.51it/s]
56%|█████▋ | 5502/9770 [1:04:24<46:29, 1.53it/s]
56%|█████▋ | 5503/9770 [1:04:24<45:56, 1.55it/s]
56%|█████▋ | 5504/9770 [1:04:25<45:51, 1.55it/s]
56%|█████▋ | 5505/9770 [1:04:26<46:09, 1.54it/s]
56%|█████▋ | 5506/9770 [1:04:26<47:04, 1.51it/s]
56%|█████▋ | 5507/9770 [1:04:27<47:05, 1.51it/s]
56%
+0: {'loss': 0.6593, 'grad_norm': 0.6110359421557113, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.56}
+0: |█████▋ | 5508/9770 [1:04:28<47:04, 1.51it/s]
56%|█████▋ | 5509/9770 [1:04:28<46:45, 1.52it/s]
56%|█████▋ | 5510/9770 [1:04:29<46:32, 1.53it/s]
56%|█████▋ | 5510/9770 [1:04:29<46:32, 1.53it/s]
56%|█████▋ | 5511/9770 [1:04:30<46:55, 1.51it/s]
56%|█████▋ | 5512/9770 [1:04:30<46:52, 1.51it/s]
56%|█████▋ | 5513/9770 [1:04:31<47:08, 1.51it/s]
56%|█████▋ | 5514/9770 [1:04:32<47:02, 1.51it/s]
56%|█████▋ | 5515/9770 [1:04:32<47:23, 1.50it/s]
56%|█████▋ | 5516/9770 [1:04:33<46:56, 1.51it/s]
56%|█████▋ | 5517/9770 [1:04:34<46:39, 1.52it/s]
56%|█████▋ | 5518/9770 [1:04:34<46:38, 1.52it/s]
56%|█████▋ | 5519/9770 [1:04:35<46:57, 1.51it/s]
56%|█████▋ | 5520/9770 [1:04:35<46:46, 1.51it/s]
+0: {'loss': 0.674, 'grad_norm': 0.594650251780948, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.56}
+0: {'loss': 0.6588, 'grad_norm': 0.5886772556219194, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.57}
+0:
56%|█████▋ | 5520/9770 [1:04:36<46:46, 1.51it/s]
57%|█████▋ | 5521/9770 [1:04:36<47:07, 1.50it/s]
57%|█████▋ | 5522/9770 [1:04:37<46:20, 1.53it/s]
57%|█████▋ | 5523/9770 [1:04:37<46:30, 1.52it/s]
57%|█████▋ | 5524/9770 [1:04:38<46:47, 1.51it/s]
57%|█████▋ | 5525/9770 [1:04:39<46:36, 1.52it/s]
57%|█████▋ | 5526/9770 [1:04:39<46:18, 1.53it/s]
57%|█████▋ | 5527/9770 [1:04:40<46:53, 1.51it/s]
57%|█████▋ | 5528/9770 [1:04:41<47:09, 1.50it/s]
57%|█████▋ | 5529/9770 [1:04:41<46:53, 1.51it/s]
57%|█████▋ | 5530/9770 [1:04:42<46:51, 1.51it/s]
57%|█████▋ | 5530/9770 [1:04:42<46:51, 1.51it/s]
57%|█████▋ | 5531/9770 [1:04:43<47:33, 1.49it/s]
57%|█████▋ | 5532/9770 [1:04:44<48:03, 1.47it/s]
57%|█████▋ | 5533/9770
+0: {'loss': 0.6786, 'grad_norm': 0.6441412445192954, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.57}
+0: [1:04:44<47:26, 1.49it/s]
57%|█████▋ | 5534/9770 [1:04:45<47:09, 1.50it/s]
57%|█████▋ | 5535/9770 [1:04:45<46:54, 1.50it/s]
57%|█████▋ | 5536/9770 [1:04:46<46:25, 1.52it/s]
57%|█████▋ | 5537/9770 [1:04:47<45:42, 1.54it/s]
57%|█████▋ | 5538/9770 [1:04:47<46:07, 1.53it/s]
57%|█████▋ | 5539/9770 [1:04:48<46:38, 1.51it/s]
57%|█████▋ | 5540/9770 [1:04:49<46:43, 1.51it/s]
57%|█████▋ | 5540/9770 [1:04:49<46:43, 1.51it/s]
57%|█████▋ | 5541/9770 [1:04:49<46:45, 1.51it/s]
57%|█████▋ | 5542/9770 [1:04:50<46:48, 1.51it/s]
57%|█████▋ | 5543/9770 [1:04:51<46:25, 1.52it/s]
57%|█████▋ | 5544/9770 [1:04:51<46:44, 1.51it/s]
57%|█████▋ | 5545/9770 [1:04:52<47:37, 1.48it/s]
57%|█████▋ | 5546/9770 [1:04:53<47:19, 1.49it/s]
57%|████�
+0: {'loss': 0.6766, 'grad_norm': 0.5979832170617129, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.57}
+0: ��▋ | 5547/9770 [1:04:53<46:25, 1.52it/s]
57%|█████▋ | 5548/9770 [1:04:54<47:12, 1.49it/s]
57%|█████▋ | 5549/9770 [1:04:55<46:51, 1.50it/s]
57%|█████▋ | 5550/9770 [1:04:55<46:29, 1.51it/s]
57%|█████▋ | 5550/9770 [1:04:55<46:29, 1.51it/s]
57%|█████▋ | 5551/9770 [1:04:56<46:04, 1.53it/s]
57%|█████▋ | 5552/9770 [1:04:57<46:42, 1.51it/s]
57%|█████▋ | 5553/9770 [1:04:57<46:12, 1.52it/s]
57%|█████▋ | 5554/9770 [1:04:58<46:02, 1.53it/s]
57%|█████▋ | 5555/9770 [1:04:59<46:17, 1.52it/s]
57%|█████▋ | 5556/9770 [1:04:59<45:47, 1.53it/s]
57%|█████▋ | 5557/9770 [1:05:00<45:13, 1.55it/s]
57%|█████▋ | 5558/9770 [1:05:01<45:37, 1.54it/s]
57%|█████▋ | 5559/9770 [1:05:01<45:56, 1.53it/s]
57%|█████▋ | 5560/9770 [1:05:02<46:12, 1.52it/
+0: {'loss': 0.668, 'grad_norm': 0.6479565407748453, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.57}
+0: {'loss': 0.6977, 'grad_norm': 0.6371957492564437, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.57}
+0: s]
57%|█████▋ | 5560/9770 [1:05:02<46:12, 1.52it/s]
57%|█████▋ | 5561/9770 [1:05:03<46:08, 1.52it/s]
57%|█████▋ | 5562/9770 [1:05:03<46:12, 1.52it/s]
57%|█████▋ | 5563/9770 [1:05:04<46:34, 1.51it/s]
57%|█████▋ | 5564/9770 [1:05:05<46:00, 1.52it/s]
57%|█████▋ | 5565/9770 [1:05:05<46:04, 1.52it/s]
57%|█████▋ | 5566/9770 [1:05:06<45:54, 1.53it/s]
57%|█████▋ | 5567/9770 [1:05:07<46:08, 1.52it/s]
57%|█████▋ | 5568/9770 [1:05:07<46:25, 1.51it/s]
57%|█████▋ | 5569/9770 [1:05:08<45:57, 1.52it/s]
57%|█████▋ | 5570/9770 [1:05:09<45:48, 1.53it/s]
57%|█████▋ | 5570/9770 [1:05:09<45:48, 1.53it/s]
57%|█████▋ | 5571/9770 [1:05:09<46:17, 1.51it/s]
57%|█████▋ | 5572/9770 [1:05:10<46:28
+0: {'loss': 0.6499, 'grad_norm': 0.6183997452845711, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.57}
+0: , 1.51it/s]
57%|█████▋ | 5573/9770 [1:05:11<46:25, 1.51it/s]
57%|█████▋ | 5574/9770 [1:05:11<46:26, 1.51it/s]
57%|█████▋ | 5575/9770 [1:05:12<49:27, 1.41it/s]
57%|█████▋ | 5576/9770 [1:05:13<52:29, 1.33it/s]
57%|█████▋ | 5577/9770 [1:05:14<54:59, 1.27it/s]
57%|█████▋ | 5578/9770 [1:05:15<55:19, 1.26it/s]
57%|█████▋ | 5579/9770 [1:05:15<53:10, 1.31it/s]
57%|█████▋ | 5580/9770 [1:05:16<54:28, 1.28it/s]
57%|█████▋ | 5580/9770 [1:05:16<54:28, 1.28it/s]
57%|█████▋ | 5581/9770 [1:05:17<54:50, 1.27it/s]
57%|█████▋ | 5582/9770 [1:05:18<54:23, 1.28it/s]
57%|█████▋ | 5583/9770 [1:05:18<53:44, 1.30it/s]
57%|█████▋ | 5584/9770 [1:05:19<53:07, 1.31it/s]
57%|█████▋ | 5585/9770 [1:05:20<52:28, 1.33it/s]
57%|█████▋ | 558
+0: {'loss': 0.6525, 'grad_norm': 0.6537594655218257, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.57}
+0: 6/9770 [1:05:20<50:40, 1.38it/s]
57%|█████▋ | 5587/9770 [1:05:21<49:33, 1.41it/s]
57%|█████▋ | 5588/9770 [1:05:22<48:12, 1.45it/s]
57%|█████▋ | 5589/9770 [1:05:22<47:18, 1.47it/s]
57%|█████▋ | 5590/9770 [1:05:23<46:59, 1.48it/s]
57%|█████▋ | 5590/9770 [1:05:23<46:59, 1.48it/s]
57%|█████▋ | 5591/9770 [1:05:24<47:42, 1.46it/s]
57%|█████▋ | 5592/9770 [1:05:25<47:26, 1.47it/s]
57%|█████▋ | 5593/9770 [1:05:25<46:44, 1.49it/s]
57%|█████▋ | 5594/9770 [1:05:26<46:43, 1.49it/s]
57%|█████▋ | 5595/9770 [1:05:26<45:56, 1.51it/s]
57%|█████▋ | 5596/9770 [1:05:27<45:55, 1.51it/s]
57%|█████▋ | 5597/9770 [1:05:28<45:46, 1.52it/s]
57%|█████▋ | 5598/9770 [1:05:28<45:50, 1.52it/s]
57%|█████▋ | 5599/9770 [1:05:29<46:12, 1.50it/s]
57%|██
+0: {'loss': 0.6876, 'grad_norm': 0.6235942372326306, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.57}
+0: {'loss': 0.6784, 'grad_norm': 0.6644109009918524, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.57}
+0: ███▋ | 5600/9770 [1:05:30<45:42, 1.52it/s]
57%|█████▋ | 5600/9770 [1:05:30<45:42, 1.52it/s]
57%|█████▋ | 5601/9770 [1:05:30<46:05, 1.51it/s]
57%|█████▋ | 5602/9770 [1:05:31<46:35, 1.49it/s]
57%|█████▋ | 5603/9770 [1:05:32<46:44, 1.49it/s]
57%|█████▋ | 5604/9770 [1:05:32<46:08, 1.50it/s]
57%|█████▋ | 5605/9770 [1:05:33<46:01, 1.51it/s]
57%|█████▋ | 5606/9770 [1:05:34<46:00, 1.51it/s]
57%|█████▋ | 5607/9770 [1:05:34<46:16, 1.50it/s]
57%|█████▋ | 5608/9770 [1:05:35<46:35, 1.49it/s]
57%|█████▋ | 5609/9770 [1:05:36<45:56, 1.51it/s]
57%|█████▋ | 5610/9770 [1:05:36<46:57, 1.48it/s]
57%|█████▋ | 5610/9770 [1:05:36<46:57, 1.48it/s]
57%|█████▋ | 5611/9770 [1:05:37<46:35, 1.49it/s]
+0: {'loss': 0.6629, 'grad_norm': 0.6603418650841691, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.57}
+0: 57%|█████▋ | 5612/9770 [1:05:38<46:11, 1.50it/s]
57%|█████▋ | 5613/9770 [1:05:38<45:40, 1.52it/s]
57%|█████▋ | 5614/9770 [1:05:39<45:53, 1.51it/s]
57%|█████▋ | 5615/9770 [1:05:40<46:05, 1.50it/s]
57%|█████▋ | 5616/9770 [1:05:40<46:44, 1.48it/s]
57%|█████▋ | 5617/9770 [1:05:41<46:41, 1.48it/s]
58%|█████▊ | 5618/9770 [1:05:42<46:55, 1.47it/s]
58%|█████▊ | 5619/9770 [1:05:42<46:26, 1.49it/s]
58%|█████▊ | 5620/9770 [1:05:43<45:47, 1.51it/s]
58%|█████▊ | 5620/9770 [1:05:43<45:47, 1.51it/s]
58%|█████▊ | 5621/9770 [1:05:44<46:03, 1.50it/s]
58%|█████▊ | 5622/9770 [1:05:44<46:17, 1.49it/s]
58%|█████▊ | 5623/9770 [1:05:45<46:09, 1.50it/s]
58%|█████▊ | 5624/9770 [1:05:46<46:04, 1.50it/s]
58%|█████▊ | 5625/9770 [1:05:4
+0: {'loss': 0.6547, 'grad_norm': 0.6541120070414278, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.58}
+0: 6<45:53, 1.51it/s]
58%|█████▊ | 5626/9770 [1:05:47<46:09, 1.50it/s]
58%|█████▊ | 5627/9770 [1:05:48<45:57, 1.50it/s]
58%|█████▊ | 5628/9770 [1:05:48<46:14, 1.49it/s]
58%|█████▊ | 5629/9770 [1:05:49<46:58, 1.47it/s]
58%|█████▊ | 5630/9770 [1:05:50<46:44, 1.48it/s]
58%|█████▊ | 5630/9770 [1:05:50<46:44, 1.48it/s]
58%|█████▊ | 5631/9770 [1:05:51<46:43, 1.48it/s]
58%|█████▊ | 5632/9770 [1:05:51<46:52, 1.47it/s]
58%|█████▊ | 5633/9770 [1:05:52<46:28, 1.48it/s]
58%|█████▊ | 5634/9770 [1:05:53<46:08, 1.49it/s]
58%|█████▊ | 5635/9770 [1:05:53<45:45, 1.51it/s]
58%|█████▊ | 5636/9770 [1:05:54<45:44, 1.51it/s]
58%|█████▊ | 5637/9770 [1:05:55<45:30, 1.51it/s]
58%|█████▊ | 5638/9770 [1:05:55<45:04, 1.53it/s]
58%|█████▊
+0: {'loss': 0.6796, 'grad_norm': 0.6563746176347128, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.58}
+0: {'loss': 0.6493, 'grad_norm': 0.6319310754522596, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.58}
+0: | 5639/9770 [1:05:56<45:13, 1.52it/s]
58%|█████▊ | 5640/9770 [1:05:56<45:29, 1.51it/s]
58%|█████▊ | 5640/9770 [1:05:56<45:29, 1.51it/s]
58%|█████▊ | 5641/9770 [1:05:57<45:51, 1.50it/s]
58%|█████▊ | 5642/9770 [1:05:58<45:18, 1.52it/s]
58%|█████▊ | 5643/9770 [1:05:58<44:51, 1.53it/s]
58%|█████▊ | 5644/9770 [1:05:59<45:09, 1.52it/s]
58%|█████▊ | 5645/9770 [1:06:00<44:46, 1.54it/s]
58%|█████▊ | 5646/9770 [1:06:00<45:21, 1.52it/s]
58%|█████▊ | 5647/9770 [1:06:01<45:38, 1.51it/s]
58%|█████▊ | 5648/9770 [1:06:02<45:19, 1.52it/s]
58%|█████▊ | 5649/9770 [1:06:02<45:13, 1.52it/s]
58%|█████▊ | 5650/9770 [1:06:03<45:29, 1.51it/s]
58%|█████▊ | 5650/9770 [1:06:03<45:29, 1.51it/s]
58%|███�
+0: {'loss': 0.6733, 'grad_norm': 0.63236566476251, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.58}
+0: ��█▊ | 5651/9770 [1:06:04<45:18, 1.52it/s]
58%|█████▊ | 5652/9770 [1:06:04<46:22, 1.48it/s]
58%|█████▊ | 5653/9770 [1:06:05<45:29, 1.51it/s]
58%|█████▊ | 5654/9770 [1:06:06<45:14, 1.52it/s]
58%|█████▊ | 5655/9770 [1:06:06<45:02, 1.52it/s]
58%|█████▊ | 5656/9770 [1:06:07<45:27, 1.51it/s]
58%|█████▊ | 5657/9770 [1:06:08<45:40, 1.50it/s]
58%|█████▊ | 5658/9770 [1:06:08<45:31, 1.51it/s]
58%|█████▊ | 5659/9770 [1:06:09<45:30, 1.51it/s]
58%|█████▊ | 5660/9770 [1:06:10<45:17, 1.51it/s]
58%|█████▊ | 5660/9770 [1:06:10<45:17, 1.51it/s]
58%|█████▊ | 5661/9770 [1:06:10<45:38, 1.50it/s]
58%|█████▊ | 5662/9770 [1:06:11<45:25, 1.51it/s]
58%|█████▊ | 5663/9770 [1:06:12<45:51, 1.49it/s]
58%|█████▊ | 5664/9770 [1:06:12<45:57, 1.49
+0: {'loss': 0.6432, 'grad_norm': 0.5930895035110533, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.58}
+0: it/s]
58%|█████▊ | 5665/9770 [1:06:13<45:29, 1.50it/s]
58%|█████▊ | 5666/9770 [1:06:14<45:33, 1.50it/s]
58%|█████▊ | 5667/9770 [1:06:14<45:00, 1.52it/s]
58%|█████▊ | 5668/9770 [1:06:15<45:10, 1.51it/s]
58%|█████▊ | 5669/9770 [1:06:16<45:32, 1.50it/s]
58%|█████▊ | 5670/9770 [1:06:16<45:26, 1.50it/s]
58%|█████▊ | 5670/9770 [1:06:16<45:26, 1.50it/s]
58%|█████▊ | 5671/9770 [1:06:17<45:30, 1.50it/s]
58%|█████▊ | 5672/9770 [1:06:18<45:55, 1.49it/s]
58%|█████▊ | 5673/9770 [1:06:18<44:57, 1.52it/s]
58%|█████▊ | 5674/9770 [1:06:19<44:51, 1.52it/s]
58%|█████▊ | 5675/9770 [1:06:20<45:01, 1.52it/s]
58%|█████▊ | 5676/9770 [1:06:20<44:57, 1.52it/s]
58%|█████▊ | 5677/9770 [1:06:21<45:21, 1.50it/s]
58%|█████▊ | 5678/9770
+0: {'loss': 0.6543, 'grad_norm': 0.582187518779254, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.58}
+0: {'loss': 0.6744, 'grad_norm': 0.6236457720880778, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.58}
+0: [1:06:22<44:40, 1.53it/s]
58%|█████▊ | 5679/9770 [1:06:22<44:59, 1.52it/s]
58%|█████▊ | 5680/9770 [1:06:23<45:04, 1.51it/s]
58%|█████▊ | 5680/9770 [1:06:23<45:04, 1.51it/s]
58%|█████▊ | 5681/9770 [1:06:24<44:53, 1.52it/s]
58%|█████▊ | 5682/9770 [1:06:24<44:34, 1.53it/s]
58%|█████▊ | 5683/9770 [1:06:25<44:49, 1.52it/s]
58%|█████▊ | 5684/9770 [1:06:26<44:47, 1.52it/s]
58%|█████▊ | 5685/9770 [1:06:26<44:53, 1.52it/s]
58%|█████▊ | 5686/9770 [1:06:27<44:55, 1.52it/s]
58%|█████▊ | 5687/9770 [1:06:28<45:03, 1.51it/s]
58%|█████▊ | 5688/9770 [1:06:28<44:35, 1.53it/s]
58%|█████▊ | 5689/9770 [1:06:29<44:31, 1.53it/s]
58%|█████▊ | 5690/9770 [1:06:30<44:08, 1.54it/s]
58%|█████▊ |
+0: {'loss': 0.6832, 'grad_norm': 0.5858399871882379, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.58}
+0: 5690/9770 [1:06:30<44:08, 1.54it/s]
58%|█████▊ | 5691/9770 [1:06:30<44:09, 1.54it/s]
58%|█████▊ | 5692/9770 [1:06:31<44:15, 1.54it/s]
58%|█████▊ | 5693/9770 [1:06:31<44:38, 1.52it/s]
58%|█████▊ | 5694/9770 [1:06:32<44:45, 1.52it/s]
58%|█████▊ | 5695/9770 [1:06:33<44:13, 1.54it/s]
58%|█████▊ | 5696/9770 [1:06:33<44:30, 1.53it/s]
58%|█████▊ | 5697/9770 [1:06:34<43:53, 1.55it/s]
58%|█████▊ | 5698/9770 [1:06:35<44:21, 1.53it/s]
58%|█████▊ | 5699/9770 [1:06:35<44:25, 1.53it/s]
58%|█████▊ | 5700/9770 [1:06:36<44:22, 1.53it/s]
58%|█████▊ | 5700/9770 [1:06:36<44:22, 1.53it/s]
58%|█████▊ | 5701/9770 [1:06:37<44:52, 1.51it/s]
58%|█████▊ | 5702/9770 [1:06:37<45:04, 1.50it/s]
58%|█████▊ | 5703/9770 [1:06:38<45:06, 1.50it/s]
58%|█
+0: {'loss': 0.6815, 'grad_norm': 0.6216407289584691, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.58}
+0: ████▊ | 5704/9770 [1:06:39<44:48, 1.51it/s]
58%|█████▊ | 5705/9770 [1:06:39<44:52, 1.51it/s]
58%|█████▊ | 5706/9770 [1:06:40<44:20, 1.53it/s]
58%|█████▊ | 5707/9770 [1:06:41<43:53, 1.54it/s]
58%|█████▊ | 5708/9770 [1:06:41<44:04, 1.54it/s]
58%|█████▊ | 5709/9770 [1:06:42<44:06, 1.53it/s]
58%|█████▊ | 5710/9770 [1:06:43<44:19, 1.53it/s]
58%|█████▊ | 5710/9770 [1:06:43<44:19, 1.53it/s]
58%|█████▊ | 5711/9770 [1:06:43<44:34, 1.52it/s]
58%|█████▊ | 5712/9770 [1:06:44<44:32, 1.52it/s]
58%|█████▊ | 5713/9770 [1:06:45<44:19, 1.53it/s]
58%|█████▊ | 5714/9770 [1:06:45<43:55, 1.54it/s]
58%|█████▊ | 5715/9770 [1:06:46<43:32, 1.55it/s]
59%|█████▊ | 5716/9770 [1:06:47<43:43, 1.55it/s]
59%|█████▊ | 5717/9770 [1:06:47<43:43
+0: {'loss': 0.663, 'grad_norm': 0.6248991203726749, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.59}
+0: , 1.54it/s]
59%|█████▊ | 5718/9770 [1:06:48<43:33, 1.55it/s]
59%|█████▊ | 5719/9770 [1:06:48<43:15, 1.56it/s]
59%|█████▊ | 5720/9770 [1:06:49<44:24, 1.52it/s]
59%|█████▊ | 5720/9770 [1:06:49<44:24, 1.52it/s]
59%|█████▊ | 5721/9770 [1:06:50<44:44, 1.51it/s]
59%|█████▊ | 5722/9770 [1:06:50<44:17, 1.52it/s]
59%|█████▊ | 5723/9770 [1:06:51<44:34, 1.51it/s]
59%|█████▊ | 5724/9770 [1:06:52<44:20, 1.52it/s]
59%|█████▊ | 5725/9770 [1:06:52<44:16, 1.52it/s]
59%|█████▊ | 5726/9770 [1:06:53<44:49, 1.50it/s]
59%|█████▊ | 5727/9770 [1:06:54<45:06, 1.49it/s]
59%|█████▊ | 5728/9770 [1:06:54<44:46, 1.50it/s]
59%|█████▊ | 5729/9770 [1:06:55<44:36, 1.51it/s]
59%|█████▊ | 5730/9770 [1:06:56<44:31, 1.51it/s]
+0: {'loss': 0.6482, 'grad_norm': 0.6149293927060516, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.59}
+0: {'loss': 0.6356, 'grad_norm': 0.6041723375141027, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.59}
+0:
59%|█████▊ | 5730/9770 [1:06:56<44:31, 1.51it/s]
59%|█████▊ | 5731/9770 [1:06:56<44:00, 1.53it/s]
59%|█████▊ | 5732/9770 [1:06:57<43:37, 1.54it/s]
59%|█████▊ | 5733/9770 [1:06:58<43:47, 1.54it/s]
59%|█████▊ | 5734/9770 [1:06:58<44:03, 1.53it/s]
59%|█████▊ | 5735/9770 [1:06:59<44:10, 1.52it/s]
59%|█████▊ | 5736/9770 [1:07:00<44:09, 1.52it/s]
59%|█████▊ | 5737/9770 [1:07:00<43:48, 1.53it/s]
59%|█████▊ | 5738/9770 [1:07:01<43:55, 1.53it/s]
59%|█████▊ | 5739/9770 [1:07:02<43:50, 1.53it/s]
59%|█████▉ | 5740/9770 [1:07:02<43:54, 1.53it/s]
59%|█████▉ | 5740/9770 [1:07:02<43:54, 1.53it/s]
59%|█████▉ | 5741/9770 [1:07:03<44:22, 1.51it/s]
59%|█████▉ | 5742/9770 [1:07:04<44:30, 1.51it/s]
59%|█████�
+0: {'loss': 0.6721, 'grad_norm': 0.6561568108484856, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.59}
+0: � | 5743/9770 [1:07:04<44:35, 1.51it/s]
59%|█████▉ | 5744/9770 [1:07:05<44:22, 1.51it/s]
59%|█████▉ | 5745/9770 [1:07:06<44:55, 1.49it/s]
59%|█████▉ | 5746/9770 [1:07:06<44:33, 1.50it/s]
59%|█████▉ | 5747/9770 [1:07:07<44:15, 1.52it/s]
59%|█████▉ | 5748/9770 [1:07:08<44:14, 1.52it/s]
59%|█████▉ | 5749/9770 [1:07:08<44:21, 1.51it/s]
59%|█████▉ | 5750/9770 [1:07:09<44:29, 1.51it/s]
59%|█████▉ | 5750/9770 [1:07:09<44:29, 1.51it/s]
59%|█████▉ | 5751/9770 [1:07:10<45:23, 1.48it/s]
59%|█████▉ | 5752/9770 [1:07:10<45:24, 1.47it/s]
59%|█████▉ | 5753/9770 [1:07:11<44:55, 1.49it/s]
59%|█████▉ | 5754/9770 [1:07:12<44:14, 1.51it/s]
59%|█████▉ | 5755/9770 [1:07:12<44:07, 1.52it/s]
59%|█████▉ | 5756/9770 [1:07:13<44:15, 1.51it/s]
+0: {'loss': 0.697, 'grad_norm': 0.5842000879747412, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.59}
+0: 59%|█████▉ | 5757/9770 [1:07:14<44:53, 1.49it/s]
59%|█████▉ | 5758/9770 [1:07:14<44:45, 1.49it/s]
59%|█████▉ | 5759/9770 [1:07:15<44:32, 1.50it/s]
59%|█████▉ | 5760/9770 [1:07:16<44:22, 1.51it/s]
59%|█████▉ | 5760/9770 [1:07:16<44:22, 1.51it/s]
59%|█████▉ | 5761/9770 [1:07:16<44:26, 1.50it/s]
59%|█████▉ | 5762/9770 [1:07:17<44:32, 1.50it/s]
59%|█████▉ | 5763/9770 [1:07:18<43:52, 1.52it/s]
59%|█████▉ | 5764/9770 [1:07:18<44:53, 1.49it/s]
59%|█████▉ | 5765/9770 [1:07:19<44:31, 1.50it/s]
59%|█████▉ | 5766/9770 [1:07:20<44:13, 1.51it/s]
59%|█████▉ | 5767/9770 [1:07:20<44:03, 1.51it/s]
59%|█████▉ | 5768/9770 [1:07:21<43:45, 1.52it/s]
59%|█████▉ | 5769/9770 [1:07:22<43:36, 1.53it/s]
59%|█████▉ | 5770/9770 [1:07:2
+0: {'loss': 0.6646, 'grad_norm': 0.6418740986255856, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.59}
+0: {'loss': 0.6435, 'grad_norm': 0.6081004975742742, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.59}
+0: 2<43:44, 1.52it/s]
59%|█████▉ | 5770/9770 [1:07:22<43:44, 1.52it/s]
59%|█████▉ | 5771/9770 [1:07:23<43:50, 1.52it/s]
59%|█████▉ | 5772/9770 [1:07:24<43:34, 1.53it/s]
59%|█████▉ | 5773/9770 [1:07:24<43:27, 1.53it/s]
59%|█████▉ | 5774/9770 [1:07:25<44:34, 1.49it/s]
59%|█████▉ | 5775/9770 [1:07:26<44:28, 1.50it/s]
59%|█████▉ | 5776/9770 [1:07:26<44:08, 1.51it/s]
59%|█████▉ | 5777/9770 [1:07:27<43:30, 1.53it/s]
59%|█████▉ | 5778/9770 [1:07:27<43:17, 1.54it/s]
59%|█████▉ | 5779/9770 [1:07:28<43:08, 1.54it/s]
59%|█████▉ | 5780/9770 [1:07:29<43:03, 1.54it/s]
59%|█████▉ | 5780/9770 [1:07:29<43:03, 1.54it/s]
59%|█████▉ | 5781/9770 [1:07:29<43:34, 1.53it/s]
59%|█████▉ | 5782/97
+0: {'loss': 0.6575, 'grad_norm': 0.6151352129951854, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.59}
+0: 70 [1:07:30<43:19, 1.53it/s]
59%|█████▉ | 5783/9770 [1:07:31<43:03, 1.54it/s]
59%|█████▉ | 5784/9770 [1:07:31<42:38, 1.56it/s]
59%|█████▉ | 5785/9770 [1:07:32<42:51, 1.55it/s]
59%|█████▉ | 5786/9770 [1:07:33<42:57, 1.55it/s]
59%|█████▉ | 5787/9770 [1:07:33<43:03, 1.54it/s]
59%|█████▉ | 5788/9770 [1:07:34<43:01, 1.54it/s]
59%|█████▉ | 5789/9770 [1:07:35<44:04, 1.51it/s]
59%|█████▉ | 5790/9770 [1:07:35<43:15, 1.53it/s]
59%|█████▉ | 5790/9770 [1:07:35<43:15, 1.53it/s]
59%|█████▉ | 5791/9770 [1:07:36<43:06, 1.54it/s]
59%|█████▉ | 5792/9770 [1:07:37<43:10, 1.54it/s]
59%|█████▉ | 5793/9770 [1:07:37<43:59, 1.51it/s]
59%|█████▉ | 5794/9770 [1:07:38<44:41, 1.48it/s]
59%|█████▉ | 5795/9770 [1:07:39<44:05, 1.50it/s]
59%|███�
+0: {'loss': 0.6624, 'grad_norm': 0.6189362010778219, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.59}
+0: ��█▉ | 5796/9770 [1:07:39<43:49, 1.51it/s]
59%|█████▉ | 5797/9770 [1:07:40<43:47, 1.51it/s]
59%|█████▉ | 5798/9770 [1:07:41<43:16, 1.53it/s]
59%|█████▉ | 5799/9770 [1:07:41<43:20, 1.53it/s]
59%|█████▉ | 5800/9770 [1:07:42<43:45, 1.51it/s]
59%|█████▉ | 5800/9770 [1:07:42<43:45, 1.51it/s]
59%|█████▉ | 5801/9770 [1:07:43<43:50, 1.51it/s]
59%|█████▉ | 5802/9770 [1:07:43<43:38, 1.52it/s]
59%|█████▉ | 5803/9770 [1:07:44<43:16, 1.53it/s]
59%|█████▉ | 5804/9770 [1:07:45<43:13, 1.53it/s]
59%|█████▉ | 5805/9770 [1:07:45<43:19, 1.53it/s]
59%|█████▉ | 5806/9770 [1:07:46<43:04, 1.53it/s]
59%|█████▉ | 5807/9770 [1:07:46<42:55, 1.54it/s]
59%|█████▉ | 5808/9770 [1:07:47<42:46, 1.54it/s]
59%|█████▉ | 5809/9770 [1:07:48<42:53, 1.54
+0: {'loss': 0.6775, 'grad_norm': 0.6080866405818461, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.59}
+0: {'loss': 0.6701, 'grad_norm': 0.6308195833394955, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.6}
+0: it/s]
59%|█████▉ | 5810/9770 [1:07:48<43:18, 1.52it/s]
59%|█████▉ | 5810/9770 [1:07:48<43:18, 1.52it/s]
59%|█████▉ | 5811/9770 [1:07:49<44:23, 1.49it/s]
59%|█████▉ | 5812/9770 [1:07:50<44:06, 1.50it/s]
59%|█████▉ | 5813/9770 [1:07:50<43:27, 1.52it/s]
60%|█████▉ | 5814/9770 [1:07:51<43:08, 1.53it/s]
60%|█████▉ | 5815/9770 [1:07:52<42:56, 1.54it/s]
60%|█████▉ | 5816/9770 [1:07:52<44:13, 1.49it/s]
60%|█████▉ | 5817/9770 [1:07:53<43:19, 1.52it/s]
60%|█████▉ | 5818/9770 [1:07:54<43:23, 1.52it/s]
60%|█████▉ | 5819/9770 [1:07:54<43:46, 1.50it/s]
60%|█████▉ | 5820/9770 [1:07:55<43:19, 1.52it/s]
60%|█████▉ | 5820/9770 [1:07:55<43:19, 1.52it/s]
60%|████���▉ | 5821/9770 [1:07:56<43
+0: {'loss': 0.678, 'grad_norm': 0.6136074934812409, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.6}
+0: :17, 1.52it/s]
60%|█████▉ | 5822/9770 [1:07:56<44:17, 1.49it/s]
60%|█████▉ | 5823/9770 [1:07:57<45:59, 1.43it/s]
60%|█████▉ | 5824/9770 [1:07:58<49:07, 1.34it/s]
60%|█████▉ | 5825/9770 [1:07:59<49:53, 1.32it/s]
60%|█████▉ | 5826/9770 [1:08:00<52:46, 1.25it/s]
60%|█████▉ | 5827/9770 [1:08:00<51:04, 1.29it/s]
60%|█████▉ | 5828/9770 [1:08:01<49:13, 1.33it/s]
60%|█████▉ | 5829/9770 [1:08:02<50:33, 1.30it/s]
60%|█████▉ | 5830/9770 [1:08:03<52:00, 1.26it/s]
60%|█████▉ | 5830/9770 [1:08:03<52:00, 1.26it/s]
60%|█████▉ | 5831/9770 [1:08:04<1:00:53, 1.08it/s]
60%|█████▉ | 5832/9770 [1:08:05<1:02:05, 1.06it/s]
60%|█████▉ | 5833/9770 [1:08:06<1:08:49, 1.05s/it]
60%|█████▉ | 5834/9770 [1:08:07<1:10:00, 1.07s/it]
60%|█████�
+0: {'loss': 0.6791, 'grad_norm': 0.6501062341965902, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.6}
+0: �� | 5835/9770 [1:08:08<1:07:14, 1.03s/it]
60%|█████▉ | 5836/9770 [1:08:10<1:10:38, 1.08s/it]
60%|█████▉ | 5837/9770 [1:08:11<1:15:59, 1.16s/it]
60%|█████▉ | 5838/9770 [1:08:12<1:10:48, 1.08s/it]
60%|█████▉ | 5839/9770 [1:08:12<1:02:47, 1.04it/s]
60%|█████▉ | 5840/9770 [1:08:13<56:41, 1.16it/s]
60%|█████▉ | 5840/9770 [1:08:13<56:41, 1.16it/s]
60%|█████▉ | 5841/9770 [1:08:14<52:42, 1.24it/s]
60%|█████▉ | 5842/9770 [1:08:14<49:21, 1.33it/s]
60%|█████▉ | 5843/9770 [1:08:15<48:33, 1.35it/s]
60%|█████▉ | 5844/9770 [1:08:16<46:54, 1.39it/s]
60%|█████▉ | 5845/9770 [1:08:16<45:39, 1.43it/s]
60%|█████▉ | 5846/9770 [1:08:17<44:54, 1.46it/s]
60%|█████▉ | 5847/9770 [1:08:18<44:02, 1.48it/s]
60%|█████▉ | 5848/9770 [1:08:18<43:48,
+0: {'loss': 0.6844, 'grad_norm': 0.6291475157307491, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.6}
+0: {'loss': 0.6621, 'grad_norm': 0.6156725856265245, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.6}
+0: 1.49it/s]
60%|█████▉ | 5849/9770 [1:08:19<43:14, 1.51it/s]
60%|█████▉ | 5850/9770 [1:08:20<43:02, 1.52it/s]
60%|█████▉ | 5850/9770 [1:08:20<43:02, 1.52it/s]
60%|█████▉ | 5851/9770 [1:08:20<43:11, 1.51it/s]
60%|█████▉ | 5852/9770 [1:08:21<42:28, 1.54it/s]
60%|█████▉ | 5853/9770 [1:08:22<42:42, 1.53it/s]
60%|█████▉ | 5854/9770 [1:08:22<42:16, 1.54it/s]
60%|█████▉ | 5855/9770 [1:08:23<43:23, 1.50it/s]
60%|█████▉ | 5856/9770 [1:08:24<43:28, 1.50it/s]
60%|█████▉ | 5857/9770 [1:08:24<43:25, 1.50it/s]
60%|█████▉ | 5858/9770 [1:08:25<43:13, 1.51it/s]
60%|█████▉ | 5859/9770 [1:08:26<43:06, 1.51it/s]
60%|█████▉ | 5860/9770 [1:08:26<43:21, 1.50it/s]
60%|█████▉ | 5860/9770 [1:08
+0: [2025-09-02 21:04:29,557] [INFO] [axolotl.core.trainers.base._save:613] [PID:3622631] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1/checkpoint-5862[39m
+0: [2025-09-02 21:04:30,494] [INFO] [axolotl.core.trainers.base._save:662] [PID:3622631] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m
+0: {'loss': 0.6401, 'grad_norm': 0.592768214859416, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.6}
+0: :26<43:21, 1.50it/s]
60%|█████▉ | 5861/9770 [1:08:27<43:25, 1.50it/s]
60%|██████ | 5862/9770 [1:08:28<43:04, 1.51it/s]
60%|██████ | 5863/9770 [1:08:31<1:27:38, 1.35s/it]
60%|██████ | 5864/9770 [1:08:31<1:13:39, 1.13s/it]
60%|██████ | 5865/9770 [1:08:32<1:03:57, 1.02it/s]
60%|██████ | 5866/9770 [1:08:33<57:47, 1.13it/s]
60%|██████ | 5867/9770 [1:08:33<52:57, 1.23it/s]
60%|██████ | 5868/9770 [1:08:34<50:43, 1.28it/s]
60%|██████ | 5869/9770 [1:08:35<48:56, 1.33it/s]
60%|██████ | 5870/9770 [1:08:35<46:34, 1.40it/s]
60%|██████ | 5870/9770 [1:08:35<46:34, 1.40it/s]
60%|██████ | 5871/9770 [1:08:36<45:28, 1.43it/s]
60%|██████ | 5872/9770 [1:08:36<44:30, 1.46it/s]
60%|██████ | 5873/9770 [1:08:37<43:36, 1.49it/s]
60%|███�
+0: {'loss': 0.6718, 'grad_norm': 0.6318186032419443, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.6}
+0: ��██ | 5874/9770 [1:08:38<42:43, 1.52it/s]
60%|██████ | 5875/9770 [1:08:38<42:47, 1.52it/s]
60%|██████ | 5876/9770 [1:08:39<43:36, 1.49it/s]
60%|██████ | 5877/9770 [1:08:40<42:55, 1.51it/s]
60%|██████ | 5878/9770 [1:08:40<42:26, 1.53it/s]
60%|██████ | 5879/9770 [1:08:41<42:00, 1.54it/s]
60%|██████ | 5880/9770 [1:08:42<42:20, 1.53it/s]
60%|██████ | 5880/9770 [1:08:42<42:20, 1.53it/s]
60%|██████ | 5881/9770 [1:08:42<42:17, 1.53it/s]
60%|██████ | 5882/9770 [1:08:43<42:36, 1.52it/s]
60%|██████ | 5883/9770 [1:08:44<42:29, 1.52it/s]
60%|██████ | 5884/9770 [1:08:44<41:44, 1.55it/s]
60%|██████ | 5885/9770 [1:08:45<42:13, 1.53it/s]
60%|██████ | 5886/9770 [1:08:46<43:13, 1.50it/s]
60%|██████ | 5887/9770 [1:08:46<43:10, 1.50
+0: {'loss': 0.6802, 'grad_norm': 0.6351110163673325, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.6}
+0: it/s]
60%|██████ | 5888/9770 [1:08:47<42:46, 1.51it/s]
60%|██████ | 5889/9770 [1:08:48<42:15, 1.53it/s]
60%|██████ | 5890/9770 [1:08:48<41:46, 1.55it/s]
60%|██████ | 5890/9770 [1:08:48<41:46, 1.55it/s]
60%|██████ | 5891/9770 [1:08:49<41:54, 1.54it/s]
60%|██████ | 5892/9770 [1:08:50<42:13, 1.53it/s]
60%|██████ | 5893/9770 [1:08:50<41:47, 1.55it/s]
60%|██████ | 5894/9770 [1:08:51<42:34, 1.52it/s]
60%|██████ | 5895/9770 [1:08:52<42:31, 1.52it/s]
60%|██████ | 5896/9770 [1:08:52<42:14, 1.53it/s]
60%|██████ | 5897/9770 [1:08:53<42:18, 1.53it/s]
60%|██████ | 5898/9770 [1:08:53<42:20, 1.52it/s]
60%|██████ | 5899/9770 [1:08:54<41:56, 1.54it/s]
60%|██████ | 5900/9770 [1:08:55<41:56, 1.54it/s]
+0: {'loss': 0.652, 'grad_norm': 0.6338496819624907, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.6}
+0: {'loss': 0.6647, 'grad_norm': 0.626124748800393, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.6}
+0:
60%|██████ | 5900/9770 [1:08:55<41:56, 1.54it/s]
60%|██████ | 5901/9770 [1:08:55<41:56, 1.54it/s]
60%|██████ | 5902/9770 [1:08:56<41:44, 1.54it/s]
60%|██████ | 5903/9770 [1:08:57<41:59, 1.53it/s]
60%|██████ | 5904/9770 [1:08:57<41:37, 1.55it/s]
60%|██████ | 5905/9770 [1:08:58<41:28, 1.55it/s]
60%|██████ | 5906/9770 [1:08:59<41:00, 1.57it/s]
60%|██████ | 5907/9770 [1:08:59<41:22, 1.56it/s]
60%|██████ | 5908/9770 [1:09:00<41:21, 1.56it/s]
60%|██████ | 5909/9770 [1:09:01<41:19, 1.56it/s]
60%|██████ | 5910/9770 [1:09:01<41:10, 1.56it/s]
60%|██████ | 5910/9770 [1:09:01<41:10, 1.56it/s]
61%|██████ | 5911/9770 [1:09:02<40:52, 1.57it/s]
61%|██████ | 5912/9770 [1:09:03<41:40, 1.54it/s]
61%|██████ |
+0: {'loss': 0.6969, 'grad_norm': 0.6676084137739385, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.61}
+0: 5913/9770 [1:09:03<42:52, 1.50it/s]
61%|██████ | 5914/9770 [1:09:04<42:26, 1.51it/s]
61%|██████ | 5915/9770 [1:09:04<41:58, 1.53it/s]
61%|██████ | 5916/9770 [1:09:05<41:39, 1.54it/s]
61%|██████ | 5917/9770 [1:09:06<41:55, 1.53it/s]
61%|██████ | 5918/9770 [1:09:06<42:38, 1.51it/s]
61%|██████ | 5919/9770 [1:09:07<42:23, 1.51it/s]
61%|██████ | 5920/9770 [1:09:08<43:03, 1.49it/s]
61%|██████ | 5920/9770 [1:09:08<43:03, 1.49it/s]
61%|██████ | 5921/9770 [1:09:08<42:52, 1.50it/s]
61%|██████ | 5922/9770 [1:09:09<42:19, 1.52it/s]
61%|██████ | 5923/9770 [1:09:10<42:18, 1.52it/s]
61%|██████ | 5924/9770 [1:09:10<42:34, 1.51it/s]
61%|██████ | 5925/9770 [1:09:11<42:16, 1.52it/s]
61%|██████ | 5926/9770 [1:09:12<42:00, 1.53it/s]
61%|█
+0: {'loss': 0.6614, 'grad_norm': 0.6359984224492452, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.61}
+0: █████ | 5927/9770 [1:09:12<42:20, 1.51it/s]
61%|██████ | 5928/9770 [1:09:13<42:13, 1.52it/s]
61%|██████ | 5929/9770 [1:09:14<42:03, 1.52it/s]
61%|██████ | 5930/9770 [1:09:14<41:45, 1.53it/s]
61%|██████ | 5930/9770 [1:09:14<41:45, 1.53it/s]
61%|██████ | 5931/9770 [1:09:15<41:40, 1.54it/s]
61%|██████ | 5932/9770 [1:09:16<41:47, 1.53it/s]
61%|██████ | 5933/9770 [1:09:16<42:08, 1.52it/s]
61%|██████ | 5934/9770 [1:09:17<42:16, 1.51it/s]
61%|██████ | 5935/9770 [1:09:18<41:27, 1.54it/s]
61%|██████ | 5936/9770 [1:09:18<41:10, 1.55it/s]
61%|██████ | 5937/9770 [1:09:19<41:07, 1.55it/s]
61%|██████ | 5938/9770 [1:09:20<40:44, 1.57it/s]
61%|██████ | 5939/9770 [1:09:20<41:18, 1.55it/s]
61%|██████ | 5940/9770 [1:09:21<41:15
+0: {'loss': 0.6483, 'grad_norm': 0.6270648944919128, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.61}
+0: {'loss': 0.6561, 'grad_norm': 0.6461598273167389, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.61}
+0: , 1.55it/s]
61%|██████ | 5940/9770 [1:09:21<41:15, 1.55it/s]
61%|██████ | 5941/9770 [1:09:22<41:40, 1.53it/s]
61%|██████ | 5942/9770 [1:09:22<41:25, 1.54it/s]
61%|██████ | 5943/9770 [1:09:23<41:30, 1.54it/s]
61%|██████ | 5944/9770 [1:09:23<41:32, 1.53it/s]
61%|██████ | 5945/9770 [1:09:24<41:39, 1.53it/s]
61%|██████ | 5946/9770 [1:09:25<41:13, 1.55it/s]
61%|██████ | 5947/9770 [1:09:25<41:45, 1.53it/s]
61%|██████ | 5948/9770 [1:09:26<41:34, 1.53it/s]
61%|██████ | 5949/9770 [1:09:27<41:42, 1.53it/s]
61%|██████ | 5950/9770 [1:09:27<41:48, 1.52it/s]
61%|██████ | 5950/9770 [1:09:27<41:48, 1.52it/s]
61%|██████ | 5951/9770 [1:09:28<41:56, 1.52it/s]
61%|██████ | 5952/9770 [1:0
+0: {'loss': 0.6657, 'grad_norm': 0.6213399572232323, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.61}
+0: 9:29<41:12, 1.54it/s]
61%|██████ | 5953/9770 [1:09:29<41:03, 1.55it/s]
61%|██████ | 5954/9770 [1:09:30<40:56, 1.55it/s]
61%|██████ | 5955/9770 [1:09:31<41:28, 1.53it/s]
61%|██████ | 5956/9770 [1:09:31<42:10, 1.51it/s]
61%|██████ | 5957/9770 [1:09:32<42:11, 1.51it/s]
61%|██████ | 5958/9770 [1:09:33<41:22, 1.54it/s]
61%|██████ | 5959/9770 [1:09:33<41:40, 1.52it/s]
61%|██████ | 5960/9770 [1:09:34<41:27, 1.53it/s]
61%|██████ | 5960/9770 [1:09:34<41:27, 1.53it/s]
61%|██████ | 5961/9770 [1:09:35<42:19, 1.50it/s]
61%|██████ | 5962/9770 [1:09:35<42:10, 1.51it/s]
61%|██████ | 5963/9770 [1:09:36<42:21, 1.50it/s]
61%|██████ | 5964/9770 [1:09:37<41:59, 1.51it/s]
61%|██████ | 5965/9770 [1:09:37<41:30, 1.53it/s]
61%|█████�
+0: {'loss': 0.6767, 'grad_norm': 0.6061871593456278, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.61}
+0: � | 5966/9770 [1:09:38<40:50, 1.55it/s]
61%|██████ | 5967/9770 [1:09:39<41:27, 1.53it/s]
61%|██████ | 5968/9770 [1:09:39<41:14, 1.54it/s]
61%|██████ | 5969/9770 [1:09:40<41:09, 1.54it/s]
61%|██████ | 5970/9770 [1:09:40<40:50, 1.55it/s]
61%|██████ | 5970/9770 [1:09:40<40:50, 1.55it/s]
61%|██████ | 5971/9770 [1:09:41<41:15, 1.53it/s]
61%|██████ | 5972/9770 [1:09:42<41:11, 1.54it/s]
61%|██████ | 5973/9770 [1:09:42<41:10, 1.54it/s]
61%|██████ | 5974/9770 [1:09:43<41:05, 1.54it/s]
61%|██████ | 5975/9770 [1:09:44<41:41, 1.52it/s]
61%|██████ | 5976/9770 [1:09:44<42:27, 1.49it/s]
61%|██████ | 5977/9770 [1:09:45<42:04, 1.50it/s]
61%|██████ | 5978/9770 [1:09:46<42:34, 1.48it/s]
61%|██████ | 5979/9770 [1:09:46<42:16, 1.49it/s]
+0: {'loss': 0.6565, 'grad_norm': 0.6108349498367607, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.61}
+0: {'loss': 0.6877, 'grad_norm': 0.6891945262851742, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.61}
+0: 61%|██████ | 5980/9770 [1:09:47<41:41, 1.52it/s]
61%|██████ | 5980/9770 [1:09:47<41:41, 1.52it/s]
61%|██████ | 5981/9770 [1:09:48<42:43, 1.48it/s]
61%|██████ | 5982/9770 [1:09:48<42:13, 1.50it/s]
61%|██████ | 5983/9770 [1:09:49<42:16, 1.49it/s]
61%|██████ | 5984/9770 [1:09:50<41:35, 1.52it/s]
61%|██████▏ | 5985/9770 [1:09:50<41:31, 1.52it/s]
61%|██████▏ | 5986/9770 [1:09:51<41:39, 1.51it/s]
61%|██████▏ | 5987/9770 [1:09:52<41:20, 1.53it/s]
61%|██████▏ | 5988/9770 [1:09:52<40:55, 1.54it/s]
61%|██████▏ | 5989/9770 [1:09:53<41:43, 1.51it/s]
61%|██████▏ | 5990/9770 [1:09:54<41:30, 1.52it/s]
61%|██████▏ | 5990/9770 [1:09:54<41:30, 1.52it/s]
61%|██████▏ | 5991/9770 [1
+0: {'loss': 0.6837, 'grad_norm': 0.7839686916696933, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.61}
+0: :09:54<41:09, 1.53it/s]
61%|██████▏ | 5992/9770 [1:09:55<41:11, 1.53it/s]
61%|██████▏ | 5993/9770 [1:09:56<40:59, 1.54it/s]
61%|██████▏ | 5994/9770 [1:09:56<41:09, 1.53it/s]
61%|██████▏ | 5995/9770 [1:09:57<41:13, 1.53it/s]
61%|██████▏ | 5996/9770 [1:09:58<41:04, 1.53it/s]
61%|██████▏ | 5997/9770 [1:09:58<40:42, 1.54it/s]
61%|██████▏ | 5998/9770 [1:09:59<40:37, 1.55it/s]
61%|██████▏ | 5999/9770 [1:10:00<40:45, 1.54it/s]
61%|██████▏ | 6000/9770 [1:10:00<41:18, 1.52it/s]
61%|██████▏ | 6000/9770 [1:10:00<41:18, 1.52it/s]
61%|██████▏ | 6001/9770 [1:10:01<41:25, 1.52it/s]
61%|██████▏ | 6002/9770 [1:10:02<41:11, 1.52it/s]
61%|██████▏ | 6003/9770 [1:10:02<40:38, 1.55it/s]
61%|██████▏ | 6004/9770 [1:10:03<41:01, 1.
+0: {'loss': 0.656, 'grad_norm': 0.6488306883856158, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.61}
+0: 53it/s]
61%|██████▏ | 6005/9770 [1:10:03<40:35, 1.55it/s]
61%|██████▏ | 6006/9770 [1:10:04<40:21, 1.55it/s]
61%|██████▏ | 6007/9770 [1:10:05<40:39, 1.54it/s]
61%|██████▏ | 6008/9770 [1:10:05<40:51, 1.53it/s]
62%|██████▏ | 6009/9770 [1:10:06<40:40, 1.54it/s]
62%|██████▏ | 6010/9770 [1:10:07<41:25, 1.51it/s]
62%|██████▏ | 6010/9770 [1:10:07<41:25, 1.51it/s]
62%|██████▏ | 6011/9770 [1:10:07<41:12, 1.52it/s]
62%|██████▏ | 6012/9770 [1:10:08<41:13, 1.52it/s]
62%|██████▏ | 6013/9770 [1:10:09<40:53, 1.53it/s]
62%|██████▏ | 6014/9770 [1:10:09<41:00, 1.53it/s]
62%|██████▏ | 6015/9770 [1:10:10<41:42, 1.50it/s]
62%|██████▏ | 6016/9770 [1:10:11<41:30, 1.51it/s]
62%|██████▏ | 6017/9770 [1:10:11<41:13, 1.52it/s]
62%|█�
+0: {'loss': 0.6637, 'grad_norm': 0.6296898968920162, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.62}
+0: ��████▏ | 6018/9770 [1:10:12<41:16, 1.52it/s]
62%|██████▏ | 6019/9770 [1:10:13<41:39, 1.50it/s]
62%|██████▏ | 6020/9770 [1:10:13<41:34, 1.50it/s]
62%|██████▏ | 6020/9770 [1:10:13<41:34, 1.50it/s]
62%|██████▏ | 6021/9770 [1:10:14<41:04, 1.52it/s]
62%|██████▏ | 6022/9770 [1:10:15<40:54, 1.53it/s]
62%|██████▏ | 6023/9770 [1:10:15<40:34, 1.54it/s]
62%|██████▏ | 6024/9770 [1:10:16<40:30, 1.54it/s]
62%|██████▏ | 6025/9770 [1:10:17<40:24, 1.54it/s]
62%|██████▏ | 6026/9770 [1:10:17<40:41, 1.53it/s]
62%|██████▏ | 6027/9770 [1:10:18<40:29, 1.54it/s]
62%|██████▏ | 6028/9770 [1:10:19<40:15, 1.55it/s]
62%|██████▏ | 6029/9770 [1:10:19<40:11, 1.55it/s]
62%|██████▏ | 6030/9770 [1:10:20<40:36, 1.54it/s]
+0: {'loss': 0.6787, 'grad_norm': 0.6218360050616779, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.62}
+0: {'loss': 0.6545, 'grad_norm': 0.6268304939226396, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.62}
+0:
62%|██████▏ | 6030/9770 [1:10:20<40:36, 1.54it/s]
62%|██████▏ | 6031/9770 [1:10:21<40:44, 1.53it/s]
62%|██████▏ | 6032/9770 [1:10:21<40:41, 1.53it/s]
62%|██████▏ | 6033/9770 [1:10:22<41:27, 1.50it/s]
62%|██████▏ | 6034/9770 [1:10:22<40:39, 1.53it/s]
62%|██████▏ | 6035/9770 [1:10:23<40:33, 1.53it/s]
62%|██████▏ | 6036/9770 [1:10:24<40:05, 1.55it/s]
62%|██████▏ | 6037/9770 [1:10:24<41:06, 1.51it/s]
62%|██████▏ | 6038/9770 [1:10:25<41:12, 1.51it/s]
62%|██████▏ | 6039/9770 [1:10:26<40:54, 1.52it/s]
62%|██████▏ | 6040/9770 [1:10:26<40:31, 1.53it/s]
62%|██████▏ | 6040/9770 [1:10:26<40:31, 1.53it/s]
62%|██████▏ | 6041/9770 [1:10:27<40:22, 1.54it/s]
62%|██████▏ | 6042/9770 [1:10:28<40:34,
+0: {'loss': 0.6721, 'grad_norm': 0.6913627312689128, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.62}
+0: 1.53it/s]
62%|██████▏ | 6043/9770 [1:10:28<40:25, 1.54it/s]
62%|██████▏ | 6044/9770 [1:10:29<40:09, 1.55it/s]
62%|██████▏ | 6045/9770 [1:10:30<40:29, 1.53it/s]
62%|██████▏ | 6046/9770 [1:10:30<40:23, 1.54it/s]
62%|██████▏ | 6047/9770 [1:10:31<40:07, 1.55it/s]
62%|██████▏ | 6048/9770 [1:10:32<40:34, 1.53it/s]
62%|██████▏ | 6049/9770 [1:10:32<40:21, 1.54it/s]
62%|██████▏ | 6050/9770 [1:10:33<39:50, 1.56it/s]
62%|██████▏ | 6050/9770 [1:10:33<39:50, 1.56it/s]
62%|██████▏ | 6051/9770 [1:10:34<39:52, 1.55it/s]
62%|██████▏ | 6052/9770 [1:10:34<39:41, 1.56it/s]
62%|██████▏ | 6053/9770 [1:10:35<39:30, 1.57it/s]
62%|██████▏ | 6054/9770 [1:10:35<39:59, 1.55it/s]
62%|██████▏ | 6055/9770 [1:10:36<39:49, 1.56it/s]
62%|
+0: {'loss': 0.6791, 'grad_norm': 0.6885231239561105, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.62}
+0: ██████▏ | 6056/9770 [1:10:37<39:11, 1.58it/s]
62%|██████▏ | 6057/9770 [1:10:37<40:02, 1.55it/s]
62%|██████▏ | 6058/9770 [1:10:38<40:16, 1.54it/s]
62%|██████▏ | 6059/9770 [1:10:39<41:25, 1.49it/s]
62%|██████▏ | 6060/9770 [1:10:39<40:33, 1.52it/s]
62%|██████▏ | 6060/9770 [1:10:39<40:33, 1.52it/s]
62%|██████▏ | 6061/9770 [1:10:40<40:21, 1.53it/s]
62%|██████▏ | 6062/9770 [1:10:41<40:21, 1.53it/s]
62%|██████▏ | 6063/9770 [1:10:41<40:07, 1.54it/s]
62%|██████▏ | 6064/9770 [1:10:42<40:22, 1.53it/s]
62%|██████▏ | 6065/9770 [1:10:43<40:08, 1.54it/s]
62%|██████▏ | 6066/9770 [1:10:43<40:17, 1.53it/s]
62%|██████▏ | 6067/9770 [1:10:44<40:34, 1.52it/s]
62%|██████▏ | 6068/9770 [1:10:45<40:15, 1.53it/s]
62%|█████�
+0: {'loss': 0.6673, 'grad_norm': 0.611433905548775, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.62}
+0: {'loss': 0.6689, 'grad_norm': 0.6517878854669044, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.62}
+0: �▏ | 6069/9770 [1:10:45<40:14, 1.53it/s]
62%|██████▏ | 6070/9770 [1:10:46<40:26, 1.53it/s]
62%|██████▏ | 6070/9770 [1:10:46<40:26, 1.53it/s]
62%|██████▏ | 6071/9770 [1:10:47<39:55, 1.54it/s]
62%|██████▏ | 6072/9770 [1:10:47<39:54, 1.54it/s]
62%|██████▏ | 6073/9770 [1:10:48<39:46, 1.55it/s]
62%|██████▏ | 6074/9770 [1:10:48<39:16, 1.57it/s]
62%|██████▏ | 6075/9770 [1:10:49<39:24, 1.56it/s]
62%|██████▏ | 6076/9770 [1:10:50<39:47, 1.55it/s]
62%|██████▏ | 6077/9770 [1:10:50<39:46, 1.55it/s]
62%|██████▏ | 6078/9770 [1:10:51<39:30, 1.56it/s]
62%|██████▏ | 6079/9770 [1:10:52<39:09, 1.57it/s]
62%|██████▏ | 6080/9770 [1:10:52<39:16, 1.57it/s]
62%|██████▏ | 6080/9770 [1:10:52<39
+0: {'loss': 0.6643, 'grad_norm': 0.6257153782863818, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.62}
+0: :16, 1.57it/s]
62%|██████▏ | 6081/9770 [1:10:53<39:28, 1.56it/s]
62%|██████▏ | 6082/9770 [1:10:54<39:46, 1.55it/s]
62%|██████▏ | 6083/9770 [1:10:54<39:50, 1.54it/s]
62%|██████▏ | 6084/9770 [1:10:55<39:49, 1.54it/s]
62%|██████▏ | 6085/9770 [1:10:56<39:53, 1.54it/s]
62%|██████▏ | 6086/9770 [1:10:56<40:57, 1.50it/s]
62%|██████▏ | 6087/9770 [1:10:57<40:50, 1.50it/s]
62%|██████▏ | 6088/9770 [1:10:58<40:48, 1.50it/s]
62%|██████▏ | 6089/9770 [1:10:58<39:59, 1.53it/s]
62%|██████▏ | 6090/9770 [1:10:59<40:00, 1.53it/s]
62%|██████▏ | 6090/9770 [1:10:59<40:00, 1.53it/s]
62%|██████▏ | 6091/9770 [1:11:00<40:06, 1.53it/s]
62%|██████▏ | 6092/9770 [1:11:00<40:13, 1.52it/s]
62%|██████▏ | 6093/9770 [1:11:01<40:13, 1.52it/s]
+0: {'loss': 0.642, 'grad_norm': 0.6082842976633233, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.62}
+0: 62%|██████▏ | 6094/9770 [1:11:01<39:51, 1.54it/s]
62%|██████▏ | 6095/9770 [1:11:02<40:00, 1.53it/s]
62%|██████▏ | 6096/9770 [1:11:03<39:53, 1.53it/s]
62%|██████▏ | 6097/9770 [1:11:03<39:28, 1.55it/s]
62%|██████▏ | 6098/9770 [1:11:04<39:41, 1.54it/s]
62%|██████▏ | 6099/9770 [1:11:05<39:42, 1.54it/s]
62%|██████▏ | 6100/9770 [1:11:05<39:50, 1.54it/s]
62%|██████▏ | 6100/9770 [1:11:05<39:50, 1.54it/s]
62%|██████▏ | 6101/9770 [1:11:06<39:44, 1.54it/s]
62%|██████▏ | 6102/9770 [1:11:07<39:54, 1.53it/s]
62%|██████▏ | 6103/9770 [1:11:07<40:13, 1.52it/s]
62%|██████▏ | 6104/9770 [1:11:08<40:17, 1.52it/s]
62%|██████▏ | 6105/9770 [1:11:09<39:54, 1.53it/s]
62%|██████▏ | 6106/9770 [1:11:09<39:23, 1.55it/s]
63%|████�
+0: {'loss': 0.674, 'grad_norm': 0.652381596197015, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.63}
+0: ��█▎ | 6107/9770 [1:11:10<39:19, 1.55it/s]
63%|██████▎ | 6108/9770 [1:11:11<39:07, 1.56it/s]
63%|██████▎ | 6109/9770 [1:11:11<39:10, 1.56it/s]
63%|██████▎ | 6110/9770 [1:11:12<39:17, 1.55it/s]
63%|██████▎ | 6110/9770 [1:11:12<39:17, 1.55it/s]
63%|██████▎ | 6111/9770 [1:11:13<39:18, 1.55it/s]
63%|██████▎ | 6112/9770 [1:11:13<38:55, 1.57it/s]
63%|██████▎ | 6113/9770 [1:11:14<38:59, 1.56it/s]
63%|██████▎ | 6114/9770 [1:11:14<39:21, 1.55it/s]
63%|██████▎ | 6115/9770 [1:11:15<39:14, 1.55it/s]
63%|██████▎ | 6116/9770 [1:11:16<40:24, 1.51it/s]
63%|██████▎ | 6117/9770 [1:11:16<40:16, 1.51it/s]
63%|██████▎ | 6118/9770 [1:11:17<39:58, 1.52it/s]
63%|██████▎ | 6119/9770 [1:11:18<40:00, 1.52it/s]
63%|██████▎ | 6120
+0: {'loss': 0.6395, 'grad_norm': 0.5992901947274607, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.63}
+0: {'loss': 0.6662, 'grad_norm': 0.6075924975531513, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.63}
+0: /9770 [1:11:18<39:38, 1.53it/s]
63%|██████▎ | 6120/9770 [1:11:18<39:38, 1.53it/s]
63%|██████▎ | 6121/9770 [1:11:19<39:33, 1.54it/s]
63%|██████▎ | 6122/9770 [1:11:20<39:31, 1.54it/s]
63%|██████▎ | 6123/9770 [1:11:20<39:25, 1.54it/s]
63%|██████▎ | 6124/9770 [1:11:21<39:19, 1.55it/s]
63%|██████▎ | 6125/9770 [1:11:22<39:13, 1.55it/s]
63%|██████▎ | 6126/9770 [1:11:22<39:17, 1.55it/s]
63%|██████▎ | 6127/9770 [1:11:23<39:20, 1.54it/s]
63%|██████▎ | 6128/9770 [1:11:24<39:12, 1.55it/s]
63%|██████▎ | 6129/9770 [1:11:24<39:44, 1.53it/s]
63%|██████▎ | 6130/9770 [1:11:25<39:48, 1.52it/s]
63%|██████▎ | 6130/9770 [1:11:25<39:48, 1.52it/s]
63%|██████▎ | 6131/9770 [1:11:26<39:26, 1.54it/
+0: {'loss': 0.6465, 'grad_norm': 0.6583391324854634, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.63}
+0: s]
63%|██████▎ | 6132/9770 [1:11:26<39:33, 1.53it/s]
63%|██████▎ | 6133/9770 [1:11:27<39:24, 1.54it/s]
63%|██████▎ | 6134/9770 [1:11:27<38:59, 1.55it/s]
63%|██████▎ | 6135/9770 [1:11:28<38:47, 1.56it/s]
63%|██████▎ | 6136/9770 [1:11:29<38:41, 1.57it/s]
63%|██████▎ | 6137/9770 [1:11:29<39:53, 1.52it/s]
63%|██████▎ | 6138/9770 [1:11:30<39:38, 1.53it/s]
63%|██████▎ | 6139/9770 [1:11:31<39:12, 1.54it/s]
63%|██████▎ | 6140/9770 [1:11:31<39:30, 1.53it/s]
63%|██████▎ | 6140/9770 [1:11:31<39:30, 1.53it/s]
63%|██████▎ | 6141/9770 [1:11:32<39:16, 1.54it/s]
63%|██████▎ | 6142/9770 [1:11:33<39:08, 1.54it/s]
63%|██████▎ | 6143/9770 [1:11:33<39:09, 1.54it/s]
63%|██████▎ | 6144/9770 [1:11:34<39:20, 1.54it/s]
63%|███
+0: {'loss': 0.684, 'grad_norm': 0.6282871558381556, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.63}
+0: ███▎ | 6145/9770 [1:11:35<39:35, 1.53it/s]
63%|██████▎ | 6146/9770 [1:11:35<39:16, 1.54it/s]
63%|██████▎ | 6147/9770 [1:11:36<39:18, 1.54it/s]
63%|██████▎ | 6148/9770 [1:11:37<39:47, 1.52it/s]
63%|██████▎ | 6149/9770 [1:11:37<39:30, 1.53it/s]
63%|██████▎ | 6150/9770 [1:11:38<39:33, 1.53it/s]
63%|██████▎ | 6150/9770 [1:11:38<39:33, 1.53it/s]
63%|██████▎ | 6151/9770 [1:11:39<39:58, 1.51it/s]
63%|██████▎ | 6152/9770 [1:11:39<39:38, 1.52it/s]
63%|██████▎ | 6153/9770 [1:11:40<39:41, 1.52it/s]
63%|██████▎ | 6154/9770 [1:11:41<39:35, 1.52it/s]
63%|██████▎ | 6155/9770 [1:11:41<39:30, 1.53it/s]
63%|██████▎ | 6156/9770 [1:11:42<39:33, 1.52it/s]
63%|██████▎ | 6157/9770 [1:11:43<39:37, 1.52it/s]
63%|██████▎ |
+0: {'loss': 0.6486, 'grad_norm': 0.5795499591269432, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.63}
+0: 6158/9770 [1:11:43<39:25, 1.53it/s]
63%|██████▎ | 6159/9770 [1:11:44<39:40, 1.52it/s]
63%|██████▎ | 6160/9770 [1:11:44<39:26, 1.53it/s]
63%|██████▎ | 6160/9770 [1:11:44<39:26, 1.53it/s]
63%|██████▎ | 6161/9770 [1:11:45<39:33, 1.52it/s]
63%|██████▎ | 6162/9770 [1:11:46<39:25, 1.53it/s]
63%|██████▎ | 6163/9770 [1:11:46<39:09, 1.54it/s]
63%|██████▎ | 6164/9770 [1:11:47<39:20, 1.53it/s]
63%|██████▎ | 6165/9770 [1:11:48<39:08, 1.54it/s]
63%|██████▎ | 6166/9770 [1:11:48<39:15, 1.53it/s]
63%|██████▎ | 6167/9770 [1:11:49<39:17, 1.53it/s]
63%|██████▎ | 6168/9770 [1:11:50<38:49, 1.55it/s]
63%|██████▎ | 6169/9770 [1:11:50<38:50, 1.55it/s]
63%|██████▎ | 6170/9770 [1:11:51<38:22, 1.56it/s]
+0: {'loss': 0.681, 'grad_norm': 0.6470430136592646, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.63}
+0: {'loss': 0.659, 'grad_norm': 0.614568533659564, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.63}
+0:
63%|██████▎ | 6170/9770 [1:11:51<38:22, 1.56it/s]
63%|██████▎ | 6171/9770 [1:11:52<38:19, 1.56it/s]
63%|██████▎ | 6172/9770 [1:11:52<38:40, 1.55it/s]
63%|██████▎ | 6173/9770 [1:11:53<38:20, 1.56it/s]
63%|██████▎ | 6174/9770 [1:11:54<38:20, 1.56it/s]
63%|██████▎ | 6175/9770 [1:11:54<38:32, 1.55it/s]
63%|██████▎ | 6176/9770 [1:11:55<39:14, 1.53it/s]
63%|██████▎ | 6177/9770 [1:11:56<39:33, 1.51it/s]
63%|██████▎ | 6178/9770 [1:11:56<39:30, 1.52it/s]
63%|██████▎ | 6179/9770 [1:11:57<40:00, 1.50it/s]
63%|██████▎ | 6180/9770 [1:11:58<39:24, 1.52it/s]
63%|██████▎ | 6180/9770 [1:11:58<39:24, 1.52it/s]
63%|██████▎ | 6181/9770 [1:11:58<39:13, 1.53it/s]
63%|██████▎ | 6182/9770 [1:11:59<38:38, 1.55it/s]
63%|█�
+0: {'loss': 0.6696, 'grad_norm': 0.6186075441983189, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.63}
+0: �████▎ | 6183/9770 [1:11:59<38:24, 1.56it/s]
63%|██████▎ | 6184/9770 [1:12:00<38:02, 1.57it/s]
63%|██████▎ | 6185/9770 [1:12:01<37:48, 1.58it/s]
63%|██████▎ | 6186/9770 [1:12:01<37:45, 1.58it/s]
63%|██████▎ | 6187/9770 [1:12:02<37:45, 1.58it/s]
63%|██████▎ | 6188/9770 [1:12:03<37:42, 1.58it/s]
63%|██████▎ | 6189/9770 [1:12:03<38:10, 1.56it/s]
63%|██████▎ | 6190/9770 [1:12:04<38:31, 1.55it/s]
63%|██████▎ | 6190/9770 [1:12:04<38:31, 1.55it/s]
63%|██████▎ | 6191/9770 [1:12:05<38:21, 1.56it/s]
63%|██████▎ | 6192/9770 [1:12:05<38:06, 1.56it/s]
63%|██████▎ | 6193/9770 [1:12:06<38:34, 1.55it/s]
63%|██████▎ | 6194/9770 [1:12:06<38:39, 1.54it/s]
63%|██████▎ | 6195/9770 [1:12:07<38:43, 1.54it/s]
63%|██████▎
+0: {'loss': 0.6619, 'grad_norm': 0.6370901660722852, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.63}
+0: | 6196/9770 [1:12:08<38:46, 1.54it/s]
63%|██████▎ | 6197/9770 [1:12:08<39:03, 1.52it/s]
63%|██████▎ | 6198/9770 [1:12:09<38:56, 1.53it/s]
63%|██████▎ | 6199/9770 [1:12:10<39:40, 1.50it/s]
63%|██████▎ | 6200/9770 [1:12:10<39:47, 1.50it/s]
63%|██████▎ | 6200/9770 [1:12:10<39:47, 1.50it/s]
63%|██████▎ | 6201/9770 [1:12:11<39:46, 1.50it/s]
63%|██████▎ | 6202/9770 [1:12:12<39:44, 1.50it/s]
63%|██████▎ | 6203/9770 [1:12:12<39:40, 1.50it/s]
64%|██████▎ | 6204/9770 [1:12:13<39:29, 1.50it/s]
64%|██████▎ | 6205/9770 [1:12:14<39:15, 1.51it/s]
64%|██████▎ | 6206/9770 [1:12:14<39:56, 1.49it/s]
64%|██████▎ | 6207/9770 [1:12:15<39:06, 1.52it/s]
64%|██████▎ | 6208/9770 [1:12:16<38:47, 1.53it/s]
64%|██████▎ | 6209/9770 [1:
+0: {'loss': 0.6551, 'grad_norm': 0.6037277538780117, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.64}
+0: {'loss': 0.6641, 'grad_norm': 0.6217541178222828, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.64}
+0: 12:16<38:47, 1.53it/s]
64%|██████▎ | 6210/9770 [1:12:17<38:43, 1.53it/s]
64%|██████▎ | 6210/9770 [1:12:17<38:43, 1.53it/s]
64%|██████▎ | 6211/9770 [1:12:18<38:55, 1.52it/s]
64%|██████▎ | 6212/9770 [1:12:18<39:01, 1.52it/s]
64%|██████▎ | 6213/9770 [1:12:19<38:23, 1.54it/s]
64%|██████▎ | 6214/9770 [1:12:20<38:29, 1.54it/s]
64%|██████▎ | 6215/9770 [1:12:20<38:12, 1.55it/s]
64%|██████▎ | 6216/9770 [1:12:21<38:36, 1.53it/s]
64%|██████▎ | 6217/9770 [1:12:22<38:29, 1.54it/s]
64%|██████▎ | 6218/9770 [1:12:22<38:27, 1.54it/s]
64%|██████▎ | 6219/9770 [1:12:23<38:47, 1.53it/s]
64%|██████▎ | 6220/9770 [1:12:24<38:38, 1.53it/s]
64%|██████▎ | 6220/9770 [1:12:24<38:38, 1.53it/s]
64%|�
+0: {'loss': 0.6681, 'grad_norm': 0.6183644365808582, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.64}
+0: ��█████▎ | 6221/9770 [1:12:24<39:01, 1.52it/s]
64%|██████▎ | 6222/9770 [1:12:25<39:01, 1.52it/s]
64%|██████▎ | 6223/9770 [1:12:26<39:00, 1.52it/s]
64%|██████▎ | 6224/9770 [1:12:26<38:22, 1.54it/s]
64%|██████▎ | 6225/9770 [1:12:27<38:17, 1.54it/s]
64%|██████▎ | 6226/9770 [1:12:27<38:43, 1.53it/s]
64%|██████▎ | 6227/9770 [1:12:28<38:33, 1.53it/s]
64%|██████▎ | 6228/9770 [1:12:29<38:40, 1.53it/s]
64%|██████▍ | 6229/9770 [1:12:29<38:42, 1.52it/s]
64%|██████▍ | 6230/9770 [1:12:30<38:42, 1.52it/s]
64%|██████▍ | 6230/9770 [1:12:30<38:42, 1.52it/s]
64%|██████▍ | 6231/9770 [1:12:31<38:58, 1.51it/s]
64%|██████▍ | 6232/9770 [1:12:31<38:46, 1.52it/s]
64%|██████▍ | 6233/9770 [1:12:32<38:58, 1.51it/s]
64%|██████
+0: {'loss': 0.6765, 'grad_norm': 0.6131289071043077, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.64}
+0: ▍ | 6234/9770 [1:12:33<38:43, 1.52it/s]
64%|██████▍ | 6235/9770 [1:12:33<38:11, 1.54it/s]
64%|██████▍ | 6236/9770 [1:12:34<37:56, 1.55it/s]
64%|██████▍ | 6237/9770 [1:12:35<37:48, 1.56it/s]
64%|██████▍ | 6238/9770 [1:12:35<37:48, 1.56it/s]
64%|██████▍ | 6239/9770 [1:12:36<38:06, 1.54it/s]
64%|██████▍ | 6240/9770 [1:12:37<38:00, 1.55it/s]
64%|██████▍ | 6240/9770 [1:12:37<38:00, 1.55it/s]
64%|██████▍ | 6241/9770 [1:12:37<39:15, 1.50it/s]
64%|██████▍ | 6242/9770 [1:12:38<38:53, 1.51it/s]
64%|██████▍ | 6243/9770 [1:12:39<38:55, 1.51it/s]
64%|██████▍ | 6244/9770 [1:12:39<38:55, 1.51it/s]
64%|██████▍ | 6245/9770 [1:12:40<38:25, 1.53it/s]
64%|██████▍ | 6246/9770 [1:12:41<38:23, 1.53it/s]
64%|██████▍ | 6247/9770
+0: {'loss': 0.6729, 'grad_norm': 0.5970843397588241, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.64}
+0: [1:12:41<37:51, 1.55it/s]
64%|██████▍ | 6248/9770 [1:12:42<37:44, 1.56it/s]
64%|██████▍ | 6249/9770 [1:12:43<38:32, 1.52it/s]
64%|██████▍ | 6250/9770 [1:12:43<38:09, 1.54it/s]
64%|██████▍ | 6250/9770 [1:12:43<38:09, 1.54it/s]
64%|██████▍ | 6251/9770 [1:12:44<38:24, 1.53it/s]
64%|██████▍ | 6252/9770 [1:12:45<38:48, 1.51it/s]
64%|██████▍ | 6253/9770 [1:12:45<37:56, 1.54it/s]
64%|██████▍ | 6254/9770 [1:12:46<38:00, 1.54it/s]
64%|██████▍ | 6255/9770 [1:12:46<38:05, 1.54it/s]
64%|██████▍ | 6256/9770 [1:12:47<38:14, 1.53it/s]
64%|██████▍ | 6257/9770 [1:12:48<38:45, 1.51it/s]
64%|██████▍ | 6258/9770 [1:12:48<39:00, 1.50it/s]
64%|██████▍ | 6259/9770 [1:12:49<38:17, 1.53it/s]
64%|██████▍ | 6260/9770 [1:12:50<38:30,
+0: {'loss': 0.6574, 'grad_norm': 0.6583016744522384, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.64}
+0: {'loss': 0.6682, 'grad_norm': 0.6457912419212775, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.64}
+0: 1.52it/s]
64%|██████▍ | 6260/9770 [1:12:50<38:30, 1.52it/s]
64%|██████▍ | 6261/9770 [1:12:50<38:12, 1.53it/s]
64%|██████▍ | 6262/9770 [1:12:51<38:07, 1.53it/s]
64%|██████▍ | 6263/9770 [1:12:52<37:48, 1.55it/s]
64%|██████▍ | 6264/9770 [1:12:52<37:56, 1.54it/s]
64%|██████▍ | 6265/9770 [1:12:53<38:05, 1.53it/s]
64%|██████▍ | 6266/9770 [1:12:54<38:19, 1.52it/s]
64%|██████▍ | 6267/9770 [1:12:54<38:07, 1.53it/s]
64%|██████▍ | 6268/9770 [1:12:55<38:13, 1.53it/s]
64%|██████▍ | 6269/9770 [1:12:56<37:53, 1.54it/s]
64%|██████▍ | 6270/9770 [1:12:56<37:29, 1.56it/s]
64%|██████▍ | 6270/9770 [1:12:56<37:29, 1.56it/s]
64%|██████▍ | 6271/9770 [1:12:57<37:40, 1.55it/s]
64%|████�
+0: {'loss': 0.6707, 'grad_norm': 0.6145655516421002, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.64}
+0: �█▍ | 6272/9770 [1:12:58<37:58, 1.54it/s]
64%|██████▍ | 6273/9770 [1:12:58<38:12, 1.53it/s]
64%|██████▍ | 6274/9770 [1:12:59<38:01, 1.53it/s]
64%|██████▍ | 6275/9770 [1:12:59<37:44, 1.54it/s]
64%|██████▍ | 6276/9770 [1:13:00<37:27, 1.55it/s]
64%|██████▍ | 6277/9770 [1:13:01<37:39, 1.55it/s]
64%|██████▍ | 6278/9770 [1:13:01<37:26, 1.55it/s]
64%|██████▍ | 6279/9770 [1:13:02<37:43, 1.54it/s]
64%|██████▍ | 6280/9770 [1:13:03<37:49, 1.54it/s]
64%|██████▍ | 6280/9770 [1:13:03<37:49, 1.54it/s]
64%|██████▍ | 6281/9770 [1:13:03<39:00, 1.49it/s]
64%|██████▍ | 6282/9770 [1:13:04<38:14, 1.52it/s]
64%|██████▍ | 6283/9770 [1:13:05<38:02, 1.53it/s]
64%|██████▍ | 6284/9770 [1:13:05<38:13, 1.52it/s]
64%|██████▍ | 6285/
+0: {'loss': 0.6617, 'grad_norm': 0.6320067587190664, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.64}
+0: 9770 [1:13:06<38:12, 1.52it/s]
64%|██████▍ | 6286/9770 [1:13:07<38:06, 1.52it/s]
64%|██████▍ | 6287/9770 [1:13:07<37:34, 1.54it/s]
64%|██████▍ | 6288/9770 [1:13:08<37:51, 1.53it/s]
64%|██████▍ | 6289/9770 [1:13:09<37:56, 1.53it/s]
64%|██████▍ | 6290/9770 [1:13:09<37:42, 1.54it/s]
64%|██████▍ | 6290/9770 [1:13:09<37:42, 1.54it/s]
64%|██████▍ | 6291/9770 [1:13:10<37:33, 1.54it/s]
64%|██████▍ | 6292/9770 [1:13:11<37:31, 1.54it/s]
64%|██████▍ | 6293/9770 [1:13:11<37:42, 1.54it/s]
64%|██████▍ | 6294/9770 [1:13:12<37:44, 1.53it/s]
64%|██████▍ | 6295/9770 [1:13:13<37:53, 1.53it/s]
64%|██████▍ | 6296/9770 [1:13:13<37:51, 1.53it/s]
64%|██████▍ | 6297/9770 [1:13:14<37:52, 1.53it/s]
64%|██████▍ | 6298/9770 [1:13:15<38:
+0: {'loss': 0.6693, 'grad_norm': 0.6549017186900042, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.64}
+0: {'loss': 0.6739, 'grad_norm': 0.6417299597488738, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.65}
+0: 05, 1.52it/s]
64%|██████▍ | 6299/9770 [1:13:15<37:35, 1.54it/s]
64%|██████▍ | 6300/9770 [1:13:16<37:17, 1.55it/s]
64%|██████▍ | 6300/9770 [1:13:16<37:17, 1.55it/s]
64%|██████▍ | 6301/9770 [1:13:16<37:29, 1.54it/s]
65%|██████▍ | 6302/9770 [1:13:17<37:29, 1.54it/s]
65%|██████▍ | 6303/9770 [1:13:18<37:15, 1.55it/s]
65%|██████▍ | 6304/9770 [1:13:18<36:56, 1.56it/s]
65%|██████▍ | 6305/9770 [1:13:19<36:49, 1.57it/s]
65%|██████▍ | 6306/9770 [1:13:20<37:01, 1.56it/s]
65%|██████▍ | 6307/9770 [1:13:20<36:58, 1.56it/s]
65%|██████▍ | 6308/9770 [1:13:21<36:36, 1.58it/s]
65%|██████▍ | 6309/9770 [1:13:22<36:55, 1.56it/s]
65%|██████▍ | 6310/9770 [1:13:22<37:03, 1.56it/s]
65%|███�
+0: {'loss': 0.6727, 'grad_norm': 0.6375896737195781, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.65}
+0: ��██▍ | 6310/9770 [1:13:22<37:03, 1.56it/s]
65%|██████▍ | 6311/9770 [1:13:23<37:16, 1.55it/s]
65%|██████▍ | 6312/9770 [1:13:24<37:41, 1.53it/s]
65%|██████▍ | 6313/9770 [1:13:24<37:24, 1.54it/s]
65%|██████▍ | 6314/9770 [1:13:25<37:45, 1.53it/s]
65%|██████▍ | 6315/9770 [1:13:25<37:29, 1.54it/s]
65%|██████▍ | 6316/9770 [1:13:26<37:10, 1.55it/s]
65%|██████▍ | 6317/9770 [1:13:27<37:21, 1.54it/s]
65%|██████▍ | 6318/9770 [1:13:27<37:20, 1.54it/s]
65%|██████▍ | 6319/9770 [1:13:28<36:55, 1.56it/s]
65%|██████▍ | 6320/9770 [1:13:29<36:58, 1.56it/s]
65%|██████▍ | 6320/9770 [1:13:29<36:58, 1.56it/s]
65%|██████▍ | 6321/9770 [1:13:29<37:06, 1.55it/s]
65%|██████▍ | 6322/9770 [1:13:30<38:01, 1.51it/s]
65%|██████▍ | 6
+0: {'loss': 0.6639, 'grad_norm': 0.6329122036803733, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.65}
+0: 323/9770 [1:13:31<37:36, 1.53it/s]
65%|██████▍ | 6324/9770 [1:13:31<37:40, 1.52it/s]
65%|██████▍ | 6325/9770 [1:13:32<37:35, 1.53it/s]
65%|██████▍ | 6326/9770 [1:13:33<37:25, 1.53it/s]
65%|██████▍ | 6327/9770 [1:13:33<37:25, 1.53it/s]
65%|██████▍ | 6328/9770 [1:13:34<37:04, 1.55it/s]
65%|██████▍ | 6329/9770 [1:13:35<37:18, 1.54it/s]
65%|██████▍ | 6330/9770 [1:13:35<37:05, 1.55it/s]
65%|██████▍ | 6330/9770 [1:13:35<37:05, 1.55it/s]
65%|██████▍ | 6331/9770 [1:13:36<37:41, 1.52it/s]
65%|██████▍ | 6332/9770 [1:13:37<37:52, 1.51it/s]
65%|██████▍ | 6333/9770 [1:13:37<37:27, 1.53it/s]
65%|██████▍ | 6334/9770 [1:13:38<37:36, 1.52it/s]
65%|██████▍ | 6335/9770 [1:13:38<36:56, 1.55it/s]
65%|██████▍ | 6336/9770 [1:13:39
+0: {'loss': 0.6571, 'grad_norm': 0.6173711414772245, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.65}
+0: <36:55, 1.55it/s]
65%|██████▍ | 6337/9770 [1:13:40<36:54, 1.55it/s]
65%|██████▍ | 6338/9770 [1:13:40<36:55, 1.55it/s]
65%|██████▍ | 6339/9770 [1:13:41<37:06, 1.54it/s]
65%|██████▍ | 6340/9770 [1:13:42<37:35, 1.52it/s]
65%|██████▍ | 6340/9770 [1:13:42<37:35, 1.52it/s]
65%|██████▍ | 6341/9770 [1:13:42<37:35, 1.52it/s]
65%|██████▍ | 6342/9770 [1:13:43<37:41, 1.52it/s]
65%|██████▍ | 6343/9770 [1:13:44<37:26, 1.53it/s]
65%|██████▍ | 6344/9770 [1:13:44<38:06, 1.50it/s]
65%|██████▍ | 6345/9770 [1:13:45<37:46, 1.51it/s]
65%|██████▍ | 6346/9770 [1:13:46<37:38, 1.52it/s]
65%|██████▍ | 6347/9770 [1:13:46<37:49, 1.51it/s]
65%|██████▍ | 6348/9770 [1:13:47<37:56, 1.50it/s]
65%|██████▍ | 6349/9770 [1:13:48<37:41, 1.51it/s
+0: {'loss': 0.6586, 'grad_norm': 0.5961268309458277, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.65}
+0: {'loss': 0.6664, 'grad_norm': 0.6169471478383356, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.65}
+0: ]
65%|██████▍ | 6350/9770 [1:13:48<37:03, 1.54it/s]
65%|██████▍ | 6350/9770 [1:13:48<37:03, 1.54it/s]
65%|██████▌ | 6351/9770 [1:13:49<37:01, 1.54it/s]
65%|██████▌ | 6352/9770 [1:13:50<38:01, 1.50it/s]
65%|██████▌ | 6353/9770 [1:13:50<38:44, 1.47it/s]
65%|██████▌ | 6354/9770 [1:13:51<38:08, 1.49it/s]
65%|██████▌ | 6355/9770 [1:13:52<37:18, 1.53it/s]
65%|██████▌ | 6356/9770 [1:13:52<37:26, 1.52it/s]
65%|██████▌ | 6357/9770 [1:13:53<37:38, 1.51it/s]
65%|██████▌ | 6358/9770 [1:13:54<37:08, 1.53it/s]
65%|██████▌ | 6359/9770 [1:13:54<36:49, 1.54it/s]
65%|██████▌ | 6360/9770 [1:13:55<37:02, 1.53it/s]
65%|██████▌ | 6360/9770 [1:13:55<37:02, 1.53it/s]
65%|██████▌
+0: {'loss': 0.6521, 'grad_norm': 0.581540052822275, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.65}
+0: | 6361/9770 [1:13:56<37:54, 1.50it/s]
65%|██████▌ | 6362/9770 [1:13:56<37:31, 1.51it/s]
65%|██████▌ | 6363/9770 [1:13:57<37:21, 1.52it/s]
65%|██████▌ | 6364/9770 [1:13:58<37:48, 1.50it/s]
65%|██████▌ | 6365/9770 [1:13:58<37:15, 1.52it/s]
65%|██████▌ | 6366/9770 [1:13:59<37:42, 1.50it/s]
65%|██████▌ | 6367/9770 [1:14:00<37:28, 1.51it/s]
65%|██████▌ | 6368/9770 [1:14:00<37:18, 1.52it/s]
65%|██████▌ | 6369/9770 [1:14:01<37:22, 1.52it/s]
65%|██████▌ | 6370/9770 [1:14:02<37:08, 1.53it/s]
65%|██████▌ | 6370/9770 [1:14:02<37:08, 1.53it/s]
65%|██████▌ | 6371/9770 [1:14:02<37:00, 1.53it/s]
65%|██████▌ | 6372/9770 [1:14:03<36:28, 1.55it/s]
65%|██████▌ | 6373/9770 [1:14:03<36:35, 1.55it/s]
65%|██████▌ | 6374/9770 [1:1
+0: {'loss': 0.6648, 'grad_norm': 0.6752141016610559, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.65}
+0: 4:04<36:55, 1.53it/s]
65%|██████▌ | 6375/9770 [1:14:05<36:57, 1.53it/s]
65%|██████▌ | 6376/9770 [1:14:05<37:50, 1.49it/s]
65%|██████▌ | 6377/9770 [1:14:06<37:04, 1.53it/s]
65%|██████▌ | 6378/9770 [1:14:07<36:53, 1.53it/s]
65%|██████▌ | 6379/9770 [1:14:07<36:57, 1.53it/s]
65%|██████▌ | 6380/9770 [1:14:08<37:41, 1.50it/s]
65%|██████▌ | 6380/9770 [1:14:08<37:41, 1.50it/s]
65%|██████▌ | 6381/9770 [1:14:09<37:33, 1.50it/s]
65%|██████▌ | 6382/9770 [1:14:09<37:24, 1.51it/s]
65%|██████▌ | 6383/9770 [1:14:10<37:08, 1.52it/s]
65%|██████▌ | 6384/9770 [1:14:11<36:44, 1.54it/s]
65%|██████▌ | 6385/9770 [1:14:11<37:08, 1.52it/s]
65%|██████▌ | 6386/9770 [1:14:12<36:42, 1.54it/s]
65%|██████▌ | 6387/9770 [1:14:13<36:46, 1.53
+0: {'loss': 0.6502, 'grad_norm': 0.6082760860616365, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.65}
+0: it/s]
65%|██████▌ | 6388/9770 [1:14:13<36:30, 1.54it/s]
65%|██████▌ | 6389/9770 [1:14:14<37:28, 1.50it/s]
65%|██████▌ | 6390/9770 [1:14:15<36:59, 1.52it/s]
65%|██████▌ | 6390/9770 [1:14:15<36:59, 1.52it/s]
65%|██████▌ | 6391/9770 [1:14:15<37:02, 1.52it/s]
65%|██████▌ | 6392/9770 [1:14:16<36:51, 1.53it/s]
65%|██████▌ | 6393/9770 [1:14:17<37:16, 1.51it/s]
65%|██████▌ | 6394/9770 [1:14:17<36:38, 1.54it/s]
65%|██████▌ | 6395/9770 [1:14:18<36:10, 1.55it/s]
65%|██████▌ | 6396/9770 [1:14:19<36:14, 1.55it/s]
65%|██████▌ | 6397/9770 [1:14:19<36:27, 1.54it/s]
65%|██████▌ | 6398/9770 [1:14:20<36:26, 1.54it/s]
65%|██████▌ | 6399/9770 [1:14:20<36:23, 1.54it/s]
66%|██████▌ | 6400/9770 [1:14:21<36:15, 1.55it/s]
+0: {'loss': 0.6523, 'grad_norm': 0.6530998637352518, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.65}
+0: {'loss': 0.6551, 'grad_norm': 0.667131242856233, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.66}
+0:
66%|██████▌ | 6400/9770 [1:14:21<36:15, 1.55it/s]
66%|██████▌ | 6401/9770 [1:14:22<36:01, 1.56it/s]
66%|██████▌ | 6402/9770 [1:14:22<37:10, 1.51it/s]
66%|██████▌ | 6403/9770 [1:14:23<37:10, 1.51it/s]
66%|██████▌ | 6404/9770 [1:14:24<37:01, 1.52it/s]
66%|██████▌ | 6405/9770 [1:14:24<37:25, 1.50it/s]
66%|██████▌ | 6406/9770 [1:14:25<37:51, 1.48it/s]
66%|██████▌ | 6407/9770 [1:14:26<37:45, 1.48it/s]
66%|██████▌ | 6408/9770 [1:14:27<38:03, 1.47it/s]
66%|██████▌ | 6409/9770 [1:14:27<37:48, 1.48it/s]
66%|██████▌ | 6410/9770 [1:14:28<37:34, 1.49it/s]
66%|██████▌ | 6410/9770 [1:14:28<37:34, 1.49it/s]
66%|██████▌ | 6411/9770 [1:14:29<38:21, 1.46it/s]
66%|██████▌ | 6412/9770
+0: {'loss': 0.6569, 'grad_norm': 0.594956367277564, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.66}
+0: [1:14:29<37:43, 1.48it/s]
66%|██████▌ | 6413/9770 [1:14:30<37:10, 1.50it/s]
66%|██████▌ | 6414/9770 [1:14:31<36:38, 1.53it/s]
66%|██████▌ | 6415/9770 [1:14:31<36:32, 1.53it/s]
66%|██████▌ | 6416/9770 [1:14:32<36:24, 1.54it/s]
66%|██████▌ | 6417/9770 [1:14:32<35:48, 1.56it/s]
66%|██████▌ | 6418/9770 [1:14:33<35:19, 1.58it/s]
66%|██████▌ | 6419/9770 [1:14:34<35:24, 1.58it/s]
66%|██████▌ | 6420/9770 [1:14:34<35:48, 1.56it/s]
66%|██████▌ | 6420/9770 [1:14:34<35:48, 1.56it/s]
66%|██████▌ | 6421/9770 [1:14:35<35:57, 1.55it/s]
66%|██████▌ | 6422/9770 [1:14:36<36:29, 1.53it/s]
66%|██████▌ | 6423/9770 [1:14:36<36:33, 1.53it/s]
66%|██████▌ | 6424/9770 [1:14:37<42:42, 1.31it/s]
66%|██████▌ | 6425/9770 [1:14:38<41:12,
+0: {'loss': 0.6997, 'grad_norm': 0.6134620436602249, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.66}
+0: 1.35it/s]
66%|██████▌ | 6426/9770 [1:14:39<39:53, 1.40it/s]
66%|██████▌ | 6427/9770 [1:14:39<38:51, 1.43it/s]
66%|██████▌ | 6428/9770 [1:14:40<38:10, 1.46it/s]
66%|██████▌ | 6429/9770 [1:14:41<37:56, 1.47it/s]
66%|██████▌ | 6430/9770 [1:14:41<37:18, 1.49it/s]
66%|██████▌ | 6430/9770 [1:14:41<37:18, 1.49it/s]
66%|██████▌ | 6431/9770 [1:14:42<37:03, 1.50it/s]
66%|██████▌ | 6432/9770 [1:14:43<37:29, 1.48it/s]
66%|██████▌ | 6433/9770 [1:14:43<37:04, 1.50it/s]
66%|██████▌ | 6434/9770 [1:14:44<36:35, 1.52it/s]
66%|██████▌ | 6435/9770 [1:14:45<36:39, 1.52it/s]
66%|██████▌ | 6436/9770 [1:14:45<36:20, 1.53it/s]
66%|██████▌ | 6437/9770 [1:14:46<36:17, 1.53it/s]
66%|██████▌ | 6438/9770 [1:14:47<37:12, 1.49it/s]
66%|�
+0: {'loss': 0.6668, 'grad_norm': 0.6662470100499767, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.66}
+0: {'loss': 0.6599, 'grad_norm': 0.5885489684254362, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.66}
+0: �█████▌ | 6439/9770 [1:14:47<37:44, 1.47it/s]
66%|██████▌ | 6440/9770 [1:14:48<37:38, 1.47it/s]
66%|██████▌ | 6440/9770 [1:14:48<37:38, 1.47it/s]
66%|██████▌ | 6441/9770 [1:14:49<36:50, 1.51it/s]
66%|██████▌ | 6442/9770 [1:14:49<36:31, 1.52it/s]
66%|██████▌ | 6443/9770 [1:14:50<36:16, 1.53it/s]
66%|██████▌ | 6444/9770 [1:14:51<37:03, 1.50it/s]
66%|██████▌ | 6445/9770 [1:14:51<36:32, 1.52it/s]
66%|██████▌ | 6446/9770 [1:14:52<36:24, 1.52it/s]
66%|██████▌ | 6447/9770 [1:14:53<36:25, 1.52it/s]
66%|██████▌ | 6448/9770 [1:14:53<36:07, 1.53it/s]
66%|██████▌ | 6449/9770 [1:14:54<35:36, 1.55it/s]
66%|██████▌ | 6450/9770 [1:14:54<35:43, 1.55it/s]
66%|██████▌ | 6450/9
+0: {'loss': 0.6723, 'grad_norm': 0.619610510777976, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.66}
+0: 770 [1:14:54<35:43, 1.55it/s]
66%|██████▌ | 6451/9770 [1:14:55<36:27, 1.52it/s]
66%|██████▌ | 6452/9770 [1:14:56<36:48, 1.50it/s]
66%|██████▌ | 6453/9770 [1:14:56<36:13, 1.53it/s]
66%|██████▌ | 6454/9770 [1:14:57<35:39, 1.55it/s]
66%|██████▌ | 6455/9770 [1:14:58<36:34, 1.51it/s]
66%|██████▌ | 6456/9770 [1:14:58<36:37, 1.51it/s]
66%|██████▌ | 6457/9770 [1:14:59<37:00, 1.49it/s]
66%|██████▌ | 6458/9770 [1:15:00<36:55, 1.50it/s]
66%|██████▌ | 6459/9770 [1:15:00<36:27, 1.51it/s]
66%|██████▌ | 6460/9770 [1:15:01<36:12, 1.52it/s]
66%|██████▌ | 6460/9770 [1:15:01<36:12, 1.52it/s]
66%|██████▌ | 6461/9770 [1:15:02<35:52, 1.54it/s]
66%|██████▌ | 6462/9770 [1:15:02<35:44, 1.54it/s]
66%|██████▌ | 6463/9770 [1:15:03<35:5
+0: {'loss': 0.6848, 'grad_norm': 0.672171900724939, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.66}
+0: 2, 1.54it/s]
66%|██████▌ | 6464/9770 [1:15:04<35:38, 1.55it/s]
66%|██████▌ | 6465/9770 [1:15:04<35:39, 1.55it/s]
66%|██████▌ | 6466/9770 [1:15:05<35:32, 1.55it/s]
66%|██████▌ | 6467/9770 [1:15:06<35:51, 1.54it/s]
66%|██████▌ | 6468/9770 [1:15:06<36:46, 1.50it/s]
66%|██████▌ | 6469/9770 [1:15:07<41:55, 1.31it/s]
66%|██████▌ | 6470/9770 [1:15:08<40:33, 1.36it/s]
66%|██████▌ | 6470/9770 [1:15:08<40:33, 1.36it/s]
66%|██████▌ | 6471/9770 [1:15:09<39:02, 1.41it/s]
66%|██████▌ | 6472/9770 [1:15:09<37:51, 1.45it/s]
66%|██████▋ | 6473/9770 [1:15:10<37:27, 1.47it/s]
66%|██████▋ | 6474/9770 [1:15:11<36:30, 1.50it/s]
66%|█���████▋ | 6475/9770 [1:15:11<36:02, 1.52it/s]
66%|██████▋ | 6476/9770 [1:15:12<40:43, 1.35it/s]
66
+0: {'loss': 0.6737, 'grad_norm': 0.6175115663977871, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.66}
+0: %|██████▋ | 6477/9770 [1:15:13<39:16, 1.40it/s]
66%|██████▋ | 6478/9770 [1:15:13<38:17, 1.43it/s]
66%|██████▋ | 6479/9770 [1:15:14<37:16, 1.47it/s]
66%|██████▋ | 6480/9770 [1:15:15<37:00, 1.48it/s]
66%|██████▋ | 6480/9770 [1:15:15<37:00, 1.48it/s]
66%|██████▋ | 6481/9770 [1:15:15<36:41, 1.49it/s]
66%|██████▋ | 6482/9770 [1:15:16<36:13, 1.51it/s]
66%|██████▋ | 6483/9770 [1:15:17<35:56, 1.52it/s]
66%|██████▋ | 6484/9770 [1:15:17<35:42, 1.53it/s]
66%|██████▋ | 6485/9770 [1:15:18<35:32, 1.54it/s]
66%|██████▋ | 6486/9770 [1:15:19<35:34, 1.54it/s]
66%|██████▋ | 6487/9770 [1:15:19<35:20, 1.55it/s]
66%|██████▋ | 6488/9770 [1:15:20<35:29, 1.54it/s]
66%|██████▋ | 6489/9770 [1:15:21<36:29, 1.50it/s]
66%|█████
+0: {'loss': 0.6599, 'grad_norm': 0.6570466816903526, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.66}
+0: {'loss': 0.6549, 'grad_norm': 0.6427027214829611, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.66}
+0: █▋ | 6490/9770 [1:15:21<36:17, 1.51it/s]
66%|██████▋ | 6490/9770 [1:15:21<36:17, 1.51it/s]
66%|██████▋ | 6491/9770 [1:15:22<36:17, 1.51it/s]
66%|██████▋ | 6492/9770 [1:15:23<36:19, 1.50it/s]
66%|██████▋ | 6493/9770 [1:15:23<36:07, 1.51it/s]
66%|██████▋ | 6494/9770 [1:15:24<36:10, 1.51it/s]
66%|██████▋ | 6495/9770 [1:15:25<36:02, 1.51it/s]
66%|██████▋ | 6496/9770 [1:15:25<35:50, 1.52it/s]
66%|██████▋ | 6497/9770 [1:15:26<35:37, 1.53it/s]
67%|██████▋ | 6498/9770 [1:15:27<35:57, 1.52it/s]
67%|██████▋ | 6499/9770 [1:15:27<35:38, 1.53it/s]
67%|██████▋ | 6500/9770 [1:15:28<35:52, 1.52it/s]
67%|██████▋ | 6500/9770 [1:15:28<35:52, 1.52it/s]
67%|██████▋ | 6501/9770 [1:15:29<
+0: {'loss': 0.6613, 'grad_norm': 0.6206673552340211, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.67}
+0: 36:04, 1.51it/s]
67%|██████▋ | 6502/9770 [1:15:29<35:42, 1.53it/s]
67%|██████▋ | 6503/9770 [1:15:30<36:26, 1.49it/s]
67%|██████▋ | 6504/9770 [1:15:31<35:58, 1.51it/s]
67%|██████▋ | 6505/9770 [1:15:31<35:24, 1.54it/s]
67%|██████▋ | 6506/9770 [1:15:32<35:15, 1.54it/s]
67%|██████▋ | 6507/9770 [1:15:32<34:46, 1.56it/s]
67%|██████▋ | 6508/9770 [1:15:33<34:19, 1.58it/s]
67%|██████▋ | 6509/9770 [1:15:34<34:35, 1.57it/s]
67%|██████▋ | 6510/9770 [1:15:35<40:15, 1.35it/s]
67%|██████▋ | 6510/9770 [1:15:35<40:15, 1.35it/s]
67%|██████▋ | 6511/9770 [1:15:35<38:36, 1.41it/s]
67%|██████▋ | 6512/9770 [1:15:36<43:43, 1.24it/s]
67%|██████▋ | 6513/9770 [1:15:37<46:05, 1.18it/s]
67%|██████▋ | 6514/9770 [1:15:38<42:58, 1.26it/s]
+0: {'loss': 0.6661, 'grad_norm': 0.5671139138267554, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.67}
+0:
67%|██████▋ | 6515/9770 [1:15:39<40:57, 1.32it/s]
67%|██████▋ | 6516/9770 [1:15:39<38:51, 1.40it/s]
67%|██████▋ | 6517/9770 [1:15:40<38:02, 1.43it/s]
67%|██████▋ | 6518/9770 [1:15:41<37:24, 1.45it/s]
67%|██████▋ | 6519/9770 [1:15:41<36:28, 1.49it/s]
67%|██████▋ | 6520/9770 [1:15:42<35:45, 1.51it/s]
67%|██████▋ | 6520/9770 [1:15:42<35:45, 1.51it/s]
67%|██████▋ | 6521/9770 [1:15:42<35:44, 1.52it/s]
67%|██████▋ | 6522/9770 [1:15:43<40:36, 1.33it/s]
67%|██████▋ | 6523/9770 [1:15:44<38:37, 1.40it/s]
67%|██████▋ | 6524/9770 [1:15:45<37:45, 1.43it/s]
67%|██████▋ | 6525/9770 [1:15:45<37:18, 1.45it/s]
67%|██████▋ | 6526/9770 [1:15:46<36:45, 1.47it/s]
67%|██████▋ | 6527/9770 [1:15:47<36:27, 1.48it/s]
67%|███�
+0: {'loss': 0.6729, 'grad_norm': 0.6154789972421226, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.67}
+0: �██▋ | 6528/9770 [1:15:47<36:43, 1.47it/s]
67%|██████▋ | 6529/9770 [1:15:48<36:49, 1.47it/s]
67%|██████▋ | 6530/9770 [1:15:49<36:12, 1.49it/s]
67%|██████▋ | 6530/9770 [1:15:49<36:12, 1.49it/s]
67%|██████▋ | 6531/9770 [1:15:49<36:04, 1.50it/s]
67%|██████▋ | 6532/9770 [1:15:50<35:49, 1.51it/s]
67%|██████▋ | 6533/9770 [1:15:51<35:31, 1.52it/s]
67%|██████▋ | 6534/9770 [1:15:51<35:26, 1.52it/s]
67%|██████▋ | 6535/9770 [1:15:52<35:07, 1.54it/s]
67%|██████▋ | 6536/9770 [1:15:53<34:54, 1.54it/s]
67%|██████▋ | 6537/9770 [1:15:53<35:15, 1.53it/s]
67%|██████▋ | 6538/9770 [1:15:54<35:02, 1.54it/s]
67%|██████▋ | 6539/9770 [1:15:55<34:45, 1.55it/s]
67%|██████▋ | 6540/9770 [1:15:55<34:59, 1.54it/s]
+0: {'loss': 0.6504, 'grad_norm': 0.661393630158148, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.67}
+0: {'loss': 0.671, 'grad_norm': 0.6165761208441942, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.67}
+0:
67%|██████▋ | 6540/9770 [1:15:55<34:59, 1.54it/s]
67%|██████▋ | 6541/9770 [1:15:56<35:00, 1.54it/s]
67%|██████▋ | 6542/9770 [1:15:57<34:53, 1.54it/s]
67%|██████▋ | 6543/9770 [1:15:57<35:19, 1.52it/s]
67%|██████▋ | 6544/9770 [1:15:58<35:09, 1.53it/s]
67%|██████▋ | 6545/9770 [1:15:59<35:21, 1.52it/s]
67%|██████▋ | 6546/9770 [1:16:00<40:34, 1.32it/s]
67%|██████▋ | 6547/9770 [1:16:00<42:20, 1.27it/s]
67%|██████▋ | 6548/9770 [1:16:01<41:02, 1.31it/s]
67%|██████▋ | 6549/9770 [1:16:02<38:57, 1.38it/s]
67%|██████▋ | 6550/9770 [1:16:02<37:09, 1.44it/s]
67%|██████▋ | 6550/9770 [1:16:02<37:09, 1.44it/s]
67%|██████▋ | 6551/9770 [1:16:03<36:53, 1.45it/s]
67%|██████▋ | 6552/9770 [1:16:04<36:15, 1.48i
+0: {'loss': 0.6594, 'grad_norm': 0.6211973298546264, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.67}
+0: t/s]
67%|██████▋ | 6553/9770 [1:16:04<35:48, 1.50it/s]
67%|██████▋ | 6554/9770 [1:16:05<35:25, 1.51it/s]
67%|██████▋ | 6555/9770 [1:16:06<35:06, 1.53it/s]
67%|██████▋ | 6556/9770 [1:16:06<34:51, 1.54it/s]
67%|██████▋ | 6557/9770 [1:16:07<35:01, 1.53it/s]
67%|██████▋ | 6558/9770 [1:16:08<40:01, 1.34it/s]
67%|██████▋ | 6559/9770 [1:16:09<38:08, 1.40it/s]
67%|██████▋ | 6560/9770 [1:16:09<42:21, 1.26it/s]
67%|██████▋ | 6560/9770 [1:16:09<42:21, 1.26it/s]
67%|██████▋ | 6561/9770 [1:16:10<40:05, 1.33it/s]
67%|██████▋ | 6562/9770 [1:16:11<38:19, 1.40it/s]
67%|██████▋ | 6563/9770 [1:16:11<37:31, 1.42it/s]
67%|██████▋ | 6564/9770 [1:16:12<36:57, 1.45it/s]
67%|██████▋ | 6565/9770 [1:16:13<35:52, 1.49it/s]
67%|██�
+0: {'loss': 0.6793, 'grad_norm': 0.6803455642466582, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.67}
+0: ��███▋ | 6566/9770 [1:16:13<35:29, 1.50it/s]
67%|██████▋ | 6567/9770 [1:16:14<34:59, 1.53it/s]
67%|██████▋ | 6568/9770 [1:16:15<35:12, 1.52it/s]
67%|██████▋ | 6569/9770 [1:16:16<40:00, 1.33it/s]
67%|██████▋ | 6570/9770 [1:16:16<38:53, 1.37it/s]
67%|██████▋ | 6570/9770 [1:16:16<38:53, 1.37it/s]
67%|██████▋ | 6571/9770 [1:16:17<37:29, 1.42it/s]
67%|██████▋ | 6572/9770 [1:16:18<37:12, 1.43it/s]
67%|██████▋ | 6573/9770 [1:16:18<36:42, 1.45it/s]
67%|██████▋ | 6574/9770 [1:16:19<35:56, 1.48it/s]
67%|██████▋ | 6575/9770 [1:16:20<35:37, 1.49it/s]
67%|██████▋ | 6576/9770 [1:16:20<35:06, 1.52it/s]
67%|██████▋ | 6577/9770 [1:16:21<34:45, 1.53it/s]
67%|██████▋ | 6578/9770 [1:16:22<34:54, 1.52it/s]
67%|██████▋
+0: {'loss': 0.6892, 'grad_norm': 0.6249111655847285, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.67}
+0: {'loss': 0.688, 'grad_norm': 0.6369179943692846, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.67}
+0: | 6579/9770 [1:16:22<35:43, 1.49it/s]
67%|██████▋ | 6580/9770 [1:16:23<35:28, 1.50it/s]
67%|██████▋ | 6580/9770 [1:16:23<35:28, 1.50it/s]
67%|██████▋ | 6581/9770 [1:16:24<35:21, 1.50it/s]
67%|██████▋ | 6582/9770 [1:16:24<34:54, 1.52it/s]
67%|██████▋ | 6583/9770 [1:16:25<34:58, 1.52it/s]
67%|██████▋ | 6584/9770 [1:16:26<34:53, 1.52it/s]
67%|██████▋ | 6585/9770 [1:16:26<34:27, 1.54it/s]
67%|██████▋ | 6586/9770 [1:16:27<34:36, 1.53it/s]
67%|██████▋ | 6587/9770 [1:16:27<34:15, 1.55it/s]
67%|██████▋ | 6588/9770 [1:16:28<34:12, 1.55it/s]
67%|██████▋ | 6589/9770 [1:16:29<34:07, 1.55it/s]
67%|██████▋ | 6590/9770 [1:16:29<34:03, 1.56it/s]
67%|██████▋ | 6590/9770 [1:16:29<34:03, 1
+0: {'loss': 0.6648, 'grad_norm': 0.628182765662115, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.68}
+0: .56it/s]
67%|██████▋ | 6591/9770 [1:16:30<34:23, 1.54it/s]
67%|██████▋ | 6592/9770 [1:16:31<35:12, 1.50it/s]
67%|██████▋ | 6593/9770 [1:16:31<35:05, 1.51it/s]
67%|██████▋ | 6594/9770 [1:16:32<34:55, 1.52it/s]
68%|██████▊ | 6595/9770 [1:16:33<34:12, 1.55it/s]
68%|██████▊ | 6596/9770 [1:16:33<34:15, 1.54it/s]
68%|██████▊ | 6597/9770 [1:16:34<34:08, 1.55it/s]
68%|██████▊ | 6598/9770 [1:16:35<34:18, 1.54it/s]
68%|██████▊ | 6599/9770 [1:16:35<34:11, 1.55it/s]
68%|██████▊ | 6600/9770 [1:16:36<34:04, 1.55it/s]
68%|██████▊ | 6600/9770 [1:16:36<34:04, 1.55it/s]
68%|██████▊ | 6601/9770 [1:16:37<34:25, 1.53it/s]
68%|██████▊ | 6602/9770 [1:16:37<34:22, 1.54it/s]
68%|██████▊ | 6603/9770 [1:16:38<34:24, 1.53it/s]
68%|█
+0: {'loss': 0.6425, 'grad_norm': 0.658591188243479, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.68}
+0: █████▊ | 6604/9770 [1:16:38<33:56, 1.55it/s]
68%|██████▊ | 6605/9770 [1:16:39<33:54, 1.56it/s]
68%|██████▊ | 6606/9770 [1:16:40<34:12, 1.54it/s]
68%|██████▊ | 6607/9770 [1:16:40<34:30, 1.53it/s]
68%|██████▊ | 6608/9770 [1:16:41<34:11, 1.54it/s]
68%|██████▊ | 6609/9770 [1:16:42<34:01, 1.55it/s]
68%|██████▊ | 6610/9770 [1:16:42<34:51, 1.51it/s]
68%|██████▊ | 6610/9770 [1:16:42<34:51, 1.51it/s]
68%|██████▊ | 6611/9770 [1:16:43<34:37, 1.52it/s]
68%|██████▊ | 6612/9770 [1:16:44<34:04, 1.54it/s]
68%|██████▊ | 6613/9770 [1:16:44<34:09, 1.54it/s]
68%|██████▊ | 6614/9770 [1:16:45<33:49, 1.56it/s]
68%|██████▊ | 6615/9770 [1:16:46<34:05, 1.54it/s]
68%|██████▊ | 6616/9770 [1:16:46<33:59, 1.55it/s]
68%|██████�
+0: {'loss': 0.6629, 'grad_norm': 0.6536838426154358, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.68}
+0: � | 6617/9770 [1:16:47<35:03, 1.50it/s]
68%|██████▊ | 6618/9770 [1:16:48<34:58, 1.50it/s]
68%|██████▊ | 6619/9770 [1:16:48<34:44, 1.51it/s]
68%|██████▊ | 6620/9770 [1:16:49<34:28, 1.52it/s]
68%|██████▊ | 6620/9770 [1:16:49<34:28, 1.52it/s]
68%|██████▊ | 6621/9770 [1:16:50<34:11, 1.53it/s]
68%|██████▊ | 6622/9770 [1:16:50<33:43, 1.56it/s]
68%|██████▊ | 6623/9770 [1:16:51<33:42, 1.56it/s]
68%|██████▊ | 6624/9770 [1:16:52<33:49, 1.55it/s]
68%|██████▊ | 6625/9770 [1:16:52<34:09, 1.53it/s]
68%|█████���▊ | 6626/9770 [1:16:53<34:09, 1.53it/s]
68%|██████▊ | 6627/9770 [1:16:54<34:15, 1.53it/s]
68%|██████▊ | 6628/9770 [1:16:54<34:03, 1.54it/s]
68%|██████▊ | 6629/9770 [1:16:55<34:10, 1.53it/s]
68%|██████▊ | 6630/9770 [
+0: {'loss': 0.6884, 'grad_norm': 0.6120148503555509, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.68}
+0: {'loss': 0.683, 'grad_norm': 0.6054140378200428, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.68}
+0: 1:16:55<34:37, 1.51it/s]
68%|██████▊ | 6630/9770 [1:16:55<34:37, 1.51it/s]
68%|██████▊ | 6631/9770 [1:16:56<35:27, 1.48it/s]
68%|██████▊ | 6632/9770 [1:16:57<34:58, 1.50it/s]
68%|██████▊ | 6633/9770 [1:16:58<34:45, 1.50it/s]
68%|██████▊ | 6634/9770 [1:16:58<34:18, 1.52it/s]
68%|██████▊ | 6635/9770 [1:16:59<34:03, 1.53it/s]
68%|██████▊ | 6636/9770 [1:16:59<33:50, 1.54it/s]
68%|██████▊ | 6637/9770 [1:17:00<33:31, 1.56it/s]
68%|██████▊ | 6638/9770 [1:17:01<33:46, 1.55it/s]
68%|██████▊ | 6639/9770 [1:17:01<34:29, 1.51it/s]
68%|██████▊ | 6640/9770 [1:17:02<34:11, 1.53it/s]
68%|██████▊ | 6640/9770 [1:17:02<34:11, 1.53it/s]
68%|██████▊ | 6641/9770 [1:17:03<34:08, 1.53it/s]
68%
+0: {'loss': 0.6588, 'grad_norm': 0.6760128021442565, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.68}
+0: |██████▊ | 6642/9770 [1:17:03<34:52, 1.50it/s]
68%|██████▊ | 6643/9770 [1:17:04<34:32, 1.51it/s]
68%|██████▊ | 6644/9770 [1:17:05<34:13, 1.52it/s]
68%|██████▊ | 6645/9770 [1:17:05<34:16, 1.52it/s]
68%|██████▊ | 6646/9770 [1:17:06<34:20, 1.52it/s]
68%|██████▊ | 6647/9770 [1:17:07<34:51, 1.49it/s]
68%|██████▊ | 6648/9770 [1:17:07<34:43, 1.50it/s]
68%|██████▊ | 6649/9770 [1:17:08<34:25, 1.51it/s]
68%|██████▊ | 6650/9770 [1:17:09<34:05, 1.53it/s]
68%|██████▊ | 6650/9770 [1:17:09<34:05, 1.53it/s]
68%|██████▊ | 6651/9770 [1:17:09<34:20, 1.51it/s]
68%|██████▊ | 6652/9770 [1:17:10<34:52, 1.49it/s]
68%|██████▊ | 6653/9770 [1:17:11<34:20, 1.51it/s]
68%|██████▊ | 6654/9770 [1:17:11<34:57, 1.49it/s]
68%|█████�
+0: {'loss': 0.675, 'grad_norm': 0.6295464781057989, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.68}
+0: ��▊ | 6655/9770 [1:17:12<34:28, 1.51it/s]
68%|██████▊ | 6656/9770 [1:17:13<34:11, 1.52it/s]
68%|██████▊ | 6657/9770 [1:17:13<34:08, 1.52it/s]
68%|██████▊ | 6658/9770 [1:17:14<34:02, 1.52it/s]
68%|██████▊ | 6659/9770 [1:17:15<34:16, 1.51it/s]
68%|██████▊ | 6660/9770 [1:17:15<33:40, 1.54it/s]
68%|██████▊ | 6660/9770 [1:17:15<33:40, 1.54it/s]
68%|██████▊ | 6661/9770 [1:17:16<34:31, 1.50it/s]
68%|██████▊ | 6662/9770 [1:17:17<34:37, 1.50it/s]
68%|██████▊ | 6663/9770 [1:17:17<34:29, 1.50it/s]
68%|██████▊ | 6664/9770 [1:17:18<34:44, 1.49it/s]
68%|██████▊ | 6665/9770 [1:17:19<34:58, 1.48it/s]
68%|██████▊ | 6666/9770 [1:17:19<34:35, 1.50it/s]
68%|██████▊ | 6667/9770 [1:17:20<34:33, 1.50it/s]
68%|██████▊ | 6668/97
+0: {'loss': 0.6701, 'grad_norm': 0.6227957466848296, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.68}
+0: {'loss': 0.6755, 'grad_norm': 0.7060683051148555, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.68}
+0: 70 [1:17:21<35:01, 1.48it/s]
68%|██████▊ | 6669/9770 [1:17:21<34:38, 1.49it/s]
68%|██████▊ | 6670/9770 [1:17:22<34:35, 1.49it/s]
68%|██████▊ | 6670/9770 [1:17:22<34:35, 1.49it/s]
68%|██████▊ | 6671/9770 [1:17:23<34:41, 1.49it/s]
68%|██████▊ | 6672/9770 [1:17:23<34:12, 1.51it/s]
68%|██████▊ | 6673/9770 [1:17:24<34:11, 1.51it/s]
68%|██████▊ | 6674/9770 [1:17:25<34:47, 1.48it/s]
68%|██████▊ | 6675/9770 [1:17:25<34:25, 1.50it/s]
68%|██████▊ | 6676/9770 [1:17:26<34:05, 1.51it/s]
68%|██████▊ | 6677/9770 [1:17:27<34:15, 1.50it/s]
68%|██████▊ | 6678/9770 [1:17:27<34:57, 1.47it/s]
68%|██████▊ | 6679/9770 [1:17:28<35:04, 1.47it/s]
68%|██████▊ | 6680/9770 [1:17:29<34:42, 1.48it/s]
+0: {'loss': 0.6844, 'grad_norm': 0.6717450705216048, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.68}
+0: 68%|██████▊ | 6680/9770 [1:17:29<34:42, 1.48it/s]
68%|██████▊ | 6681/9770 [1:17:29<34:43, 1.48it/s]
68%|██████▊ | 6682/9770 [1:17:30<34:40, 1.48it/s]
68%|██████▊ | 6683/9770 [1:17:31<34:21, 1.50it/s]
68%|██████▊ | 6684/9770 [1:17:31<34:09, 1.51it/s]
68%|██████▊ | 6685/9770 [1:17:32<34:38, 1.48it/s]
68%|██████▊ | 6686/9770 [1:17:33<34:36, 1.48it/s]
68%|██████▊ | 6687/9770 [1:17:33<34:24, 1.49it/s]
68%|██████▊ | 6688/9770 [1:17:34<33:52, 1.52it/s]
68%|██████▊ | 6689/9770 [1:17:35<33:47, 1.52it/s]
68%|██████▊ | 6690/9770 [1:17:35<33:32, 1.53it/s]
68%|██████▊ | 6690/9770 [1:17:35<33:32, 1.53it/s]
68%|██████▊ | 6691/9770 [1:17:36<34:04, 1.51it/s]
68%|██████▊ | 6692/9770 [1:17:37<33:21, 1.54it/s]
69%|████
+0: {'loss': 0.6895, 'grad_norm': 0.6596544990822748, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.69}
+0: ██▊ | 6693/9770 [1:17:37<33:30, 1.53it/s]
69%|██████▊ | 6694/9770 [1:17:38<33:44, 1.52it/s]
69%|██████▊ | 6695/9770 [1:17:39<33:30, 1.53it/s]
69%|██████▊ | 6696/9770 [1:17:39<33:27, 1.53it/s]
69%|██████▊ | 6697/9770 [1:17:40<33:02, 1.55it/s]
69%|██████▊ | 6698/9770 [1:17:41<33:38, 1.52it/s]
69%|██████▊ | 6699/9770 [1:17:41<33:44, 1.52it/s]
69%|██████▊ | 6700/9770 [1:17:42<33:32, 1.53it/s]
69%|██████▊ | 6700/9770 [1:17:42<33:32, 1.53it/s]
69%|██████▊ | 6701/9770 [1:17:43<33:22, 1.53it/s]
69%|██████▊ | 6702/9770 [1:17:43<32:56, 1.55it/s]
69%|██████▊ | 6703/9770 [1:17:44<33:09, 1.54it/s]
69%|██████▊ | 6704/9770 [1:17:44<33:11, 1.54it/s]
69%|██████▊ | 6705/9770 [1:17:45<33:30, 1.52it/s]
69%|██████▊ | 670
+0: {'loss': 0.6675, 'grad_norm': 0.6793583770539431, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.69}
+0: 6/9770 [1:17:46<33:32, 1.52it/s]
69%|██████▊ | 6707/9770 [1:17:46<33:23, 1.53it/s]
69%|██████▊ | 6708/9770 [1:17:47<33:55, 1.50it/s]
69%|██████▊ | 6709/9770 [1:17:48<33:25, 1.53it/s]
69%|██████▊ | 6710/9770 [1:17:48<33:38, 1.52it/s]
69%|██████▊ | 6710/9770 [1:17:48<33:38, 1.52it/s]
69%|██████▊ | 6711/9770 [1:17:49<33:44, 1.51it/s]
69%|██████▊ | 6712/9770 [1:17:50<33:14, 1.53it/s]
69%|██████▊ | 6713/9770 [1:17:50<33:45, 1.51it/s]
69%|██████▊ | 6714/9770 [1:17:51<33:24, 1.52it/s]
69%|██████▊ | 6715/9770 [1:17:52<33:27, 1.52it/s]
69%|██████▊ | 6716/9770 [1:17:52<33:34, 1.52it/s]
69%|██████▉ | 6717/9770 [1:17:53<33:18, 1.53it/s]
69%|██████▉ | 6718/9770 [1:17:54<32:58, 1.54it/s]
69%|██████▉ | 6719/9770 [1:17:54<3
+0: {'loss': 0.6569, 'grad_norm': 0.6822316189326495, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.69}
+0: {'loss': 0.6515, 'grad_norm': 0.5938488492227458, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.69}
+0: 2:42, 1.55it/s]
69%|██████▉ | 6720/9770 [1:17:55<33:28, 1.52it/s]
69%|██████▉ | 6720/9770 [1:17:55<33:28, 1.52it/s]
69%|██████▉ | 6721/9770 [1:17:56<33:20, 1.52it/s]
69%|██████▉ | 6722/9770 [1:17:56<33:28, 1.52it/s]
69%|██████▉ | 6723/9770 [1:17:57<33:02, 1.54it/s]
69%|██████▉ | 6724/9770 [1:17:58<32:35, 1.56it/s]
69%|██████▉ | 6725/9770 [1:17:58<32:39, 1.55it/s]
69%|██████▉ | 6726/9770 [1:17:59<32:31, 1.56it/s]
69%|██████▉ | 6727/9770 [1:18:00<32:52, 1.54it/s]
69%|██████▉ | 6728/9770 [1:18:00<32:56, 1.54it/s]
69%|██████▉ | 6729/9770 [1:18:01<33:15, 1.52it/s]
69%|██████▉ | 6730/9770 [1:18:01<33:09, 1.53it/s]
69%|██████▉ | 6730/9770 [1:18:01<33:09, 1.53it/s]
69%|██�
+0: {'loss': 0.6472, 'grad_norm': 0.615696959525628, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.69}
+0: �███▉ | 6731/9770 [1:18:02<33:03, 1.53it/s]
69%|██████▉ | 6732/9770 [1:18:03<33:08, 1.53it/s]
69%|██████▉ | 6733/9770 [1:18:03<32:38, 1.55it/s]
69%|██████▉ | 6734/9770 [1:18:04<33:11, 1.52it/s]
69%|██████▉ | 6735/9770 [1:18:05<32:51, 1.54it/s]
69%|██████▉ | 6736/9770 [1:18:05<33:02, 1.53it/s]
69%|██████▉ | 6737/9770 [1:18:06<32:58, 1.53it/s]
69%|██████▉ | 6738/9770 [1:18:07<32:28, 1.56it/s]
69%|██████▉ | 6739/9770 [1:18:07<32:46, 1.54it/s]
69%|██████▉ | 6740/9770 [1:18:08<32:55, 1.53it/s]
69%|██████▉ | 6740/9770 [1:18:08<32:55, 1.53it/s]
69%|██████▉ | 6741/9770 [1:18:09<36:11, 1.39it/s]
69%|██████▉ | 6742/9770 [1:18:10<35:16, 1.43it/s]
69%|██████▉ | 6743/9770 [1:18:10<34:56, 1.44it/s]
69%|██████▉ |
+0: {'loss': 0.6534, 'grad_norm': 0.6151599925839777, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.69}
+0: 6744/9770 [1:18:11<34:26, 1.46it/s]
69%|██████▉ | 6745/9770 [1:18:11<33:40, 1.50it/s]
69%|██████▉ | 6746/9770 [1:18:12<32:56, 1.53it/s]
69%|██████▉ | 6747/9770 [1:18:13<32:27, 1.55it/s]
69%|██████▉ | 6748/9770 [1:18:13<32:39, 1.54it/s]
69%|██████▉ | 6749/9770 [1:18:14<32:52, 1.53it/s]
69%|██████▉ | 6750/9770 [1:18:15<32:46, 1.54it/s]
69%|██████▉ | 6750/9770 [1:18:15<32:46, 1.54it/s]
69%|██████▉ | 6751/9770 [1:18:15<33:29, 1.50it/s]
69%|██████▉ | 6752/9770 [1:18:16<33:06, 1.52it/s]
69%|██████▉ | 6753/9770 [1:18:17<32:49, 1.53it/s]
69%|██████▉ | 6754/9770 [1:18:17<32:53, 1.53it/s]
69%|██████▉ | 6755/9770 [1:18:18<32:48, 1.53it/s]
69%|██████▉ | 6756/9770 [1:18:19<32:27, 1.55it/s]
69%|██████▉ | 6757/9770 [1:18:
+0: {'loss': 0.6588, 'grad_norm': 0.6786478314661076, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.69}
+0: 19<32:28, 1.55it/s]
69%|██████▉ | 6758/9770 [1:18:20<32:25, 1.55it/s]
69%|██████▉ | 6759/9770 [1:18:21<32:32, 1.54it/s]
69%|██████▉ | 6760/9770 [1:18:21<32:51, 1.53it/s]
69%|██████▉ | 6760/9770 [1:18:21<32:51, 1.53it/s]
69%|██████▉ | 6761/9770 [1:18:22<33:11, 1.51it/s]
69%|██████▉ | 6762/9770 [1:18:23<33:01, 1.52it/s]
69%|██████▉ | 6763/9770 [1:18:23<32:58, 1.52it/s]
69%|██████▉ | 6764/9770 [1:18:24<32:37, 1.54it/s]
69%|██████▉ | 6765/9770 [1:18:24<32:28, 1.54it/s]
69%|██████▉ | 6766/9770 [1:18:25<32:58, 1.52it/s]
69%|██████▉ | 6767/9770 [1:18:26<33:20, 1.50it/s]
69%|██████▉ | 6768/9770 [1:18:27<33:13, 1.51it/s]
69%|██████▉ | 6769/9770 [1:18:27<32:40, 1.53it/s]
69%|██████▉ | 6770/9770 [1:18:28<32:24, 1.54it
+0: {'loss': 0.6578, 'grad_norm': 0.575305858631143, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.69}
+0: {'loss': 0.673, 'grad_norm': 0.5767315509497407, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.69}
+0: /s]
69%|██████▉ | 6770/9770 [1:18:28<32:24, 1.54it/s]
69%|██████▉ | 6771/9770 [1:18:28<32:21, 1.54it/s]
69%|██████▉ | 6772/9770 [1:18:29<32:10, 1.55it/s]
69%|██████▉ | 6773/9770 [1:18:30<32:03, 1.56it/s]
69%|██████�� | 6774/9770 [1:18:30<32:17, 1.55it/s]
69%|██████▉ | 6775/9770 [1:18:31<32:24, 1.54it/s]
69%|██████▉ | 6776/9770 [1:18:32<32:34, 1.53it/s]
69%|██████▉ | 6777/9770 [1:18:32<32:28, 1.54it/s]
69%|██████▉ | 6778/9770 [1:18:33<32:21, 1.54it/s]
69%|██████▉ | 6779/9770 [1:18:34<32:39, 1.53it/s]
69%|██████▉ | 6780/9770 [1:18:34<32:12, 1.55it/s]
69%|██████▉ | 6780/9770 [1:18:34<32:12, 1.55it/s]
69%|██████▉ | 6781/9770 [1:18:35<32:04, 1.55it/s]
69%|██████▉
+0: {'loss': 0.6468, 'grad_norm': 0.6239627762349214, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.69}
+0: | 6782/9770 [1:18:36<32:20, 1.54it/s]
69%|██████▉ | 6783/9770 [1:18:36<33:11, 1.50it/s]
69%|██████▉ | 6784/9770 [1:18:37<33:48, 1.47it/s]
69%|██████▉ | 6785/9770 [1:18:38<33:13, 1.50it/s]
69%|██████▉ | 6786/9770 [1:18:38<33:01, 1.51it/s]
69%|██████▉ | 6787/9770 [1:18:39<32:37, 1.52it/s]
69%|██████▉ | 6788/9770 [1:18:40<32:27, 1.53it/s]
69%|██████▉ | 6789/9770 [1:18:40<32:24, 1.53it/s]
69%|██████▉ | 6790/9770 [1:18:41<32:26, 1.53it/s]
69%|██████▉ | 6790/9770 [1:18:41<32:26, 1.53it/s]
70%|██████▉ | 6791/9770 [1:18:42<32:06, 1.55it/s]
70%|██████▉ | 6792/9770 [1:18:42<31:46, 1.56it/s]
70%|██████▉ | 6793/9770 [1:18:43<31:56, 1.55it/s]
70%|██████▉ | 6794/9770 [1:18:43<31:43, 1.56it/s]
70%|██████▉ | 6795/9770 [1
+0: {'loss': 0.6748, 'grad_norm': 0.6029363948297037, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.7}
+0: :18:44<32:01, 1.55it/s]
70%|██████▉ | 6796/9770 [1:18:45<33:01, 1.50it/s]
70%|██████▉ | 6797/9770 [1:18:45<32:44, 1.51it/s]
70%|██████▉ | 6798/9770 [1:18:46<32:42, 1.51it/s]
70%|██████▉ | 6799/9770 [1:18:47<32:46, 1.51it/s]
70%|██████▉ | 6800/9770 [1:18:47<32:36, 1.52it/s]
70%|██████▉ | 6800/9770 [1:18:47<32:36, 1.52it/s]
70%|██████▉ | 6801/9770 [1:18:48<32:14, 1.53it/s]
70%|██████▉ | 6802/9770 [1:18:49<32:26, 1.52it/s]
70%|██████▉ | 6803/9770 [1:18:49<32:41, 1.51it/s]
70%|██████▉ | 6804/9770 [1:18:50<32:31, 1.52it/s]
70%|██████▉ | 6805/9770 [1:18:51<32:28, 1.52it/s]
70%|██████▉ | 6806/9770 [1:18:51<32:20, 1.53it/s]
70%|██████▉ | 6807/9770 [1:18:52<32:18, 1.53it/s]
70%|██████▉ | 6808/9770 [1:18:53<32:09, 1.
+0: {'loss': 0.6581, 'grad_norm': 0.575547301774997, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.7}
+0: {'loss': 0.6576, 'grad_norm': 0.6123159396652619, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.7}
+0: 54it/s]
70%|██████▉ | 6809/9770 [1:18:53<31:37, 1.56it/s]
70%|██████▉ | 6810/9770 [1:18:54<31:52, 1.55it/s]
70%|██████▉ | 6810/9770 [1:18:54<31:52, 1.55it/s]
70%|██████▉ | 6811/9770 [1:18:55<32:07, 1.54it/s]
70%|██████▉ | 6812/9770 [1:18:55<31:45, 1.55it/s]
70%|██████▉ | 6813/9770 [1:18:56<31:57, 1.54it/s]
70%|██████▉ | 6814/9770 [1:18:57<31:57, 1.54it/s]
70%|██████▉ | 6815/9770 [1:18:57<31:58, 1.54it/s]
70%|██████▉ | 6816/9770 [1:18:58<32:05, 1.53it/s]
70%|██████▉ | 6817/9770 [1:18:58<31:52, 1.54it/s]
70%|██████▉ | 6818/9770 [1:18:59<32:03, 1.53it/s]
70%|██████▉ | 6819/9770 [1:19:00<32:48, 1.50it/s]
70%|██████▉ | 6820/9770 [1:19:00<32:37, 1.51it/s]
70%|█████�
+0: {'loss': 0.6586, 'grad_norm': 0.6278886841949828, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.7}
+0: �▉ | 6820/9770 [1:19:00<32:37, 1.51it/s]
70%|██████▉ | 6821/9770 [1:19:01<32:50, 1.50it/s]
70%|██████▉ | 6822/9770 [1:19:02<32:29, 1.51it/s]
70%|██████▉ | 6823/9770 [1:19:02<32:06, 1.53it/s]
70%|██████▉ | 6824/9770 [1:19:03<31:48, 1.54it/s]
70%|██████▉ | 6825/9770 [1:19:04<31:28, 1.56it/s]
70%|██████▉ | 6826/9770 [1:19:04<32:20, 1.52it/s]
70%|██████▉ | 6827/9770 [1:19:05<32:37, 1.50it/s]
70%|██████▉ | 6828/9770 [1:19:06<32:32, 1.51it/s]
70%|██████▉ | 6829/9770 [1:19:06<32:56, 1.49it/s]
70%|██████▉ | 6830/9770 [1:19:07<32:43, 1.50it/s]
70%|██████▉ | 6830/9770 [1:19:07<32:43, 1.50it/s]
70%|██████▉ | 6831/9770 [1:19:08<32:47, 1.49it/s]
70%|██████▉ | 6832/9770 [1:19:08<32:25, 1.51it/s]
70%|██████▉ | 6833/977
+0: {'loss': 0.6368, 'grad_norm': 0.5834538267684685, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.7}
+0: 0 [1:19:09<32:24, 1.51it/s]
70%|██████▉ | 6834/9770 [1:19:10<32:01, 1.53it/s]
70%|██████▉ | 6835/9770 [1:19:10<32:42, 1.50it/s]
70%|██████▉ | 6836/9770 [1:19:11<32:50, 1.49it/s]
70%|██████▉ | 6837/9770 [1:19:12<32:38, 1.50it/s]
70%|██████▉ | 6838/9770 [1:19:12<32:28, 1.51it/s]
70%|███████ | 6839/9770 [1:19:13<32:23, 1.51it/s]
70%|███████ | 6840/9770 [1:19:14<32:00, 1.53it/s]
70%|███████ | 6840/9770 [1:19:14<32:00, 1.53it/s]
70%|███████ | 6841/9770 [1:19:14<32:05, 1.52it/s]
70%|███████ | 6842/9770 [1:19:15<31:52, 1.53it/s]
70%|███████ | 6843/9770 [1:19:16<31:44, 1.54it/s]
70%|███████ | 6844/9770 [1:19:17<35:10, 1.39it/s]
70%|███████ | 6845/9770 [1:19:17<34:11, 1.43it/s]
70%|███████ | 6846/9770 [1:19:18<32:57,
+0: {'loss': 0.6574, 'grad_norm': 0.5921325280693264, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.7}
+0: 1.48it/s]
70%|███████ | 6847/9770 [1:19:18<32:08, 1.52it/s]
70%|███████ | 6848/9770 [1:19:19<31:56, 1.52it/s]
70%|███████ | 6849/9770 [1:19:20<31:47, 1.53it/s]
70%|███████ | 6850/9770 [1:19:20<32:27, 1.50it/s]
70%|███████ | 6850/9770 [1:19:20<32:27, 1.50it/s]
70%|███████ | 6851/9770 [1:19:21<32:04, 1.52it/s]
70%|███████ | 6852/9770 [1:19:22<32:05, 1.52it/s]
70%|███████ | 6853/9770 [1:19:22<32:48, 1.48it/s]
70%|███████ | 6854/9770 [1:19:23<32:14, 1.51it/s]
70%|███████ | 6855/9770 [1:19:24<32:11, 1.51it/s]
70%|███████ | 6856/9770 [1:19:24<32:04, 1.51it/s]
70%|███████ | 6857/9770 [1:19:25<32:25, 1.50it/s]
70%|███████ | 6858/9770 [1:19:26<32:05, 1.51it/s]
70%|███████ | 6859/9770 [1:19:26<31:59, 1.52it/s]
70%|
+0: {'loss': 0.6533, 'grad_norm': 0.5825754139324131, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.7}
+0: {'loss': 0.6486, 'grad_norm': 0.5925507355304772, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.7}
+0: ███████ | 6860/9770 [1:19:27<31:30, 1.54it/s]
70%|███████ | 6860/9770 [1:19:27<31:30, 1.54it/s]
70%|███████ | 6861/9770 [1:19:28<31:20, 1.55it/s]
70%|███████ | 6862/9770 [1:19:28<31:21, 1.55it/s]
70%|███████ | 6863/9770 [1:19:29<31:21, 1.54it/s]
70%|███████ | 6864/9770 [1:19:30<32:13, 1.50it/s]
70%|███████ | 6865/9770 [1:19:30<31:57, 1.52it/s]
70%|███████ | 6866/9770 [1:19:31<31:16, 1.55it/s]
70%|███████ | 6867/9770 [1:19:32<31:15, 1.55it/s]
70%|███████ | 6868/9770 [1:19:32<31:15, 1.55it/s]
70%|███████ | 6869/9770 [1:19:33<31:24, 1.54it/s]
70%|███████ | 6870/9770 [1:19:33<31:18, 1.54it/s]
70%|███████ | 6870/9770 [1:19:33<31:18, 1.54it/s]
70%|███████ | 6871
+0: {'loss': 0.6388, 'grad_norm': 0.5718033196205995, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.7}
+0: /9770 [1:19:34<31:29, 1.53it/s]
70%|███████ | 6872/9770 [1:19:35<31:55, 1.51it/s]
70%|███████ | 6873/9770 [1:19:35<31:44, 1.52it/s]
70%|███████ | 6874/9770 [1:19:36<31:43, 1.52it/s]
70%|███████ | 6875/9770 [1:19:37<31:54, 1.51it/s]
70%|███████ | 6876/9770 [1:19:37<31:58, 1.51it/s]
70%|███████ | 6877/9770 [1:19:38<31:55, 1.51it/s]
70%|███████ | 6878/9770 [1:19:39<31:21, 1.54it/s]
70%|███████ | 6879/9770 [1:19:39<31:39, 1.52it/s]
70%|███████ | 6880/9770 [1:19:40<31:25, 1.53it/s]
70%|███████ | 6880/9770 [1:19:40<31:25, 1.53it/s]
70%|███████ | 6881/9770 [1:19:41<31:19, 1.54it/s]
70%|███████ | 6882/9770 [1:19:41<31:02, 1.55it/s]
70%|███████ | 6883/9770 [1:19:42<31:13, 1.54it/s]
70%|███████ | 6884/9770 [1:19:43<31
+0: {'loss': 0.6693, 'grad_norm': 0.6239262658093406, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.7}
+0: :08, 1.54it/s]
70%|███████ | 6885/9770 [1:19:43<31:16, 1.54it/s]
70%|███████ | 6886/9770 [1:19:44<31:22, 1.53it/s]
70%|███████ | 6887/9770 [1:19:45<30:56, 1.55it/s]
71%|███████ | 6888/9770 [1:19:45<31:04, 1.55it/s]
71%|███████ | 6889/9770 [1:19:46<31:14, 1.54it/s]
71%|███████ | 6890/9770 [1:19:47<31:23, 1.53it/s]
71%|███████ | 6890/9770 [1:19:47<31:23, 1.53it/s]
71%|███████ | 6891/9770 [1:19:47<31:35, 1.52it/s]
71%|███████ | 6892/9770 [1:19:48<32:08, 1.49it/s]
71%|███████ | 6893/9770 [1:19:49<31:46, 1.51it/s]
71%|███████ | 6894/9770 [1:19:49<32:23, 1.48it/s]
71%|███████ | 6895/9770 [1:19:50<31:45, 1.51it/s]
71%|███████ | 6896/9770 [1:19:51<31:21, 1.53it/s]
71%|███████ | 6897/9770 [1:19:51<31:07, 1.54it/s]
+0: {'loss': 0.6558, 'grad_norm': 0.6352474947058601, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.71}
+0: 71%|███████ | 6898/9770 [1:19:52<31:11, 1.53it/s]
71%|███████ | 6899/9770 [1:19:53<31:11, 1.53it/s]
71%|███████ | 6900/9770 [1:19:53<31:50, 1.50it/s]
71%|███████ | 6900/9770 [1:19:53<31:50, 1.50it/s]
71%|███████ | 6901/9770 [1:19:54<31:34, 1.51it/s]
71%|███████ | 6902/9770 [1:19:54<31:08, 1.54it/s]
71%|███████ | 6903/9770 [1:19:55<30:46, 1.55it/s]
71%|███████ | 6904/9770 [1:19:56<30:46, 1.55it/s]
71%|███████ | 6905/9770 [1:19:56<30:58, 1.54it/s]
71%|███████ | 6906/9770 [1:19:57<31:15, 1.53it/s]
71%|███████ | 6907/9770 [1:19:58<30:59, 1.54it/s]
71%|███████ | 6908/9770 [1:19:58<30:55, 1.54it/s]
71%|███████ | 6909/9770 [1:19:59<31:27, 1.52it/s]
71%|███████ | 6910/9770 [1:20:00<31:30, 1.51it/s]
+0: {'loss': 0.6502, 'grad_norm': 0.6361891715027244, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.71}
+0: {'loss': 0.6261, 'grad_norm': 0.5857497709112279, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.71}
+0:
71%|███████ | 6910/9770 [1:20:00<31:30, 1.51it/s]
71%|███████ | 6911/9770 [1:20:00<31:33, 1.51it/s]
71%|███████ | 6912/9770 [1:20:01<31:20, 1.52it/s]
71%|███████ | 6913/9770 [1:20:02<31:09, 1.53it/s]
71%|███████ | 6914/9770 [1:20:02<31:50, 1.49it/s]
71%|███████ | 6915/9770 [1:20:03<31:38, 1.50it/s]
71%|███████ | 6916/9770 [1:20:04<31:18, 1.52it/s]
71%|███████ | 6917/9770 [1:20:04<31:16, 1.52it/s]
71%|███████ | 6918/9770 [1:20:05<31:16, 1.52it/s]
71%|███████ | 6919/9770 [1:20:06<31:21, 1.52it/s]
71%|███████ | 6920/9770 [1:20:06<31:25, 1.51it/s]
71%|███████ | 6920/9770 [1:20:06<31:25, 1.51it/s]
71%|███████ | 6921/9770 [1:20:07<31:36, 1.50it/s]
71%|███████ | 6922/9770 [1:20:0
+0: {'loss': 0.6744, 'grad_norm': 0.6224643305708777, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.71}
+0: 8<31:24, 1.51it/s]
71%|███████ | 6923/9770 [1:20:08<31:10, 1.52it/s]
71%|███████ | 6924/9770 [1:20:09<31:13, 1.52it/s]
71%|███████ | 6925/9770 [1:20:10<31:28, 1.51it/s]
71%|███████ | 6926/9770 [1:20:10<31:16, 1.52it/s]
71%|███████ | 6927/9770 [1:20:11<31:53, 1.49it/s]
71%|███████ | 6928/9770 [1:20:12<31:17, 1.51it/s]
71%|███████ | 6929/9770 [1:20:12<31:05, 1.52it/s]
71%|███████ | 6930/9770 [1:20:13<30:49, 1.54it/s]
71%|███████ | 6930/9770 [1:20:13<30:49, 1.54it/s]
71%|███████ | 6931/9770 [1:20:14<30:44, 1.54it/s]
71%|███████ | 6932/9770 [1:20:14<30:27, 1.55it/s]
71%|███████ | 6933/9770 [1:20:15<30:23, 1.56it/s]
71%|███████ | 6934/9770 [1:20:15<30:22, 1.56it/s]
71%|███████ | 6935/9770 [1:20:16<30:37, 1.54it/
+0: {'loss': 0.6668, 'grad_norm': 0.6262758144806874, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.71}
+0: s]
71%|███████ | 6936/9770 [1:20:17<30:41, 1.54it/s]
71%|███████ | 6937/9770 [1:20:17<30:20, 1.56it/s]
71%|███████ | 6938/9770 [1:20:18<30:24, 1.55it/s]
71%|███████ | 6939/9770 [1:20:19<30:33, 1.54it/s]
71%|███████ | 6940/9770 [1:20:19<30:21, 1.55it/s]
71%|███████ | 6940/9770 [1:20:19<30:21, 1.55it/s]
71%|███████ | 6941/9770 [1:20:20<30:34, 1.54it/s]
71%|███████ | 6942/9770 [1:20:21<30:50, 1.53it/s]
71%|███████ | 6943/9770 [1:20:21<30:49, 1.53it/s]
71%|███████ | 6944/9770 [1:20:22<30:55, 1.52it/s]
71%|███████ | 6945/9770 [1:20:23<30:43, 1.53it/s]
71%|███████ | 6946/9770 [1:20:23<31:24, 1.50it/s]
71%|███████ | 6947/9770 [1:20:24<31:02, 1.52it/s]
71%|███████ | 6948/9770 [1:20:25<31:04, 1.51it/s]
71%|███
+0: {'loss': 0.6506, 'grad_norm': 0.6208488686671941, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.71}
+0: {'loss': 0.6686, 'grad_norm': 0.5927485796495109, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.71}
+0: ████ | 6949/9770 [1:20:25<30:26, 1.54it/s]
71%|███████ | 6950/9770 [1:20:26<30:19, 1.55it/s]
71%|███████ | 6950/9770 [1:20:26<30:19, 1.55it/s]
71%|███████ | 6951/9770 [1:20:27<31:13, 1.50it/s]
71%|███████ | 6952/9770 [1:20:27<31:04, 1.51it/s]
71%|███████ | 6953/9770 [1:20:28<30:54, 1.52it/s]
71%|███████ | 6954/9770 [1:20:29<30:37, 1.53it/s]
71%|███████ | 6955/9770 [1:20:29<30:09, 1.56it/s]
71%|███████ | 6956/9770 [1:20:30<30:11, 1.55it/s]
71%|███████ | 6957/9770 [1:20:30<30:02, 1.56it/s]
71%|███████ | 6958/9770 [1:20:31<29:52, 1.57it/s]
71%|███████ | 6959/9770 [1:20:32<30:06, 1.56it/s]
71%|███████ | 6960/9770 [1:20:32<30:03, 1.56it/s]
71%|███████ | 6960/9770 [1:
+0: {'loss': 0.6243, 'grad_norm': 0.5629670801469756, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.71}
+0: 20:32<30:03, 1.56it/s]
71%|███████ | 6961/9770 [1:20:33<30:41, 1.53it/s]
71%|███████▏ | 6962/9770 [1:20:34<31:08, 1.50it/s]
71%|███████▏ | 6963/9770 [1:20:34<30:31, 1.53it/s]
71%|███████▏ | 6964/9770 [1:20:35<30:13, 1.55it/s]
71%|███████▏ | 6965/9770 [1:20:36<30:02, 1.56it/s]
71%|███████▏ | 6966/9770 [1:20:36<30:03, 1.56it/s]
71%|███████▏ | 6967/9770 [1:20:37<30:22, 1.54it/s]
71%|███████▏ | 6968/9770 [1:20:38<30:14, 1.54it/s]
71%|███████▏ | 6969/9770 [1:20:38<30:10, 1.55it/s]
71%|███████▏ | 6970/9770 [1:20:39<30:01, 1.55it/s]
71%|███████▏ | 6970/9770 [1:20:39<30:01, 1.55it/s]
71%|███████▏ | 6971/9770 [1:20:40<30:08, 1.55it/s]
71%|███████▏ | 6972/9770 [1:20:40<30:17, 1.54it/s]
71%|███████▏ | 6973
+0: {'loss': 0.6526, 'grad_norm': 0.6561745459175042, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.71}
+0: /9770 [1:20:41<30:35, 1.52it/s]
71%|███████▏ | 6974/9770 [1:20:41<30:17, 1.54it/s]
71%|███████▏ | 6975/9770 [1:20:42<30:16, 1.54it/s]
71%|███████▏ | 6976/9770 [1:20:43<30:04, 1.55it/s]
71%|███████▏ | 6977/9770 [1:20:43<30:00, 1.55it/s]
71%|███████▏ | 6978/9770 [1:20:44<30:01, 1.55it/s]
71%|███████▏ | 6979/9770 [1:20:45<30:52, 1.51it/s]
71%|███████▏ | 6980/9770 [1:20:45<31:21, 1.48it/s]
71%|███████▏ | 6980/9770 [1:20:45<31:21, 1.48it/s]
71%|███████▏ | 6981/9770 [1:20:46<30:56, 1.50it/s]
71%|███████▏ | 6982/9770 [1:20:47<31:26, 1.48it/s]
71%|███████▏ | 6983/9770 [1:20:47<31:09, 1.49it/s]
71%|███████▏ | 6984/9770 [1:20:48<30:39, 1.51it/s]
71%|███████▏ | 6985/9770 [1:20:49<30:22, 1.53it/s]
72%|███████
+0: {'loss': 0.6542, 'grad_norm': 0.6079073579913449, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.72}
+0: ▏ | 6986/9770 [1:20:49<30:12, 1.54it/s]
72%|███████▏ | 6987/9770 [1:20:50<29:45, 1.56it/s]
72%|███████▏ | 6988/9770 [1:20:51<30:24, 1.52it/s]
72%|███████▏ | 6989/9770 [1:20:51<30:25, 1.52it/s]
72%|███████▏ | 6990/9770 [1:20:52<30:30, 1.52it/s]
72%|███████▏ | 6990/9770 [1:20:52<30:30, 1.52it/s]
72%|███████▏ | 6991/9770 [1:20:53<30:20, 1.53it/s]
72%|███████▏ | 6992/9770 [1:20:53<30:25, 1.52it/s]
72%|███████▏ | 6993/9770 [1:20:54<30:13, 1.53it/s]
72%|███████▏ | 6994/9770 [1:20:55<30:23, 1.52it/s]
72%|███████▏ | 6995/9770 [1:20:55<29:55, 1.55it/s]
72%|███████▏ | 6996/9770 [1:20:56<30:26, 1.52it/s]
72%|███████▏ | 6997/9770 [1:20:57<30:03, 1.54it/s]
72%|███████▏ | 6998/9770 [1:20:57<30:04, 1.54it/s]
72%|███�
+0: {'loss': 0.6488, 'grad_norm': 0.5753313691472444, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.72}
+0: {'loss': 0.6336, 'grad_norm': 0.5944756617514969, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.72}
+0: ��███▏ | 6999/9770 [1:20:58<30:08, 1.53it/s]
72%|███████▏ | 7000/9770 [1:20:59<29:56, 1.54it/s]
72%|███████▏ | 7000/9770 [1:20:59<29:56, 1.54it/s]
72%|███████▏ | 7001/9770 [1:20:59<30:07, 1.53it/s]
72%|███████▏ | 7002/9770 [1:21:00<29:51, 1.55it/s]
72%|███████▏ | 7003/9770 [1:21:00<29:22, 1.57it/s]
72%|███████▏ | 7004/9770 [1:21:01<29:00, 1.59it/s]
72%|███████▏ | 7005/9770 [1:21:02<28:52, 1.60it/s]
72%|███████▏ | 7006/9770 [1:21:02<29:07, 1.58it/s]
72%|███████▏ | 7007/9770 [1:21:03<29:34, 1.56it/s]
72%|███████▏ | 7008/9770 [1:21:04<29:25, 1.56it/s]
72%|███████▏ | 7009/9770 [1:21:04<30:10, 1.52it/s]
72%|███████▏ | 7010/9770 [1:21:05<30:15, 1.52it/s]
72%|████�
+0: {'loss': 0.6839, 'grad_norm': 0.6204004173055316, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.72}
+0: �██▏ | 7010/9770 [1:21:05<30:15, 1.52it/s]
72%|███████▏ | 7011/9770 [1:21:06<30:05, 1.53it/s]
72%|███████▏ | 7012/9770 [1:21:06<29:57, 1.53it/s]
72%|███████▏ | 7013/9770 [1:21:07<30:05, 1.53it/s]
72%|███████▏ | 7014/9770 [1:21:08<29:58, 1.53it/s]
72%|███████▏ | 7015/9770 [1:21:08<30:43, 1.49it/s]
72%|███████▏ | 7016/9770 [1:21:09<30:19, 1.51it/s]
72%|███████▏ | 7017/9770 [1:21:10<29:59, 1.53it/s]
72%|███████▏ | 7018/9770 [1:21:10<30:02, 1.53it/s]
72%|███████▏ | 7019/9770 [1:21:11<30:06, 1.52it/s]
72%|███████▏ | 7020/9770 [1:21:12<30:10, 1.52it/s]
72%|███████▏ | 7020/9770 [1:21:12<30:10, 1.52it/s]
72%|███████▏ | 7021/9770 [1:21:12<30:10, 1.52it/s]
72%|███████▏ | 7022/9770 [1:21:13<30:13, 1.52it/s]
72%|█
+0: {'loss': 0.6519, 'grad_norm': 0.57989699012608, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.72}
+0: ██████▏ | 7023/9770 [1:21:14<30:00, 1.53it/s]
72%|███████��� | 7024/9770 [1:21:14<29:45, 1.54it/s]
72%|███████▏ | 7025/9770 [1:21:15<29:43, 1.54it/s]
72%|███████▏ | 7026/9770 [1:21:15<29:53, 1.53it/s]
72%|███████▏ | 7027/9770 [1:21:16<30:27, 1.50it/s]
72%|███████▏ | 7028/9770 [1:21:17<30:19, 1.51it/s]
72%|███████▏ | 7029/9770 [1:21:17<30:18, 1.51it/s]
72%|███████▏ | 7030/9770 [1:21:18<30:02, 1.52it/s]
72%|███████▏ | 7030/9770 [1:21:18<30:02, 1.52it/s]
72%|███████▏ | 7031/9770 [1:21:19<29:54, 1.53it/s]
72%|███████▏ | 7032/9770 [1:21:19<29:56, 1.52it/s]
72%|███████▏ | 7033/9770 [1:21:20<30:41, 1.49it/s]
72%|███████▏ | 7034/9770 [1:21:21<30:57, 1.47it/s]
72%|███████▏ | 7035/9770 [1:21:21<30:29, 1.50it/
+0: {'loss': 0.6633, 'grad_norm': 0.6232905363297797, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.72}
+0: s]
72%|███████▏ | 7036/9770 [1:21:22<30:26, 1.50it/s]
72%|███████▏ | 7037/9770 [1:21:23<30:06, 1.51it/s]
72%|███████▏ | 7038/9770 [1:21:23<29:57, 1.52it/s]
72%|███████▏ | 7039/9770 [1:21:24<30:16, 1.50it/s]
72%|███████▏ | 7040/9770 [1:21:25<30:44, 1.48it/s]
72%|███████▏ | 7040/9770 [1:21:25<30:44, 1.48it/s]
72%|███████▏ | 7041/9770 [1:21:25<30:32, 1.49it/s]
72%|███████▏ | 7042/9770 [1:21:26<30:29, 1.49it/s]
72%|███████▏ | 7043/9770 [1:21:27<30:09, 1.51it/s]
72%|███████▏ | 7044/9770 [1:21:27<29:55, 1.52it/s]
72%|███████▏ | 7045/9770 [1:21:28<29:34, 1.54it/s]
72%|███████▏ | 7046/9770 [1:21:29<30:01, 1.51it/s]
72%|███████▏ | 7047/9770 [1:21:29<29:47, 1.52it/s]
72%|███████▏ | 7048/9770 [1:21:30<29:2
+0: {'loss': 0.6448, 'grad_norm': 0.6003479512567722, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.72}
+0: 7, 1.54it/s]
72%|███████▏ | 7049/9770 [1:21:31<29:20, 1.55it/s]
72%|███████▏ | 7050/9770 [1:21:31<29:30, 1.54it/s]
72%|███████▏ | 7050/9770 [1:21:31<29:30, 1.54it/s]
72%|███████▏ | 7051/9770 [1:21:32<29:18, 1.55it/s]
72%|███████▏ | 7052/9770 [1:21:33<29:45, 1.52it/s]
72%|███████▏ | 7053/9770 [1:21:33<29:33, 1.53it/s]
72%|███████▏ | 7054/9770 [1:21:34<30:11, 1.50it/s]
72%|███████▏ | 7055/9770 [1:21:35<30:39, 1.48it/s]
72%|███████▏ | 7056/9770 [1:21:35<30:13, 1.50it/s]
72%|███████▏ | 7057/9770 [1:21:36<29:56, 1.51it/s]
72%|███████▏ | 7058/9770 [1:21:37<29:40, 1.52it/s]
72%|███████▏ | 7059/9770 [1:21:37<29:33, 1.53it/s]
72%|███████▏ | 7060/9770 [1:21:38<29:37, 1.52it/s]
+0: {'loss': 0.6594, 'grad_norm': 0.5866288400266059, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.72}
+0: {'loss': 0.6558, 'grad_norm': 0.6146188645465911, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.72}
+0:
72%|███████▏ | 7060/9770 [1:21:38<29:37, 1.52it/s]
72%|███████▏ | 7061/9770 [1:21:39<29:17, 1.54it/s]
72%|███████▏ | 7062/9770 [1:21:39<30:00, 1.50it/s]
72%|███████▏ | 7063/9770 [1:21:40<29:58, 1.51it/s]
72%|███████▏ | 7064/9770 [1:21:41<29:51, 1.51it/s]
72%|███████▏ | 7065/9770 [1:21:41<29:38, 1.52it/s]
72%|███████▏ | 7066/9770 [1:21:42<29:24, 1.53it/s]
72%|███████▏ | 7067/9770 [1:21:43<29:40, 1.52it/s]
72%|███████▏ | 7068/9770 [1:21:43<29:35, 1.52it/s]
72%|███████▏ | 7069/9770 [1:21:44<29:28, 1.53it/s]
72%|███████▏ | 7070/9770 [1:21:45<29:23, 1.53it/s]
72%|███████▏ | 7070/9770 [1:21:45<29:23, 1.53it/s]
72%|███████▏ | 7071/9770 [1:21:45<29:24, 1.53it/s]
72%|███████▏ | 7072/9770 [1:21:
+0: {'loss': 0.6406, 'grad_norm': 0.6223869616327192, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.72}
+0: 46<29:05, 1.55it/s]
72%|██���████▏ | 7073/9770 [1:21:46<29:35, 1.52it/s]
72%|███████▏ | 7074/9770 [1:21:47<29:40, 1.51it/s]
72%|███████▏ | 7075/9770 [1:21:48<29:28, 1.52it/s]
72%|███████▏ | 7076/9770 [1:21:48<29:27, 1.52it/s]
72%|███████▏ | 7077/9770 [1:21:49<29:23, 1.53it/s]
72%|███████▏ | 7078/9770 [1:21:50<29:26, 1.52it/s]
72%|███████▏ | 7079/9770 [1:21:50<28:59, 1.55it/s]
72%|███████▏ | 7080/9770 [1:21:51<29:07, 1.54it/s]
72%|███████▏ | 7080/9770 [1:21:51<29:07, 1.54it/s]
72%|███████▏ | 7081/9770 [1:21:52<28:56, 1.55it/s]
72%|███████▏ | 7082/9770 [1:21:52<29:04, 1.54it/s]
72%|███████▏ | 7083/9770 [1:21:53<29:13, 1.53it/s]
73%|███████▎ | 7084/9770 [1:21:54<29:14, 1.53it/s]
73%|███████▎ | 7085/
+0: {'loss': 0.6524, 'grad_norm': 0.6426328606997298, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.73}
+0: 9770 [1:21:54<29:19, 1.53it/s]
73%|███████▎ | 7086/9770 [1:21:55<29:09, 1.53it/s]
73%|███████▎ | 7087/9770 [1:21:56<29:08, 1.53it/s]
73%|███████▎ | 7088/9770 [1:21:56<29:03, 1.54it/s]
73%|███████▎ | 7089/9770 [1:21:57<29:10, 1.53it/s]
73%|███████▎ | 7090/9770 [1:21:58<28:59, 1.54it/s]
73%|███████▎ | 7090/9770 [1:21:58<28:59, 1.54it/s]
73%|███████▎ | 7091/9770 [1:21:58<29:02, 1.54it/s]
73%|███████▎ | 7092/9770 [1:21:59<28:40, 1.56it/s]
73%|███████▎ | 7093/9770 [1:21:59<28:53, 1.54it/s]
73%|███████▎ | 7094/9770 [1:22:00<29:28, 1.51it/s]
73%|███████▎ | 7095/9770 [1:22:01<29:03, 1.53it/s]
73%|███████▎ | 7096/9770 [1:22:01<29:04, 1.53it/s]
73%|███████▎ | 7097/9770 [1:22:02<29:02, 1.53it/s]
73%|███████�
+0: {'loss': 0.6771, 'grad_norm': 0.5918909773169833, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.73}
+0: �� | 7098/9770 [1:22:03<28:32, 1.56it/s]
73%|███████▎ | 7099/9770 [1:22:03<28:29, 1.56it/s]
73%|███████▎ | 7100/9770 [1:22:04<28:30, 1.56it/s]
73%|███████▎ | 7100/9770 [1:22:04<28:30, 1.56it/s]
73%|███████▎ | 7101/9770 [1:22:05<28:41, 1.55it/s]
73%|███████▎ | 7102/9770 [1:22:05<28:15, 1.57it/s]
73%|███████▎ | 7103/9770 [1:22:06<28:22, 1.57it/s]
73%|███████▎ | 7104/9770 [1:22:07<28:49, 1.54it/s]
73%|███████▎ | 7105/9770 [1:22:07<28:48, 1.54it/s]
73%|███████▎ | 7106/9770 [1:22:08<29:01, 1.53it/s]
73%|███████▎ | 7107/9770 [1:22:09<28:32, 1.56it/s]
73%|███████▎ | 7108/9770 [1:22:09<28:37, 1.55it/s]
73%|███████▎ | 7109/9770 [1:22:10<29:17, 1.51it/s]
73%|███████▎ | 7110/9770 [1:22:11<29:08, 1.52it/s]
+0: {'loss': 0.6498, 'grad_norm': 0.5815130158801066, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.73}
+0: {'loss': 0.6228, 'grad_norm': 0.5793914989442194, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.73}
+0:
73%|███████▎ | 7110/9770 [1:22:11<29:08, 1.52it/s]
73%|███████▎ | 7111/9770 [1:22:11<29:10, 1.52it/s]
73%|███████▎ | 7112/9770 [1:22:12<29:07, 1.52it/s]
73%|███████▎ | 7113/9770 [1:22:12<28:37, 1.55it/s]
73%|███████▎ | 7114/9770 [1:22:13<28:33, 1.55it/s]
73%|███████▎ | 7115/9770 [1:22:14<28:33, 1.55it/s]
73%|███████▎ | 7116/9770 [1:22:14<28:50, 1.53it/s]
73%|███████▎ | 7117/9770 [1:22:15<28:33, 1.55it/s]
73%|███████▎ | 7118/9770 [1:22:16<28:32, 1.55it/s]
73%|███████▎ | 7119/9770 [1:22:16<28:27, 1.55it/s]
73%|███████▎ | 7120/9770 [1:22:17<28:16, 1.56it/s]
73%|███████▎ | 7120/9770 [1:22:17<28:16, 1.56it/s]
73%|███████▎ | 7121/9770 [1:22:18<28:34, 1.54it/s]
73%|█████
+0: {'loss': 0.6562, 'grad_norm': 0.5924726055110466, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.73}
+0: ██▎ | 7122/9770 [1:22:18<28:28, 1.55it/s]
73%|███████▎ | 7123/9770 [1:22:19<28:10, 1.57it/s]
73%|███████▎ | 7124/9770 [1:22:20<28:48, 1.53it/s]
73%|███████▎ | 7125/9770 [1:22:20<28:33, 1.54it/s]
73%|███████▎ | 7126/9770 [1:22:21<28:31, 1.54it/s]
73%|███████▎ | 7127/9770 [1:22:22<28:40, 1.54it/s]
73%|███████▎ | 7128/9770 [1:22:22<28:33, 1.54it/s]
73%|███████▎ | 7129/9770 [1:22:23<29:19, 1.50it/s]
73%|███████▎ | 7130/9770 [1:22:24<28:51, 1.52it/s]
73%|███████▎ | 7130/9770 [1:22:24<28:51, 1.52it/s]
73%|███████▎ | 7131/9770 [1:22:24<28:46, 1.53it/s]
73%|███████▎ | 7132/9770 [1:22:25<28:39, 1.53it/s]
73%|███████▎ | 7133/9770 [1:22:25<28:27, 1.54it/s]
73%|███████▎ | 7134/9770 [1:22:26<28:35, 1.54it/s]
73%|█�
+0: {'loss': 0.6808, 'grad_norm': 0.5776311203110236, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.73}
+0: ��█████▎ | 7135/9770 [1:22:27<28:21, 1.55it/s]
73%|███████▎ | 7136/9770 [1:22:27<28:58, 1.51it/s]
73%|███████▎ | 7137/9770 [1:22:28<28:36, 1.53it/s]
73%|███████▎ | 7138/9770 [1:22:29<28:31, 1.54it/s]
73%|███████▎ | 7139/9770 [1:22:29<28:30, 1.54it/s]
73%|███████▎ | 7140/9770 [1:22:30<28:23, 1.54it/s]
73%|███████▎ | 7140/9770 [1:22:30<28:23, 1.54it/s]
73%|███████▎ | 7141/9770 [1:22:31<28:54, 1.52it/s]
73%|███████▎ | 7142/9770 [1:22:31<28:53, 1.52it/s]
73%|███████▎ | 7143/9770 [1:22:32<28:38, 1.53it/s]
73%|███████▎ | 7144/9770 [1:22:33<29:10, 1.50it/s]
73%|███████▎ | 7145/9770 [1:22:33<28:46, 1.52it/s]
73%|███████▎ | 7146/9770 [1:22:34<28:31, 1.53it/s]
73%|███████▎ | 7147/9770 [1:22:35<28:21, 1.54it/s
+0: {'loss': 0.6514, 'grad_norm': 0.6172256297917075, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.73}
+0: ]
73%|███████▎ | 7148/9770 [1:22:35<28:07, 1.55it/s]
73%|███████▎ | 7149/9770 [1:22:36<27:57, 1.56it/s]
73%|███████▎ | 7150/9770 [1:22:37<28:04, 1.56it/s]
73%|███████▎ | 7150/9770 [1:22:37<28:04, 1.56it/s]
73%|███████▎ | 7151/9770 [1:22:37<28:31, 1.53it/s]
73%|███████▎ | 7152/9770 [1:22:38<28:24, 1.54it/s]
73%|███████▎ | 7153/9770 [1:22:39<29:09, 1.50it/s]
73%|███████▎ | 7154/9770 [1:22:39<29:01, 1.50it/s]
73%|███████▎ | 7155/9770 [1:22:40<28:41, 1.52it/s]
73%|███████▎ | 7156/9770 [1:22:41<29:11, 1.49it/s]
73%|███████▎ | 7157/9770 [1:22:41<28:24, 1.53it/s]
73%|███████▎ | 7158/9770 [1:22:42<28:27, 1.53it/s]
73%|███████▎ | 7159/9770 [1:22:42<28:39, 1.52it/s]
73%|███████▎ | 7160/9770 [1:22:43<28:31
+0: {'loss': 0.6424, 'grad_norm': 0.5606820504968753, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.73}
+0: {'loss': 0.6537, 'grad_norm': 0.567872453373072, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.73}
+0: , 1.52it/s]
73%|███████▎ | 7160/9770 [1:22:43<28:31, 1.52it/s]
73%|███████▎ | 7161/9770 [1:22:44<28:28, 1.53it/s]
73%|███████▎ | 7162/9770 [1:22:44<28:40, 1.52it/s]
73%|███████▎ | 7163/9770 [1:22:45<28:04, 1.55it/s]
73%|███████▎ | 7164/9770 [1:22:46<28:08, 1.54it/s]
73%|███████▎ | 7165/9770 [1:22:46<28:03, 1.55it/s]
73%|███████▎ | 7166/9770 [1:22:47<28:18, 1.53it/s]
73%|███████▎ | 7167/9770 [1:22:48<28:16, 1.53it/s]
73%|███████▎ | 7168/9770 [1:22:48<28:36, 1.52it/s]
73%|███████▎ | 7169/9770 [1:22:49<28:24, 1.53it/s]
73%|███████▎ | 7170/9770 [1:22:50<28:23, 1.53it/s]
73%|███████▎ | 7170/9770 [1:22:50<28:23, 1.53it/s]
73%|███████▎ | 7171/9770 [1:22:50<28:35, 1
+0: {'loss': 0.6525, 'grad_norm': 0.6213660601265412, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.73}
+0: .51it/s]
73%|███████▎ | 7172/9770 [1:22:51<28:13, 1.53it/s]
73%|███████▎ | 7173/9770 [1:22:52<27:51, 1.55it/s]
73%|███████▎ | 7174/9770 [1:22:52<27:48, 1.56it/s]
73%|███████▎ | 7175/9770 [1:22:53<27:52, 1.55it/s]
73%|███████▎ | 7176/9770 [1:22:53<27:27, 1.57it/s]
73%|███████▎ | 7177/9770 [1:22:54<27:50, 1.55it/s]
73%|███████▎ | 7178/9770 [1:22:55<27:50, 1.55it/s]
73%|███████▎ | 7179/9770 [1:22:55<28:01, 1.54it/s]
73%|███████▎ | 7180/9770 [1:22:56<28:50, 1.50it/s]
73%|███████▎ | 7180/9770 [1:22:56<28:50, 1.50it/s]
74%|███████▎ | 7181/9770 [1:22:57<28:25, 1.52it/s]
74%|███████▎ | 7182/9770 [1:22:58<29:04, 1.48it/s]
74%|███████▎ | 7183/9770 [1:22:58<28:47, 1.50it/s]
74%|███████▎ | 7184/9770 [1:22:5
+0: {'loss': 0.6557, 'grad_norm': 0.5873302338932653, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.74}
+0: 9<28:32, 1.51it/s]
74%|███████▎ | 7185/9770 [1:22:59<28:18, 1.52it/s]
74%|███████▎ | 7186/9770 [1:23:00<28:08, 1.53it/s]
74%|███████▎ | 7187/9770 [1:23:01<28:38, 1.50it/s]
74%|███████▎ | 7188/9770 [1:23:01<28:46, 1.50it/s]
74%|███████▎ | 7189/9770 [1:23:02<28:23, 1.51it/s]
74%|███████▎ | 7190/9770 [1:23:03<28:04, 1.53it/s]
74%|███████▎ | 7190/9770 [1:23:03<28:04, 1.53it/s]
74%|███████▎ | 7191/9770 [1:23:03<28:03, 1.53it/s]
74%|███████▎ | 7192/9770 [1:23:04<28:02, 1.53it/s]
74%|███████▎ | 7193/9770 [1:23:05<27:51, 1.54it/s]
74%|███████▎ | 7194/9770 [1:23:05<27:37, 1.55it/s]
74%|███████▎ | 7195/9770 [1:23:06<27:41, 1.55it/s]
74%|███████▎ | 7196/9770 [1:23:07<28:02, 1.53it/s]
74%|███████▎ | 7197/9
+0: {'loss': 0.6389, 'grad_norm': 0.6106008594952007, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.74}
+0: 770 [1:23:07<28:02, 1.53it/s]
74%|███████▎ | 7198/9770 [1:23:08<28:05, 1.53it/s]
74%|███████▎ | 7199/9770 [1:23:09<28:01, 1.53it/s]
74%|███████▎ | 7200/9770 [1:23:09<28:01, 1.53it/s]
74%|███████▎ | 7200/9770 [1:23:09<28:01, 1.53it/s]
74%|███████▎ | 7201/9770 [1:23:10<27:59, 1.53it/s]
74%|███████▎ | 7202/9770 [1:23:11<27:28, 1.56it/s]
74%|███████▎ | 7203/9770 [1:23:11<27:42, 1.54it/s]
74%|███████▎ | 7204/9770 [1:23:12<27:42, 1.54it/s]
74%|███████▎ | 7205/9770 [1:23:12<27:27, 1.56it/s]
74%|███████▍ | 7206/9770 [1:23:13<27:19, 1.56it/s]
74%|███████▍ | 7207/9770 [1:23:14<27:32, 1.55it/s]
74%|███████▍ | 7208/9770 [1:23:14<27:37, 1.55it/s]
74%|███████▍ | 7209/9770 [1:23:15<27:29, 1.55it/s]
74%|███████�
+0: {'loss': 0.6463, 'grad_norm': 0.6182190792551714, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.74}
+0: {'loss': 0.6667, 'grad_norm': 0.5927060992796735, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.74}
+0: � | 7210/9770 [1:23:16<27:27, 1.55it/s]
74%|███████▍ | 7210/9770 [1:23:16<27:27, 1.55it/s]
74%|███████▍ | 7211/9770 [1:23:16<27:48, 1.53it/s]
74%|███████▍ | 7212/9770 [1:23:17<28:05, 1.52it/s]
74%|███████▍ | 7213/9770 [1:23:18<28:15, 1.51it/s]
74%|███████▍ | 7214/9770 [1:23:18<27:59, 1.52it/s]
74%|███████▍ | 7215/9770 [1:23:19<27:56, 1.52it/s]
74%|███████▍ | 7216/9770 [1:23:20<27:46, 1.53it/s]
74%|███████▍ | 7217/9770 [1:23:20<27:57, 1.52it/s]
74%|███████▍ | 7218/9770 [1:23:21<27:54, 1.52it/s]
74%|███████▍ | 7219/9770 [1:23:22<28:21, 1.50it/s]
74%|███████▍ | 7220/9770 [1:23:22<28:13, 1.51it/s]
74%|███████▍ | 7220/9770 [1:23:22<28:13, 1.51it/s]
74%|███████▍ |
+0: {'loss': 0.6618, 'grad_norm': 0.6413888712498511, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.74}
+0: 7221/9770 [1:23:23<28:02, 1.52it/s]
74%|███████▍ | 7222/9770 [1:23:24<28:08, 1.51it/s]
74%|███████▍ | 7223/9770 [1:23:24<27:55, 1.52it/s]
74%|███████▍ | 7224/9770 [1:23:25<27:33, 1.54it/s]
74%|███████▍ | 7225/9770 [1:23:26<27:37, 1.54it/s]
74%|███████▍ | 7226/9770 [1:23:26<27:16, 1.55it/s]
74%|███████▍ | 7227/9770 [1:23:27<27:25, 1.55it/s]
74%|███████▍ | 7228/9770 [1:23:28<28:10, 1.50it/s]
74%|███████▍ | 7229/9770 [1:23:28<28:03, 1.51it/s]
74%|███████▍ | 7230/9770 [1:23:29<28:33, 1.48it/s]
74%|███████▍ | 7230/9770 [1:23:29<28:33, 1.48it/s]
74%|███████▍ | 7231/9770 [1:23:30<28:16, 1.50it/s]
74%|███████▍ | 7232/9770 [1:23:30<28:07, 1.50it/s]
74%|███████▍ | 7233/9770 [1:23:31<27:42, 1.53it/s]
74%|█████�
+0: {'loss': 0.6494, 'grad_norm': 0.6053832444121335, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.74}
+0: ��█▍ | 7234/9770 [1:23:32<27:32, 1.53it/s]
74%|███████▍ | 7235/9770 [1:23:32<27:54, 1.51it/s]
74%|███████▍ | 7236/9770 [1:23:33<27:40, 1.53it/s]
74%|███████▍ | 7237/9770 [1:23:34<27:48, 1.52it/s]
74%|███████▍ | 7238/9770 [1:23:34<27:20, 1.54it/s]
74%|███████▍ | 7239/9770 [1:23:35<26:59, 1.56it/s]
74%|███████▍ | 7240/9770 [1:23:35<26:58, 1.56it/s]
74%|███████▍ | 7240/9770 [1:23:35<26:58, 1.56it/s]
74%|███████▍ | 7241/9770 [1:23:36<27:14, 1.55it/s]
74%|███████▍ | 7242/9770 [1:23:37<27:57, 1.51it/s]
74%|███████▍ | 7243/9770 [1:23:37<27:50, 1.51it/s]
74%|███████▍ | 7244/9770 [1:23:38<27:31, 1.53it/s]
74%|███████▍ | 7245/9770 [1:23:39<27:31, 1.53it/s]
74%|███████▍ | 7246/9770 [1:23:39<27:22, 1.54it/s]
74%|█�
+0: {'loss': 0.6451, 'grad_norm': 0.5643537416975682, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.74}
+0: �█████▍ | 7247/9770 [1:23:40<27:29, 1.53it/s]
74%|███████▍ | 7248/9770 [1:23:41<28:08, 1.49it/s]
74%|███████▍ | 7249/9770 [1:23:41<27:45, 1.51it/s]
74%|███████▍ | 7250/9770 [1:23:42<27:44, 1.51it/s]
74%|███████▍ | 7250/9770 [1:23:42<27:44, 1.51it/s]
74%|███████▍ | 7251/9770 [1:23:43<27:15, 1.54it/s]
74%|███████▍ | 7252/9770 [1:23:43<27:09, 1.54it/s]
74%|███████▍ | 7253/9770 [1:23:44<27:21, 1.53it/s]
74%|███████▍ | 7254/9770 [1:23:45<27:29, 1.52it/s]
74%|███████▍ | 7255/9770 [1:23:45<27:51, 1.50it/s]
74%|███████▍ | 7256/9770 [1:23:46<27:23, 1.53it/s]
74%|███████▍ | 7257/9770 [1:23:47<27:11, 1.54it/s]
74%|███████▍ | 7258/9770 [1:23:47<27:51, 1.50it/s]
74%|███████▍ | 7259/9770 [1:23:48<27:33, 1.52it/s]
+0: {'loss': 0.6534, 'grad_norm': 0.5841152597879908, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.74}
+0: {'loss': 0.6422, 'grad_norm': 0.5954131460693213, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.74}
+0:
74%|███████▍ | 7260/9770 [1:23:49<27:04, 1.55it/s]
74%|███████▍ | 7260/9770 [1:23:49<27:04, 1.55it/s]
74%|███████▍ | 7261/9770 [1:23:49<27:17, 1.53it/s]
74%|███████▍ | 7262/9770 [1:23:50<27:02, 1.55it/s]
74%|███████▍ | 7263/9770 [1:23:50<27:08, 1.54it/s]
74%|███████▍ | 7264/9770 [1:23:51<27:09, 1.54it/s]
74%|███████▍ | 7265/9770 [1:23:52<27:00, 1.55it/s]
74%|███████▍ | 7266/9770 [1:23:52<26:59, 1.55it/s]
74%|███████▍ | 7267/9770 [1:23:53<26:56, 1.55it/s]
74%|███████▍ | 7268/9770 [1:23:54<27:03, 1.54it/s]
74%|███████▍ | 7269/9770 [1:23:54<26:55, 1.55it/s]
74%|███████▍ | 7270/9770 [1:23:55<26:53, 1.55it/s]
74%|███████▍ | 7270/9770 [1:23:55<26:53, 1.55it/s]
74
+0: {'loss': 0.6448, 'grad_norm': 0.5946845944286797, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.74}
+0: %|███████▍ | 7271/9770 [1:23:56<26:48, 1.55it/s]
74%|███████▍ | 7272/9770 [1:23:56<26:33, 1.57it/s]
74%|███████▍ | 7273/9770 [1:23:57<26:46, 1.55it/s]
74%|███████▍ | 7274/9770 [1:23:58<26:35, 1.56it/s]
74%|███████▍ | 7275/9770 [1:23:58<26:53, 1.55it/s]
74%|███████▍ | 7276/9770 [1:23:59<26:46, 1.55it/s]
74%|███████▍ | 7277/9770 [1:24:00<26:50, 1.55it/s]
74%|███████▍ | 7278/9770 [1:24:00<26:29, 1.57it/s]
75%|███████▍ | 7279/9770 [1:24:01<26:32, 1.56it/s]
75%|███████▍ | 7280/9770 [1:24:01<26:39, 1.56it/s]
75%|███████▍ | 7280/9770 [1:24:01<26:39, 1.56it/s]
75%|███████▍ | 7281/9770 [1:24:02<26:52, 1.54it/s]
75%|███████▍ | 7282/9770 [1:24:03<26:51, 1.54it/s]
75%|███████▍ | 7283/9770 [1:24:03<26:51, 1.
+0: {'loss': 0.6593, 'grad_norm': 0.5843315131096042, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.75}
+0: 54it/s]
75%|███████▍ | 7284/9770 [1:24:04<26:54, 1.54it/s]
75%|███████▍ | 7285/9770 [1:24:05<27:00, 1.53it/s]
75%|███████▍ | 7286/9770 [1:24:05<26:50, 1.54it/s]
75%|███████▍ | 7287/9770 [1:24:06<26:54, 1.54it/s]
75%|███████▍ | 7288/9770 [1:24:07<27:05, 1.53it/s]
75%|███████▍ | 7289/9770 [1:24:07<26:46, 1.54it/s]
75%|███████▍ | 7290/9770 [1:24:08<26:44, 1.55it/s]
75%|███████▍ | 7290/9770 [1:24:08<26:44, 1.55it/s]
75%|███████▍ | 7291/9770 [1:24:09<26:58, 1.53it/s]
75%|███████▍ | 7292/9770 [1:24:09<27:04, 1.53it/s]
75%|███████▍ | 7293/9770 [1:24:10<27:05, 1.52it/s]
75%|███████▍ | 7294/9770 [1:24:11<26:45, 1.54it/s]
75%|███████▍ | 7295/9770 [1:24:11<27:09, 1.52it/s]
75%|███████▍ | 7296/9770 [1:24:12
+0: {'loss': 0.6495, 'grad_norm': 0.6262861925557471, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.75}
+0: <27:14, 1.51it/s]
75%|███████▍ | 7297/9770 [1:24:13<27:00, 1.53it/s]
75%|███████▍ | 7298/9770 [1:24:13<27:04, 1.52it/s]
75%|███████▍ | 7299/9770 [1:24:14<26:35, 1.55it/s]
75%|███████▍ | 7300/9770 [1:24:14<26:44, 1.54it/s]
75%|███████▍ | 7300/9770 [1:24:14<26:44, 1.54it/s]
75%|███████▍ | 7301/9770 [1:24:15<27:06, 1.52it/s]
75%|███████▍ | 7302/9770 [1:24:16<26:58, 1.53it/s]
75%|███████▍ | 7303/9770 [1:24:16<26:35, 1.55it/s]
75%|███████▍ | 7304/9770 [1:24:17<26:20, 1.56it/s]
75%|███████▍ | 7305/9770 [1:24:18<26:17, 1.56it/s]
75%|███████▍ | 7306/9770 [1:24:18<26:21, 1.56it/s]
75%|███████▍ | 7307/9770 [1:24:19<26:24, 1.55it/s]
75%|███████▍ | 7308/9770 [1:24:20<26:39, 1.54it/s]
75%|███████▍ | 7309/97
+0: {'loss': 0.6508, 'grad_norm': 0.6229127142839981, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.75}
+0: {'loss': 0.6659, 'grad_norm': 0.5912851874001394, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.75}
+0: 70 [1:24:20<26:48, 1.53it/s]
75%|███████▍ | 7310/9770 [1:24:21<26:42, 1.54it/s]
75%|███████▍ | 7310/9770 [1:24:21<26:42, 1.54it/s]
75%|███████▍ | 7311/9770 [1:24:22<26:41, 1.54it/s]
75%|███████▍ | 7312/9770 [1:24:22<26:44, 1.53it/s]
75%|███████▍ | 7313/9770 [1:24:23<26:34, 1.54it/s]
75%|███████▍ | 7314/9770 [1:24:24<26:45, 1.53it/s]
75%|███████▍ | 7315/9770 [1:24:24<26:48, 1.53it/s]
75%|███████▍ | 7316/9770 [1:24:25<26:37, 1.54it/s]
75%|███████▍ | 7317/9770 [1:24:26<26:31, 1.54it/s]
75%|███████▍ | 7318/9770 [1:24:26<26:43, 1.53it/s]
75%|███████▍ | 7319/9770 [1:24:27<26:44, 1.53it/s]
75%|███████▍ | 7320/9770 [1:24:28<27:08, 1.50it/s]
75%|███���███▍ | 7320/9770 [
+0: {'loss': 0.6685, 'grad_norm': 0.6169956620319743, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.75}
+0: 1:24:28<27:08, 1.50it/s]
75%|███████▍ | 7321/9770 [1:24:28<27:17, 1.50it/s]
75%|███████▍ | 7322/9770 [1:24:29<27:19, 1.49it/s]
75%|███████▍ | 7323/9770 [1:24:30<27:05, 1.51it/s]
75%|███████▍ | 7324/9770 [1:24:30<26:38, 1.53it/s]
75%|███████▍ | 7325/9770 [1:24:31<26:20, 1.55it/s]
75%|███████▍ | 7326/9770 [1:24:31<26:16, 1.55it/s]
75%|███████▍ | 7327/9770 [1:24:32<26:18, 1.55it/s]
75%|███████▌ | 7328/9770 [1:24:33<26:12, 1.55it/s]
75%|███████▌ | 7329/9770 [1:24:33<26:18, 1.55it/s]
75%|███████▌ | 7330/9770 [1:24:34<26:33, 1.53it/s]
75%|███████▌ | 7330/9770 [1:24:34<26:33, 1.53it/s]
75%|███████▌ | 7331/9770 [1:24:35<26:21, 1.54it/s]
75%|███████▌ | 7332/9770 [1:24:35<26:16, 1.55it/s]
75%|███████▌ |
+0: {'loss': 0.6591, 'grad_norm': 0.6208719769887372, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.75}
+0: 7333/9770 [1:24:36<26:19, 1.54it/s]
75%|███████▌ | 7334/9770 [1:24:37<26:27, 1.53it/s]
75%|███████▌ | 7335/9770 [1:24:37<26:46, 1.52it/s]
75%|███████▌ | 7336/9770 [1:24:38<27:10, 1.49it/s]
75%|███████▌ | 7337/9770 [1:24:39<27:02, 1.50it/s]
75%|███████▌ | 7338/9770 [1:24:39<26:48, 1.51it/s]
75%|███████▌ | 7339/9770 [1:24:40<26:36, 1.52it/s]
75%|███████▌ | 7340/9770 [1:24:41<26:52, 1.51it/s]
75%|███████▌ | 7340/9770 [1:24:41<26:52, 1.51it/s]
75%|███████▌ | 7341/9770 [1:24:41<26:36, 1.52it/s]
75%|███████▌ | 7342/9770 [1:24:42<26:43, 1.51it/s]
75%|███████▌ | 7343/9770 [1:24:43<26:44, 1.51it/s]
75%|███████▌ | 7344/9770 [1:24:43<26:40, 1.52it/s]
75%|███████▌ | 7345/9770 [1:24:44<26:22, 1.53it/s]
75%|█████�
+0: {'loss': 0.6643, 'grad_norm': 0.5981268168191592, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.75}
+0: �█▌ | 7346/9770 [1:24:45<26:51, 1.50it/s]
75%|███████▌ | 7347/9770 [1:24:45<26:39, 1.52it/s]
75%|███████▌ | 7348/9770 [1:24:46<26:26, 1.53it/s]
75%|███████▌ | 7349/9770 [1:24:47<27:01, 1.49it/s]
75%|███████▌ | 7350/9770 [1:24:47<26:54, 1.50it/s]
75%|███████▌ | 7350/9770 [1:24:47<26:54, 1.50it/s]
75%|███████▌ | 7351/9770 [1:24:48<26:41, 1.51it/s]
75%|███████▌ | 7352/9770 [1:24:49<26:32, 1.52it/s]
75%|███████▌ | 7353/9770 [1:24:49<26:28, 1.52it/s]
75%|███████▌ | 7354/9770 [1:24:50<26:47, 1.50it/s]
75%|███████▌ | 7355/9770 [1:24:51<26:26, 1.52it/s]
75%|███████▌ | 7356/9770 [1:24:51<26:19, 1.53it/s]
75%|███████▌ | 7357/9770 [1:24:52<26:00, 1.55it/s]
75%|███████▌ | 7358/9770 [1:24:52<26:24, 1.52it/s]
75%|██
+0: {'loss': 0.6468, 'grad_norm': 0.6032178405209255, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.75}
+0: {'loss': 0.6546, 'grad_norm': 0.623968033296279, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.75}
+0: █████▌ | 7359/9770 [1:24:53<26:11, 1.53it/s]
75%|███████▌ | 7360/9770 [1:24:54<26:42, 1.50it/s]
75%|███████▌ | 7360/9770 [1:24:54<26:42, 1.50it/s]
75%|███████▌ | 7361/9770 [1:24:54<26:30, 1.51it/s]
75%|███████▌ | 7362/9770 [1:24:55<26:13, 1.53it/s]
75%|███████▌ | 7363/9770 [1:24:56<26:30, 1.51it/s]
75%|███████▌ | 7364/9770 [1:24:56<26:56, 1.49it/s]
75%|███████▌ | 7365/9770 [1:24:57<26:39, 1.50it/s]
75%|███████▌ | 7366/9770 [1:24:58<26:31, 1.51it/s]
75%|███████▌ | 7367/9770 [1:24:58<26:22, 1.52it/s]
75%|███████▌ | 7368/9770 [1:24:59<26:53, 1.49it/s]
75%|███████▌ | 7369/9770 [1:25:00<26:40, 1.50it/s]
75%|███████▌ | 7370/9770 [1:25:00<26:29, 1.51it/s]
75%|███�
+0: {'loss': 0.6553, 'grad_norm': 0.5745720584742292, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.75}
+0: ��███▌ | 7370/9770 [1:25:00<26:29, 1.51it/s]
75%|███████▌ | 7371/9770 [1:25:01<26:33, 1.51it/s]
75%|███████▌ | 7372/9770 [1:25:02<26:26, 1.51it/s]
75%|███████▌ | 7373/9770 [1:25:02<26:33, 1.50it/s]
75%|███████▌ | 7374/9770 [1:25:03<26:40, 1.50it/s]
75%|███████▌ | 7375/9770 [1:25:04<26:24, 1.51it/s]
75%|███████▌ | 7376/9770 [1:25:04<26:21, 1.51it/s]
76%|███████▌ | 7377/9770 [1:25:05<26:51, 1.48it/s]
76%|███████▌ | 7378/9770 [1:25:06<26:22, 1.51it/s]
76%|███████▌ | 7379/9770 [1:25:06<26:18, 1.52it/s]
76%|███████▌ | 7380/9770 [1:25:07<26:10, 1.52it/s]
76%|███████▌ | 7380/9770 [1:25:07<26:10, 1.52it/s]
76%|███████▌ | 7381/9770 [1:25:08<26:04, 1.53it/s]
76%|███████▌ | 7382/9770 [1:25:08<26:19, 1.51it/s]
76%
+0: {'loss': 0.6574, 'grad_norm': 0.6353691229960085, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.76}
+0: |███████▌ | 7383/9770 [1:25:09<26:01, 1.53it/s]
76%|███████▌ | 7384/9770 [1:25:10<26:32, 1.50it/s]
76%|███████▌ | 7385/9770 [1:25:10<26:28, 1.50it/s]
76%|███████▌ | 7386/9770 [1:25:11<26:20, 1.51it/s]
76%|███████▌ | 7387/9770 [1:25:12<26:47, 1.48it/s]
76%|███████▌ | 7388/9770 [1:25:12<26:21, 1.51it/s]
76%|███████▌ | 7389/9770 [1:25:13<25:57, 1.53it/s]
76%|███████▌ | 7390/9770 [1:25:14<26:01, 1.52it/s]
76%|███████▌ | 7390/9770 [1:25:14<26:01, 1.52it/s]
76%|███████▌ | 7391/9770 [1:25:14<26:09, 1.52it/s]
76%|███████▌ | 7392/9770 [1:25:15<25:56, 1.53it/s]
76%|███████▌ | 7393/9770 [1:25:16<25:46, 1.54it/s]
76%|███████▌ | 7394/9770 [1:25:16<25:52, 1.53it/s]
76%|███████▌ | 7395/9770 [1:25:17<25:36, 1.5
+0: {'loss': 0.6433, 'grad_norm': 0.5987619216176655, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.76}
+0: 5it/s]
76%|███████▌ | 7396/9770 [1:25:18<25:48, 1.53it/s]
76%|███████▌ | 7397/9770 [1:25:18<25:37, 1.54it/s]
76%|███████▌ | 7398/9770 [1:25:19<25:41, 1.54it/s]
76%|███████▌ | 7399/9770 [1:25:20<26:04, 1.52it/s]
76%|███████▌ | 7400/9770 [1:25:20<25:41, 1.54it/s]
76%|███████▌ | 7400/9770 [1:25:20<25:41, 1.54it/s]
76%|███████▌ | 7401/9770 [1:25:21<25:44, 1.53it/s]
76%|███████▌ | 7402/9770 [1:25:21<25:31, 1.55it/s]
76%|███████▌ | 7403/9770 [1:25:22<25:31, 1.55it/s]
76%|███████▌ | 7404/9770 [1:25:23<25:38, 1.54it/s]
76%|███████▌ | 7405/9770 [1:25:23<25:31, 1.54it/s]
76%|███████▌ | 7406/9770 [1:25:24<25:32, 1.54it/s]
76%|███████▌ | 7407/9770 [1:25:25<25:27, 1.55it/s]
76%|███████▌ | 7408/9770 [1:25:25<
+0: {'loss': 0.657, 'grad_norm': 0.6000013480638627, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.76}
+0: 25:32, 1.54it/s]
76%|███████▌ | 7409/9770 [1:25:26<25:21, 1.55it/s]
76%|███████▌ | 7410/9770 [1:25:27<25:06, 1.57it/s]
76%|███████▌ | 7410/9770 [1:25:27<25:06, 1.57it/s]
76%|███████▌ | 7411/9770 [1:25:27<25:22, 1.55it/s]
76%|███████▌ | 7412/9770 [1:25:28<25:17, 1.55it/s]
76%|███████▌ | 7413/9770 [1:25:29<25:41, 1.53it/s]
76%|███████▌ | 7414/9770 [1:25:29<26:05, 1.51it/s]
76%|███████▌ | 7415/9770 [1:25:30<26:08, 1.50it/s]
76%|███████▌ | 7416/9770 [1:25:31<25:51, 1.52it/s]
76%|███████▌ | 7417/9770 [1:25:31<26:19, 1.49it/s]
76%|███████▌ | 7418/9770 [1:25:32<26:28, 1.48it/s]
76%|███████▌ | 7419/9770 [1:25:33<25:58, 1.51it/s]
76%|███████▌ | 7420/9770 [1:25:33<26:02, 1.50it/s]
+0: {'loss': 0.6577, 'grad_norm': 0.5889138649444225, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.76}
+0: {'loss': 0.66, 'grad_norm': 0.6042249110869242, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.76}
+0:
76%|███████▌ | 7420/9770 [1:25:33<26:02, 1.50it/s]
76%|███████▌ | 7421/9770 [1:25:34<25:57, 1.51it/s]
76%|███████▌ | 7422/9770 [1:25:35<25:49, 1.52it/s]
76%|███████▌ | 7423/9770 [1:25:35<26:02, 1.50it/s]
76%|███████▌ | 7424/9770 [1:25:36<25:45, 1.52it/s]
76%|███████▌ | 7425/9770 [1:25:37<25:31, 1.53it/s]
76%|███████▌ | 7426/9770 [1:25:37<25:37, 1.52it/s]
76%|███████▌ | 7427/9770 [1:25:38<25:37, 1.52it/s]
76%|███████▌ | 7428/9770 [1:25:39<25:29, 1.53it/s]
76%|███████▌ | 7429/9770 [1:25:39<25:11, 1.55it/s]
76%|███████▌ | 7430/9770 [1:25:40<25:05, 1.55it/s]
76%|███████▌ | 7430/9770 [1:25:40<25:05, 1.55it/s]
76%|███████▌ | 7431/9770 [1:25:40<25:42, 1.52it/s]
76%|███████▌ | 7432/9770 [1
+0: {'loss': 0.6301, 'grad_norm': 0.5930566989166397, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.76}
+0: :25:41<25:34, 1.52it/s]
76%|███████▌ | 7433/9770 [1:25:42<25:47, 1.51it/s]
76%|███████▌ | 7434/9770 [1:25:42<25:41, 1.52it/s]
76%|███████▌ | 7435/9770 [1:25:43<26:16, 1.48it/s]
76%|███████▌ | 7436/9770 [1:25:44<25:51, 1.50it/s]
76%|███████▌ | 7437/9770 [1:25:45<26:12, 1.48it/s]
76%|███████▌ | 7438/9770 [1:25:45<25:42, 1.51it/s]
76%|███████▌ | 7439/9770 [1:25:46<25:40, 1.51it/s]
76%|███████▌ | 7440/9770 [1:25:46<25:33, 1.52it/s]
76%|███████▌ | 7440/9770 [1:25:46<25:33, 1.52it/s]
76%|███████▌ | 7441/9770 [1:25:47<25:28, 1.52it/s]
76%|███████▌ | 7442/9770 [1:25:48<25:24, 1.53it/s]
76%|███████▌ | 7443/9770 [1:25:48<25:54, 1.50it/s]
76%|███████▌ | 7444/9770 [1:25:49<25:45, 1.50it/s]
76%|███████▌ | 7
+0: {'loss': 0.644, 'grad_norm': 0.5943052392252575, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.76}
+0: 445/9770 [1:25:50<25:50, 1.50it/s]
76%|███████▌ | 7446/9770 [1:25:50<26:00, 1.49it/s]
76%|███████▌ | 7447/9770 [1:25:51<25:44, 1.50it/s]
76%|███████▌ | 7448/9770 [1:25:52<25:56, 1.49it/s]
76%|███████▌ | 7449/9770 [1:25:52<26:05, 1.48it/s]
76%|███████▋ | 7450/9770 [1:25:53<25:54, 1.49it/s]
76%|███████▋ | 7450/9770 [1:25:53<25:54, 1.49it/s]
76%|███████▋ | 7451/9770 [1:25:54<25:30, 1.52it/s]
76%|███████▋ | 7452/9770 [1:25:54<25:23, 1.52it/s]
76%|███████▋ | 7453/9770 [1:25:55<25:24, 1.52it/s]
76%|███████▋ | 7454/9770 [1:25:56<25:30, 1.51it/s]
76%|███████▋ | 7455/9770 [1:25:56<25:32, 1.51it/s]
76%|███████▋ | 7456/9770 [1:25:57<25:53, 1.49it/s]
76%|███████▋ | 7457/9770 [1:25:58<25:35, 1.51it/s]
76%|██████
+0: {'loss': 0.6622, 'grad_norm': 0.589574033988617, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.76}
+0: █▋ | 7458/9770 [1:25:58<25:23, 1.52it/s]
76%|███████▋ | 7459/9770 [1:25:59<25:21, 1.52it/s]
76%|███████▋ | 7460/9770 [1:26:00<25:13, 1.53it/s]
76%|███████▋ | 7460/9770 [1:26:00<25:13, 1.53it/s]
76%|███████▋ | 7461/9770 [1:26:00<25:13, 1.53it/s]
76%|███████▋ | 7462/9770 [1:26:01<25:08, 1.53it/s]
76%|███████▋ | 7463/9770 [1:26:02<25:01, 1.54it/s]
76%|███████▋ | 7464/9770 [1:26:02<24:48, 1.55it/s]
76%|███████▋ | 7465/9770 [1:26:03<25:11, 1.52it/s]
76%|███████▋ | 7466/9770 [1:26:04<24:58, 1.54it/s]
76%|███████▋ | 7467/9770 [1:26:04<24:45, 1.55it/s]
76%|███████▋ | 7468/9770 [1:26:05<24:32, 1.56it/s]
76%|███████▋ | 7469/9770 [1:26:06<24:57, 1.54it/s]
76%|███████▋ | 7470/9770 [1:26:06<24:55, 1.54it/s]
+0: {'loss': 0.6661, 'grad_norm': 0.6078080519022193, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.76}
+0: {'loss': 0.6717, 'grad_norm': 0.6513111552467029, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.77}
+0:
76%|███████▋ | 7470/9770 [1:26:06<24:55, 1.54it/s]
76%|███████▋ | 7471/9770 [1:26:07<25:02, 1.53it/s]
76%|███████▋ | 7472/9770 [1:26:08<25:08, 1.52it/s]
76%|███████▋ | 7473/9770 [1:26:08<24:56, 1.54it/s]
76%|███████▋ | 7474/9770 [1:26:09<24:59, 1.53it/s]
77%|███████▋ | 7475/9770 [1:26:10<25:09, 1.52it/s]
77%|███████▋ | 7476/9770 [1:26:10<25:07, 1.52it/s]
77%|███████▋ | 7477/9770 [1:26:11<24:57, 1.53it/s]
77%|███████▋ | 7478/9770 [1:26:12<25:30, 1.50it/s]
77%|███████▋ | 7479/9770 [1:26:12<25:10, 1.52it/s]
77%|███████▋ | 7480/9770 [1:26:13<24:51, 1.54it/s]
77%|███████▋ | 7480/9770 [1:26:13<24:51, 1.54it/s]
77%|███████▋ | 7481/9770 [1:26:13<24:32, 1.55it/s]
77%|███�
+0: {'loss': 0.6414, 'grad_norm': 0.5754265651193947, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.77}
+0: �███▋ | 7482/9770 [1:26:14<24:13, 1.57it/s]
77%|███████▋ | 7483/9770 [1:26:15<24:27, 1.56it/s]
77%|███████▋ | 7484/9770 [1:26:15<24:38, 1.55it/s]
77%|███████▋ | 7485/9770 [1:26:16<24:54, 1.53it/s]
77%|███████▋ | 7486/9770 [1:26:17<24:39, 1.54it/s]
77%|███████▋ | 7487/9770 [1:26:17<24:45, 1.54it/s]
77%|███████▋ | 7488/9770 [1:26:18<24:31, 1.55it/s]
77%|███████▋ | 7489/9770 [1:26:19<24:57, 1.52it/s]
77%|███████▋ | 7490/9770 [1:26:19<24:37, 1.54it/s]
77%|███████▋ | 7490/9770 [1:26:19<24:37, 1.54it/s]
77%|███████▋ | 7491/9770 [1:26:20<24:42, 1.54it/s]
77%|███████▋ | 7492/9770 [1:26:21<24:59, 1.52it/s]
77%|███████▋ | 7493/9770 [1:26:21<24:54, 1.52it/s]
77%|███████▋ | 7494/9770 [1:26:22<24:43, 1.53it/s]
77%|
+0: {'loss': 0.6404, 'grad_norm': 0.6131656563321725, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.77}
+0: ███████▋ | 7495/9770 [1:26:23<24:43, 1.53it/s]
77%|███████▋ | 7496/9770 [1:26:23<24:31, 1.55it/s]
77%|███████▋ | 7497/9770 [1:26:24<24:39, 1.54it/s]
77%|███████▋ | 7498/9770 [1:26:24<24:44, 1.53it/s]
77%|███████▋ | 7499/9770 [1:26:25<24:26, 1.55it/s]
77%|███████▋ | 7500/9770 [1:26:26<24:27, 1.55it/s]
77%|███████▋ | 7500/9770 [1:26:26<24:27, 1.55it/s]
77%|███████▋ | 7501/9770 [1:26:26<24:24, 1.55it/s]
77%|███████▋ | 7502/9770 [1:26:27<24:14, 1.56it/s]
77%|███████▋ | 7503/9770 [1:26:28<24:15, 1.56it/s]
77%|███████▋ | 7504/9770 [1:26:28<24:30, 1.54it/s]
77%|███████▋ | 7505/9770 [1:26:29<24:41, 1.53it/s]
77%|███████▋ | 7506/9770 [1:26:30<24:38, 1.53it/s]
77%|███████▋ | 7507/9770 [1:26:30<24:29, 1.54
+0: {'loss': 0.6606, 'grad_norm': 0.5875310509975358, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.77}
+0: it/s]
77%|███████▋ | 7508/9770 [1:26:31<24:21, 1.55it/s]
77%|███████▋ | 7509/9770 [1:26:32<24:33, 1.53it/s]
77%|███████▋ | 7510/9770 [1:26:32<24:20, 1.55it/s]
77%|███████▋ | 7510/9770 [1:26:32<24:20, 1.55it/s]
77%|███████▋ | 7511/9770 [1:26:33<24:20, 1.55it/s]
77%|███████▋ | 7512/9770 [1:26:34<24:19, 1.55it/s]
77%|███████▋ | 7513/9770 [1:26:34<24:07, 1.56it/s]
77%|███████▋ | 7514/9770 [1:26:35<24:13, 1.55it/s]
77%|███████▋ | 7515/9770 [1:26:35<24:44, 1.52it/s]
77%|███████▋ | 7516/9770 [1:26:36<24:23, 1.54it/s]
77%|███████▋ | 7517/9770 [1:26:37<24:20, 1.54it/s]
77%|███████▋ | 7518/9770 [1:26:37<24:24, 1.54it/s]
77%|███████▋ | 7519/9770 [1:26:38<25:32, 1.47it/s]
77%|████���██▋ | 7520/9770 [1:26:39<2
+0: {'loss': 0.6476, 'grad_norm': 0.6307834141084832, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.77}
+0: {'loss': 0.6625, 'grad_norm': 0.6392285155212977, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.77}
+0: 4:57, 1.50it/s]
77%|███████▋ | 7520/9770 [1:26:39<24:57, 1.50it/s]
77%|███████▋ | 7521/9770 [1:26:39<24:40, 1.52it/s]
77%|███████▋ | 7522/9770 [1:26:40<24:44, 1.51it/s]
77%|███████▋ | 7523/9770 [1:26:41<24:38, 1.52it/s]
77%|███████▋ | 7524/9770 [1:26:41<24:39, 1.52it/s]
77%|███████▋ | 7525/9770 [1:26:42<24:39, 1.52it/s]
77%|███████▋ | 7526/9770 [1:26:43<25:12, 1.48it/s]
77%|███████▋ | 7527/9770 [1:26:43<25:16, 1.48it/s]
77%|███████▋ | 7528/9770 [1:26:44<25:01, 1.49it/s]
77%|███████▋ | 7529/9770 [1:26:45<24:52, 1.50it/s]
77%|███████▋ | 7530/9770 [1:26:45<24:43, 1.51it/s]
77%|███████▋ | 7530/9770 [1:26:45<24:43, 1.51it/s]
77%|███████▋ | 7531/9770 [1:26:46<24:50
+0: {'loss': 0.6403, 'grad_norm': 0.6336726995125174, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.77}
+0: , 1.50it/s]
77%|███████▋ | 7532/9770 [1:26:47<24:34, 1.52it/s]
77%|███████▋ | 7533/9770 [1:26:47<24:45, 1.51it/s]
77%|███████▋ | 7534/9770 [1:26:48<24:21, 1.53it/s]
77%|███████▋ | 7535/9770 [1:26:49<24:14, 1.54it/s]
77%|███████▋ | 7536/9770 [1:26:49<24:14, 1.54it/s]
77%|███████▋ | 7537/9770 [1:26:50<23:57, 1.55it/s]
77%|███████▋ | 7538/9770 [1:26:51<24:12, 1.54it/s]
77%|███████▋ | 7539/9770 [1:26:51<24:49, 1.50it/s]
77%|███████▋ | 7540/9770 [1:26:52<24:31, 1.52it/s]
77%|███████▋ | 7540/9770 [1:26:52<24:31, 1.52it/s]
77%|███████▋ | 7541/9770 [1:26:53<24:08, 1.54it/s]
77%|███████▋ | 7542/9770 [1:26:53<24:00, 1.55it/s]
77%|███████▋ | 7543/9770 [1:26:54<24:06, 1.54it/s]
77%|███████▋ | 7544/9770 [1:
+0: {'loss': 0.6346, 'grad_norm': 0.6252329712101955, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.77}
+0: 26:55<24:05, 1.54it/s]
77%|███████▋ | 7545/9770 [1:26:55<24:32, 1.51it/s]
77%|███████▋ | 7546/9770 [1:26:56<24:52, 1.49it/s]
77%|███████▋ | 7547/9770 [1:26:57<24:34, 1.51it/s]
77%|███████▋ | 7548/9770 [1:26:57<24:22, 1.52it/s]
77%|███████▋ | 7549/9770 [1:26:58<24:21, 1.52it/s]
77%|███████▋ | 7550/9770 [1:26:59<24:20, 1.52it/s]
77%|███████▋ | 7550/9770 [1:26:59<24:20, 1.52it/s]
77%|███████▋ | 7551/9770 [1:26:59<24:05, 1.54it/s]
77%|███████▋ | 7552/9770 [1:27:00<24:19, 1.52it/s]
77%|███████▋ | 7553/9770 [1:27:01<24:10, 1.53it/s]
77%|███████▋ | 7554/9770 [1:27:01<24:22, 1.52it/s]
77%|███████▋ | 7555/9770 [1:27:02<24:07, 1.53it/s]
77%|███████▋ | 7556/9770 [1:27:02<24:01, 1.54it/s]
77%|███████▋ | 75
+0: {'loss': 0.642, 'grad_norm': 0.5714879057449758, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.77}
+0: 57/9770 [1:27:03<24:06, 1.53it/s]
77%|███████▋ | 7558/9770 [1:27:04<23:56, 1.54it/s]
77%|███████▋ | 7559/9770 [1:27:04<23:44, 1.55it/s]
77%|███████▋ | 7560/9770 [1:27:05<23:52, 1.54it/s]
77%|███████▋ | 7560/9770 [1:27:05<23:52, 1.54it/s]
77%|███████▋ | 7561/9770 [1:27:06<24:06, 1.53it/s]
77%|███████▋ | 7562/9770 [1:27:06<24:00, 1.53it/s]
77%|███████▋ | 7563/9770 [1:27:07<24:05, 1.53it/s]
77%|███████▋ | 7564/9770 [1:27:08<23:40, 1.55it/s]
77%|███████▋ | 7565/9770 [1:27:08<23:40, 1.55it/s]
77%|███████▋ | 7566/9770 [1:27:09<24:02, 1.53it/s]
77%|███████▋ | 7567/9770 [1:27:10<23:47, 1.54it/s]
77%|███████▋ | 7568/9770 [1:27:10<24:04, 1.52it/s]
77%|███████▋ | 7569/9770 [1:27:11<23:57, 1.53it/s]
77%|██████�
+0: {'loss': 0.6607, 'grad_norm': 0.5544062795923599, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.77}
+0: {'loss': 0.6307, 'grad_norm': 0.6148215099868057, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.78}
+0: ��▋ | 7570/9770 [1:27:12<23:38, 1.55it/s]
77%|███████▋ | 7570/9770 [1:27:12<23:38, 1.55it/s]
77%|███████▋ | 7571/9770 [1:27:12<23:37, 1.55it/s]
78%|███████▊ | 7572/9770 [1:27:13<23:34, 1.55it/s]
78%|███████▊ | 7573/9770 [1:27:13<23:39, 1.55it/s]
78%|███████▊ | 7574/9770 [1:27:14<23:41, 1.55it/s]
78%|███████▊ | 7575/9770 [1:27:15<23:46, 1.54it/s]
78%|███████▊ | 7576/9770 [1:27:15<23:37, 1.55it/s]
78%|███████▊ | 7577/9770 [1:27:16<23:46, 1.54it/s]
78%|███████▊ | 7578/9770 [1:27:17<23:42, 1.54it/s]
78%|███████▊ | 7579/9770 [1:27:17<23:47, 1.53it/s]
78%|███████▊ | 7580/9770 [1:27:18<23:50, 1.53it/s]
78%|███████▊ | 7580/9770 [1:27:18<23:50, 1.53it/s]
78%|███████�
+0: {'loss': 0.6483, 'grad_norm': 0.6182403662394549, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.78}
+0: � | 7581/9770 [1:27:19<23:44, 1.54it/s]
78%|███████▊ | 7582/9770 [1:27:19<23:35, 1.55it/s]
78%|███████▊ | 7583/9770 [1:27:20<23:19, 1.56it/s]
78%|███████▊ | 7584/9770 [1:27:21<23:30, 1.55it/s]
78%|███████▊ | 7585/9770 [1:27:21<23:31, 1.55it/s]
78%|███████▊ | 7586/9770 [1:27:22<23:47, 1.53it/s]
78%|███████▊ | 7587/9770 [1:27:23<23:54, 1.52it/s]
78%|███████▊ | 7588/9770 [1:27:23<24:07, 1.51it/s]
78%|███████▊ | 7589/9770 [1:27:24<24:24, 1.49it/s]
78%|███████▊ | 7590/9770 [1:27:25<24:06, 1.51it/s]
78%|███████▊ | 7590/9770 [1:27:25<24:06, 1.51it/s]
78%|███████▊ | 7591/9770 [1:27:25<23:53, 1.52it/s]
78%|███████▊ | 7592/9770 [1:27:26<23:52, 1.52it/s]
78%|███████▊ | 7593/9770 [1:27:27<23:36, 1.54it/s]
78%|████
+0: {'loss': 0.6719, 'grad_norm': 0.5942953071673898, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.78}
+0: ███▊ | 7594/9770 [1:27:27<23:29, 1.54it/s]
78%|███████▊ | 7595/9770 [1:27:28<23:36, 1.54it/s]
78%|███████▊ | 7596/9770 [1:27:29<24:16, 1.49it/s]
78%|███████▊ | 7597/9770 [1:27:29<24:03, 1.51it/s]
78%|███████▊ | 7598/9770 [1:27:30<23:47, 1.52it/s]
78%|███████▊ | 7599/9770 [1:27:31<23:40, 1.53it/s]
78%|███████▊ | 7600/9770 [1:27:31<23:43, 1.52it/s]
78%|███████▊ | 7600/9770 [1:27:31<23:43, 1.52it/s]
78%|███████▊ | 7601/9770 [1:27:32<23:39, 1.53it/s]
78%|███████▊ | 7602/9770 [1:27:32<23:37, 1.53it/s]
78%|███████▊ | 7603/9770 [1:27:33<23:20, 1.55it/s]
78%|███████▊ | 7604/9770 [1:27:34<23:29, 1.54it/s]
78%|███████▊ | 7605/9770 [1:27:34<23:27, 1.54it/s]
78%|███████▊ | 7606/9770 [1:27:35<23:39, 1.52it/s]
78%|�
+0: {'loss': 0.6478, 'grad_norm': 0.5694043947539491, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.78}
+0: ��██████▊ | 7607/9770 [1:27:36<23:41, 1.52it/s]
78%|███████▊ | 7608/9770 [1:27:36<23:50, 1.51it/s]
78%|███████▊ | 7609/9770 [1:27:37<23:36, 1.53it/s]
78%|███████▊ | 7610/9770 [1:27:38<23:33, 1.53it/s]
78%|███████▊ | 7610/9770 [1:27:38<23:33, 1.53it/s]
78%|███████▊ | 7611/9770 [1:27:38<23:46, 1.51it/s]
78%|███████▊ | 7612/9770 [1:27:39<23:39, 1.52it/s]
78%|███████▊ | 7613/9770 [1:27:40<23:51, 1.51it/s]
78%|███████▊ | 7614/9770 [1:27:40<23:53, 1.50it/s]
78%|███████▊ | 7615/9770 [1:27:41<23:46, 1.51it/s]
78%|███████▊ | 7616/9770 [1:27:42<24:00, 1.50it/s]
78%|███████▊ | 7617/9770 [1:27:42<23:32, 1.52it/s]
78%|███████▊ | 7618/9770 [1:27:43<23:07, 1.55it/s]
78%|███████▊ | 7619/9770 [1:27:44<23:09, 1.55i
+0: {'loss': 0.6497, 'grad_norm': 0.5858731492458507, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.78}
+0: {'loss': 0.6725, 'grad_norm': 0.6262111008331863, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.78}
+0: t/s]
78%|███████▊ | 7620/9770 [1:27:44<23:17, 1.54it/s]
78%|███████▊ | 7620/9770 [1:27:44<23:17, 1.54it/s]
78%|███████▊ | 7621/9770 [1:27:45<23:26, 1.53it/s]
78%|███████▊ | 7622/9770 [1:27:46<23:05, 1.55it/s]
78%|███████▊ | 7623/9770 [1:27:46<23:13, 1.54it/s]
78%|███████▊ | 7624/9770 [1:27:47<23:16, 1.54it/s]
78%|███████▊ | 7625/9770 [1:27:48<23:16, 1.54it/s]
78%|███████▊ | 7626/9770 [1:27:48<23:03, 1.55it/s]
78%|███████▊ | 7627/9770 [1:27:49<23:31, 1.52it/s]
78%|███████▊ | 7628/9770 [1:27:49<23:15, 1.54it/s]
78%|███████▊ | 7629/9770 [1:27:50<23:23, 1.53it/s]
78%|███████▊ | 7630/9770 [1:27:51<23:04, 1.55it/s]
78%|███████▊ | 7630/9770 [1:27:51<23:04, 1.55it/s]
+0: {'loss': 0.649, 'grad_norm': 0.6119738083369423, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.78}
+0:
78%|███████▊ | 7631/9770 [1:27:51<23:07, 1.54it/s]
78%|███████▊ | 7632/9770 [1:27:52<23:04, 1.54it/s]
78%|███████▊ | 7633/9770 [1:27:53<22:52, 1.56it/s]
78%|███████▊ | 7634/9770 [1:27:53<23:00, 1.55it/s]
78%|███████▊ | 7635/9770 [1:27:54<22:57, 1.55it/s]
78%|███████▊ | 7636/9770 [1:27:55<22:37, 1.57it/s]
78%|███████▊ | 7637/9770 [1:27:55<22:50, 1.56it/s]
78%|███████▊ | 7638/9770 [1:27:56<22:53, 1.55it/s]
78%|███████▊ | 7639/9770 [1:27:57<22:49, 1.56it/s]
78%|███████▊ | 7640/9770 [1:27:57<22:45, 1.56it/s]
78%|███████▊ | 7640/9770 [1:27:57<22:45, 1.56it/s]
78%|███████▊ | 7641/9770 [1:27:58<22:59, 1.54it/s]
78%|███████▊ | 7642/9770 [1:27:58<22:55, 1.55it/s]
78%|███████▊ | 7643/9770 [1:27:59<22:37,
+0: {'loss': 0.6369, 'grad_norm': 0.6060823204611006, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.78}
+0: 1.57it/s]
78%|███████▊ | 7644/9770 [1:28:00<22:53, 1.55it/s]
78%|███████▊ | 7645/9770 [1:28:00<22:57, 1.54it/s]
78%|███████▊ | 7646/9770 [1:28:01<23:06, 1.53it/s]
78%|███████▊ | 7647/9770 [1:28:02<23:03, 1.53it/s]
78%|███████▊ | 7648/9770 [1:28:02<23:04, 1.53it/s]
78%|███████▊ | 7649/9770 [1:28:03<23:12, 1.52it/s]
78%|███████▊ | 7650/9770 [1:28:04<23:00, 1.54it/s]
78%|███████▊ | 7650/9770 [1:28:04<23:00, 1.54it/s]
78%|███████▊ | 7651/9770 [1:28:04<22:41, 1.56it/s]
78%|███████▊ | 7652/9770 [1:28:05<22:49, 1.55it/s]
78%|███████▊ | 7653/9770 [1:28:06<22:40, 1.56it/s]
78%|███████▊ | 7654/9770 [1:28:06<22:54, 1.54it/s]
78%|███████▊ | 7655/9770 [1:28:07<23:19, 1.51it/s]
78%|███████▊ | 7656/9770 [1:2
+0: {'loss': 0.6423, 'grad_norm': 0.5716280329295779, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.78}
+0: 8:08<23:32, 1.50it/s]
78%|███████▊ | 7657/9770 [1:28:08<23:27, 1.50it/s]
78%|███████▊ | 7658/9770 [1:28:09<23:20, 1.51it/s]
78%|███████▊ | 7659/9770 [1:28:10<23:13, 1.51it/s]
78%|███████▊ | 7660/9770 [1:28:10<23:29, 1.50it/s]
78%|███████▊ | 7660/9770 [1:28:10<23:29, 1.50it/s]
78%|███████▊ | 7661/9770 [1:28:11<23:04, 1.52it/s]
78%|███████▊ | 7662/9770 [1:28:12<22:50, 1.54it/s]
78%|███████▊ | 7663/9770 [1:28:12<23:04, 1.52it/s]
78%|███████▊ | 7664/9770 [1:28:13<22:59, 1.53it/s]
78%|███████▊ | 7665/9770 [1:28:14<22:52, 1.53it/s]
78%|███████▊ | 7666/9770 [1:28:14<22:33, 1.55it/s]
78%|███████▊ | 7667/9770 [1:28:15<23:20, 1.50it/s]
78%|███████▊ | 7668/9770 [1:28:16<23:21, 1.50it/s]
78%|███████▊ | 766
+0: {'loss': 0.6516, 'grad_norm': 0.5654812439202281, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.78}
+0: {'loss': 0.6562, 'grad_norm': 0.6199105076588045, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.79}
+0: 9/9770 [1:28:16<23:15, 1.51it/s]
79%|███████▊ | 7670/9770 [1:28:17<22:52, 1.53it/s]
79%|███████▊ | 7670/9770 [1:28:17<22:52, 1.53it/s]
79%|███████▊ | 7671/9770 [1:28:18<22:53, 1.53it/s]
79%|███████▊ | 7672/9770 [1:28:18<22:49, 1.53it/s]
79%|███████▊ | 7673/9770 [1:28:19<22:42, 1.54it/s]
79%|███████▊ | 7674/9770 [1:28:19<22:44, 1.54it/s]
79%|███████▊ | 7675/9770 [1:28:20<23:01, 1.52it/s]
79%|███████▊ | 7676/9770 [1:28:21<22:53, 1.52it/s]
79%|███████▊ | 7677/9770 [1:28:21<22:57, 1.52it/s]
79%|███████▊ | 7678/9770 [1:28:22<22:45, 1.53it/s]
79%|███████▊ | 7679/9770 [1:28:23<22:42, 1.54it/s]
79%|███████▊ | 7680/9770 [1:28:23<22:37, 1.54it/s]
79%|███████▊ | 7680/97
+0: {'loss': 0.6361, 'grad_norm': 0.5924798354147677, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.79}
+0: 70 [1:28:23<22:37, 1.54it/s]
79%|███████▊ | 7681/9770 [1:28:24<22:38, 1.54it/s]
79%|███████▊ | 7682/9770 [1:28:25<22:32, 1.54it/s]
79%|███████▊ | 7683/9770 [1:28:25<22:24, 1.55it/s]
79%|███████▊ | 7684/9770 [1:28:26<22:31, 1.54it/s]
79%|███████▊ | 7685/9770 [1:28:27<22:48, 1.52it/s]
79%|███████▊ | 7686/9770 [1:28:27<22:36, 1.54it/s]
79%|███████▊ | 7687/9770 [1:28:28<22:37, 1.53it/s]
79%|███████▊ | 7688/9770 [1:28:29<22:42, 1.53it/s]
79%|███████▊ | 7689/9770 [1:28:29<22:44, 1.53it/s]
79%|███████▊ | 7690/9770 [1:28:30<22:52, 1.52it/s]
79%|███████▊ | 7690/9770 [1:28:30<22:52, 1.52it/s]
79%|███████▊ | 7691/9770 [1:28:31<22:44, 1.52it/s]
79%|███████▊ | 7692/9770 [1:28:31<22:27, 1.54it/s]
79%|███████▊
+0: {'loss': 0.6667, 'grad_norm': 0.6107388749357734, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.79}
+0: | 7693/9770 [1:28:32<22:17, 1.55it/s]
79%|███████▉ | 7694/9770 [1:28:32<22:15, 1.55it/s]
79%|███████▉ | 7695/9770 [1:28:33<22:14, 1.55it/s]
79%|███████▉ | 7696/9770 [1:28:34<22:24, 1.54it/s]
79%|███████▉ | 7697/9770 [1:28:34<22:21, 1.55it/s]
79%|███████▉ | 7698/9770 [1:28:35<22:29, 1.54it/s]
79%|███████▉ | 7699/9770 [1:28:36<23:03, 1.50it/s]
79%|███████▉ | 7700/9770 [1:28:36<22:32, 1.53it/s]
79%|███████▉ | 7700/9770 [1:28:36<22:32, 1.53it/s]
79%|███████▉ | 7701/9770 [1:28:37<22:35, 1.53it/s]
79%|███████▉ | 7702/9770 [1:28:38<22:28, 1.53it/s]
79%|███████▉ | 7703/9770 [1:28:38<22:34, 1.53it/s]
79%|███████▉ | 7704/9770 [1:28:39<22:57, 1.50it/s]
79%|███████▉ | 7705/9770 [1:28:40<22:32, 1.53it/s]
79%|████�
+0: {'loss': 0.6495, 'grad_norm': 0.6100469495613748, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.79}
+0: ��██▉ | 7706/9770 [1:28:40<22:37, 1.52it/s]
79%|███████▉ | 7707/9770 [1:28:41<22:31, 1.53it/s]
79%|███████▉ | 7708/9770 [1:28:42<22:27, 1.53it/s]
79%|███████▉ | 7709/9770 [1:28:42<22:37, 1.52it/s]
79%|███████▉ | 7710/9770 [1:28:43<22:36, 1.52it/s]
79%|███████▉ | 7710/9770 [1:28:43<22:36, 1.52it/s]
79%|███████▉ | 7711/9770 [1:28:44<22:34, 1.52it/s]
79%|███████▉ | 7712/9770 [1:28:44<22:25, 1.53it/s]
79%|███████▉ | 7713/9770 [1:28:45<22:23, 1.53it/s]
79%|███████▉ | 7714/9770 [1:28:46<22:23, 1.53it/s]
79%|███████▉ | 7715/9770 [1:28:46<22:43, 1.51it/s]
79%|███████▉ | 7716/9770 [1:28:47<22:16, 1.54it/s]
79%|███████▉ | 7717/9770 [1:28:48<22:24, 1.53it/s]
79%|███████▉ | 7718/9770 [1:28:48<22:21, 1.53it/s]
79%|�
+0: {'loss': 0.641, 'grad_norm': 0.5786261006200034, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.79}
+0: {'loss': 0.6421, 'grad_norm': 0.5998515328407902, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.79}
+0: �██████▉ | 7719/9770 [1:28:49<22:30, 1.52it/s]
79%|███████▉ | 7720/9770 [1:28:50<22:18, 1.53it/s]
79%|███████▉ | 7720/9770 [1:28:50<22:18, 1.53it/s]
79%|███████▉ | 7721/9770 [1:28:50<22:18, 1.53it/s]
79%|███████▉ | 7722/9770 [1:28:51<22:35, 1.51it/s]
79%|███████▉ | 7723/9770 [1:28:52<22:24, 1.52it/s]
79%|███████▉ | 7724/9770 [1:28:52<22:15, 1.53it/s]
79%|███████▉ | 7725/9770 [1:28:53<22:18, 1.53it/s]
79%|███████▉ | 7726/9770 [1:28:53<22:24, 1.52it/s]
79%|███████▉ | 7727/9770 [1:28:54<22:24, 1.52it/s]
79%|███████▉ | 7728/9770 [1:28:55<22:41, 1.50it/s]
79%|███████▉ | 7729/9770 [1:28:55<22:17, 1.53it/s]
79%|███████▉ | 7730/9770 [1:28:56<22:15, 1.53it/s]
79%|██
+0: {'loss': 0.6508, 'grad_norm': 0.5757542905583947, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.79}
+0: █████▉ | 7730/9770 [1:28:56<22:15, 1.53it/s]
79%|███████▉ | 7731/9770 [1:28:57<22:09, 1.53it/s]
79%|███████▉ | 7732/9770 [1:28:57<22:20, 1.52it/s]
79%|███████▉ | 7733/9770 [1:28:58<22:59, 1.48it/s]
79%|███████▉ | 7734/9770 [1:28:59<22:47, 1.49it/s]
79%|███████▉ | 7735/9770 [1:28:59<22:37, 1.50it/s]
79%|███████▉ | 7736/9770 [1:29:00<22:18, 1.52it/s]
79%|███████▉ | 7737/9770 [1:29:01<22:13, 1.52it/s]
79%|███████▉ | 7738/9770 [1:29:01<21:55, 1.54it/s]
79%|███████▉ | 7739/9770 [1:29:02<22:24, 1.51it/s]
79%|███████▉ | 7740/9770 [1:29:03<22:17, 1.52it/s]
79%|███████▉ | 7740/9770 [1:29:03<22:17, 1.52it/s]
79%|███████▉ | 7741/9770 [1:29:03<22:07, 1.53it/s]
79%|███████▉ | 7742/9770 [1:29:04<22:02, 1.53it/s]
+0: {'loss': 0.6773, 'grad_norm': 0.6095169766738976, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.79}
+0: 79%|███████▉ | 7743/9770 [1:29:05<21:58, 1.54it/s]
79%|███████▉ | 7744/9770 [1:29:05<22:19, 1.51it/s]
79%|███████▉ | 7745/9770 [1:29:06<22:12, 1.52it/s]
79%|███████▉ | 7746/9770 [1:29:07<22:01, 1.53it/s]
79%|███████▉ | 7747/9770 [1:29:07<22:21, 1.51it/s]
79%|███████▉ | 7748/9770 [1:29:08<22:35, 1.49it/s]
79%|███████▉ | 7749/9770 [1:29:09<22:44, 1.48it/s]
79%|███████▉ | 7750/9770 [1:29:09<22:24, 1.50it/s]
79%|███████▉ | 7750/9770 [1:29:09<22:24, 1.50it/s]
79%|███████▉ | 7751/9770 [1:29:10<22:18, 1.51it/s]
79%|███████▉ | 7752/9770 [1:29:11<22:08, 1.52it/s]
79%|███████▉ | 7753/9770 [1:29:11<21:57, 1.53it/s]
79%|███████▉ | 7754/9770 [1:29:12<22:08, 1.52it/s]
79%|███████▉ | 7755/9770 [1:29:13<21:45,
+0: {'loss': 0.6616, 'grad_norm': 0.5825167591987942, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.79}
+0: 1.54it/s]
79%|███████▉ | 7756/9770 [1:29:13<21:51, 1.54it/s]
79%|███████▉ | 7757/9770 [1:29:14<21:52, 1.53it/s]
79%|███████▉ | 7758/9770 [1:29:15<21:37, 1.55it/s]
79%|███████▉ | 7759/9770 [1:29:15<21:50, 1.53it/s]
79%|███████▉ | 7760/9770 [1:29:16<21:56, 1.53it/s]
79%|███████▉ | 7760/9770 [1:29:16<21:56, 1.53it/s]
79%|███████▉ | 7761/9770 [1:29:16<21:50, 1.53it/s]
79%|███████▉ | 7762/9770 [1:29:17<21:56, 1.53it/s]
79%|███████▉ | 7763/9770 [1:29:18<21:57, 1.52it/s]
79%|███��███▉ | 7764/9770 [1:29:18<21:59, 1.52it/s]
79%|███████▉ | 7765/9770 [1:29:19<22:06, 1.51it/s]
79%|███████▉ | 7766/9770 [1:29:20<22:33, 1.48it/s]
79%|███████▉ | 7767/9770 [1:29:20<22:12, 1.50it/s]
80%|███████▉ | 7768/9770 [1:29
+0: {'loss': 0.6564, 'grad_norm': 0.582752384573689, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.79}
+0: :21<22:09, 1.51it/s]
80%|███████▉ | 7769/9770 [1:29:22<21:53, 1.52it/s]
80%|███████▉ | 7770/9770 [1:29:22<22:09, 1.50it/s]
80%|███████▉ | 7770/9770 [1:29:22<22:09, 1.50it/s]
80%|███████▉ | 7771/9770 [1:29:23<22:32, 1.48it/s]
80%|███████▉ | 7772/9770 [1:29:24<22:17, 1.49it/s]
80%|███████▉ | 7773/9770 [1:29:24<22:13, 1.50it/s]
80%|███████▉ | 7774/9770 [1:29:25<22:05, 1.51it/s]
80%|███████▉ | 7775/9770 [1:29:26<21:58, 1.51it/s]
80%|███████▉ | 7776/9770 [1:29:26<21:56, 1.51it/s]
80%|███████▉ | 7777/9770 [1:29:27<21:40, 1.53it/s]
80%|███████▉ | 7778/9770 [1:29:28<22:12, 1.50it/s]
80%|███████▉ | 7779/9770 [1:29:28<21:46, 1.52it/s]
80%|███████▉ | 7780/9770 [1:29:29<21:38, 1.53it/s]
+0: {'loss': 0.67, 'grad_norm': 0.640384264981169, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.8}
+0: {'loss': 0.6364, 'grad_norm': 0.5767090113468913, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.8}
+0:
80%|███████▉ | 7780/9770 [1:29:29<21:38, 1.53it/s]
80%|███████▉ | 7781/9770 [1:29:30<21:21, 1.55it/s]
80%|███████▉ | 7782/9770 [1:29:30<22:00, 1.51it/s]
80%|███████▉ | 7783/9770 [1:29:31<21:55, 1.51it/s]
80%|███████▉ | 7784/9770 [1:29:32<21:44, 1.52it/s]
80%|███████▉ | 7785/9770 [1:29:32<21:28, 1.54it/s]
80%|███████▉ | 7786/9770 [1:29:33<21:35, 1.53it/s]
80%|███████▉ | 7787/9770 [1:29:34<21:44, 1.52it/s]
80%|███████▉ | 7788/9770 [1:29:34<21:38, 1.53it/s]
80%|███████▉ | 7789/9770 [1:29:35<21:20, 1.55it/s]
80%|███████▉ | 7790/9770 [1:29:36<21:30, 1.53it/s]
80%|███████▉ | 7790/9770 [1:29:36<21:30, 1.53it/s]
80%|███████▉ | 7791/9770 [1:29:36<21:38, 1.52it/s]
80%|███████▉ | 7792/977
+0: {'loss': 0.6386, 'grad_norm': 0.5957009439289818, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.8}
+0: 0 [1:29:37<21:29, 1.53it/s]
80%|███████▉ | 7793/9770 [1:29:38<21:27, 1.54it/s]
80%|███████▉ | 7794/9770 [1:29:38<21:27, 1.54it/s]
80%|███████▉ | 7795/9770 [1:29:39<21:16, 1.55it/s]
80%|███████▉ | 7796/9770 [1:29:40<21:49, 1.51it/s]
80%|███████▉ | 7797/9770 [1:29:40<21:43, 1.51it/s]
80%|███████▉ | 7798/9770 [1:29:41<21:32, 1.53it/s]
80%|███████▉ | 7799/9770 [1:29:41<21:19, 1.54it/s]
80%|███████▉ | 7800/9770 [1:29:42<21:26, 1.53it/s]
80%|███████▉ | 7800/9770 [1:29:42<21:26, 1.53it/s]
80%|███████▉ | 7801/9770 [1:29:43<21:26, 1.53it/s]
80%|███████▉ | 7802/9770 [1:29:43<21:28, 1.53it/s]
80%|███████▉ | 7803/9770 [1:29:44<21:54, 1.50it/s]
80%|███████▉ | 7804/9770 [1:29:45<21:40, 1.51it/s]
80%|███████▉
+0: {'loss': 0.6525, 'grad_norm': 0.6051848176523708, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.8}
+0: [2025-09-02 21:25:54,345] [INFO] [axolotl.core.trainers.base._save:613] [PID:3622631] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1/checkpoint-7816[39m
+0: [2025-09-02 21:25:55,242] [INFO] [axolotl.core.trainers.base._save:662] [PID:3622631] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m
+0: | 7805/9770 [1:29:45<21:14, 1.54it/s]
80%|███████▉ | 7806/9770 [1:29:46<21:11, 1.54it/s]
80%|███████▉ | 7807/9770 [1:29:47<21:08, 1.55it/s]
80%|███████▉ | 7808/9770 [1:29:47<21:11, 1.54it/s]
80%|███████▉ | 7809/9770 [1:29:48<21:25, 1.53it/s]
80%|███████▉ | 7810/9770 [1:29:49<21:28, 1.52it/s]
80%|███████▉ | 7810/9770 [1:29:49<21:28, 1.52it/s]
80%|███████▉ | 7811/9770 [1:29:49<21:28, 1.52it/s]
80%|███████▉ | 7812/9770 [1:29:50<21:23, 1.53it/s]
80%|███████▉ | 7813/9770 [1:29:51<21:21, 1.53it/s]
80%|███████▉ | 7814/9770 [1:29:51<21:17, 1.53it/s]
80%|███████▉ | 7815/9770 [1:29:52<20:56, 1.56it/s]
80%|████████ | 7816/9770 [1:29:53<20:35, 1.58it/s]
80%|████████ | 7817/9770 [1:29:55<41:09, 1.26s/it]
80%|████�
+0: {'loss': 0.6366, 'grad_norm': 0.6151419471490285, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.8}
+0: �███ | 7818/9770 [1:29:56<35:06, 1.08s/it]
80%|████████ | 7819/9770 [1:29:57<30:37, 1.06it/s]
80%|████████ | 7820/9770 [1:29:57<27:57, 1.16it/s]
80%|████████ | 7820/9770 [1:29:57<27:57, 1.16it/s]
80%|████████ | 7821/9770 [1:29:58<25:57, 1.25it/s]
80%|████████ | 7822/9770 [1:29:59<25:22, 1.28it/s]
80%|████████ | 7823/9770 [1:29:59<24:19, 1.33it/s]
80%|████████ | 7824/9770 [1:30:00<23:22, 1.39it/s]
80%|████████ | 7825/9770 [1:30:01<23:06, 1.40it/s]
80%|████████ | 7826/9770 [1:30:01<22:58, 1.41it/s]
80%|████████ | 7827/9770 [1:30:02<22:06, 1.46it/s]
80%|████████ | 7828/9770 [1:30:03<21:43, 1.49it/s]
80%|████████ | 7829/9770 [1:30:03<21:23, 1.51it/s]
80%|████████ | 7830/9770 [1:30:04<21:14, 1.52it/s]
+0: {'loss': 0.6436, 'grad_norm': 0.5832642772142379, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.8}
+0: {'loss': 0.6556, 'grad_norm': 0.5978307860900283, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.8}
+0:
80%|████████ | 7830/9770 [1:30:04<21:14, 1.52it/s]
80%|████████ | 7831/9770 [1:30:05<21:09, 1.53it/s]
80%|████████ | 7832/9770 [1:30:05<21:08, 1.53it/s]
80%|████████ | 7833/9770 [1:30:06<21:06, 1.53it/s]
80%|████████ | 7834/9770 [1:30:07<21:27, 1.50it/s]
80%|████████ | 7835/9770 [1:30:07<21:19, 1.51it/s]
80%|████████ | 7836/9770 [1:30:08<21:14, 1.52it/s]
80%|████████ | 7837/9770 [1:30:08<20:55, 1.54it/s]
80%|████████ | 7838/9770 [1:30:09<20:58, 1.54it/s]
80%|████████ | 7839/9770 [1:30:10<21:02, 1.53it/s]
80%|████████ | 7840/9770 [1:30:10<21:02, 1.53it/s]
80%|████████ | 7840/9770 [1:30:10<21:02, 1.53it/s]
80%|████████ | 7841/9770 [1:30:11<20:59, 1.53it/s]
80%|██�
+0: {'loss': 0.6499, 'grad_norm': 0.5847377319393593, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.8}
+0: ��█████ | 7842/9770 [1:30:12<21:12, 1.52it/s]
80%|████████ | 7843/9770 [1:30:12<21:16, 1.51it/s]
80%|████████ | 7844/9770 [1:30:13<21:06, 1.52it/s]
80%|████████ | 7845/9770 [1:30:14<20:49, 1.54it/s]
80%|████████ | 7846/9770 [1:30:14<20:50, 1.54it/s]
80%|████████ | 7847/9770 [1:30:15<21:01, 1.52it/s]
80%|████████ | 7848/9770 [1:30:16<20:55, 1.53it/s]
80%|████████ | 7849/9770 [1:30:16<20:50, 1.54it/s]
80%|████████ | 7850/9770 [1:30:17<20:35, 1.55it/s]
80%|████████ | 7850/9770 [1:30:17<20:35, 1.55it/s]
80%|████████ | 7851/9770 [1:30:18<20:35, 1.55it/s]
80%|████████ | 7852/9770 [1:30:18<20:42, 1.54it/s]
80%|████████ | 7853/9770 [1:30:19<20:52, 1.53it/s]
80%|████████ | 7854/9770 [1:30:20<20:43, 1.54it/s]
+0: {'loss': 0.6604, 'grad_norm': 0.6134773396412422, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.8}
+0: 80%|████████ | 7855/9770 [1:30:20<20:47, 1.53it/s]
80%|████████ | 7856/9770 [1:30:21<21:11, 1.51it/s]
80%|████████ | 7857/9770 [1:30:22<21:01, 1.52it/s]
80%|████████ | 7858/9770 [1:30:22<20:51, 1.53it/s]
80%|████████ | 7859/9770 [1:30:23<21:11, 1.50it/s]
80%|████████ | 7860/9770 [1:30:24<20:55, 1.52it/s]
80%|████████ | 7860/9770 [1:30:24<20:55, 1.52it/s]
80%|████████ | 7861/9770 [1:30:24<20:53, 1.52it/s]
80%|████████ | 7862/9770 [1:30:25<21:08, 1.50it/s]
80%|████████ | 7863/9770 [1:30:26<20:52, 1.52it/s]
80%|████████ | 7864/9770 [1:30:26<20:46, 1.53it/s]
81%|████████ | 7865/9770 [1:30:27<20:42, 1.53it/s]
81%|████████ | 7866/9770 [1:30:27<20:46, 1.53it/s]
81%|████████ | 7867/9770 [1:30:28<20:41,
+0: {'loss': 0.6542, 'grad_norm': 0.5846256365738515, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.81}
+0: 1.53it/s]
81%|████████ | 7868/9770 [1:30:29<20:56, 1.51it/s]
81%|████████ | 7869/9770 [1:30:29<20:39, 1.53it/s]
81%|████████ | 7870/9770 [1:30:30<20:24, 1.55it/s]
81%|████████ | 7870/9770 [1:30:30<20:24, 1.55it/s]
81%|████████ | 7871/9770 [1:30:31<20:20, 1.56it/s]
81%|████████ | 7872/9770 [1:30:31<20:12, 1.56it/s]
81%|████████ | 7873/9770 [1:30:32<20:17, 1.56it/s]
81%|████████ | 7874/9770 [1:30:33<20:27, 1.54it/s]
81%|████████ | 7875/9770 [1:30:33<20:21, 1.55it/s]
81%|████████ | 7876/9770 [1:30:34<20:34, 1.53it/s]
81%|████████ | 7877/9770 [1:30:35<20:29, 1.54it/s]
81%|████████ | 7878/9770 [1:30:35<20:18, 1.55it/s]
81%|████████ | 7879/9770 [1:30:36<20:10, 1.56it/s]
81%|████████ | 7880/9770 [1:30:
+0: {'loss': 0.648, 'grad_norm': 0.5594934689814495, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.81}
+0: {'loss': 0.6466, 'grad_norm': 0.5833123225807513, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.81}
+0: 36<19:52, 1.59it/s]
81%|████████ | 7880/9770 [1:30:36<19:52, 1.59it/s]
81%|████████ | 7881/9770 [1:30:37<20:10, 1.56it/s]
81%|████████ | 7882/9770 [1:30:38<20:22, 1.54it/s]
81%|████████ | 7883/9770 [1:30:38<20:07, 1.56it/s]
81%|████████ | 7884/9770 [1:30:39<20:22, 1.54it/s]
81%|████████ | 7885/9770 [1:30:40<20:20, 1.54it/s]
81%|████████ | 7886/9770 [1:30:40<20:19, 1.55it/s]
81%|████████ | 7887/9770 [1:30:41<20:18, 1.55it/s]
81%|████████ | 7888/9770 [1:30:42<20:27, 1.53it/s]
81%|████████ | 7889/9770 [1:30:42<20:44, 1.51it/s]
81%|████████ | 7890/9770 [1:30:43<20:25, 1.53it/s]
81%|████████ | 7890/9770 [1:30:43<20:25, 1.53it/s]
81%|████████ | 7891/9770 [1:30:44<2
+0: {'loss': 0.6461, 'grad_norm': 0.5757969189097039, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.81}
+0: 0:31, 1.53it/s]
81%|████████ | 7892/9770 [1:30:44<20:24, 1.53it/s]
81%|████████ | 7893/9770 [1:30:45<20:18, 1.54it/s]
81%|████████ | 7894/9770 [1:30:46<20:17, 1.54it/s]
81%|████████ | 7895/9770 [1:30:46<20:23, 1.53it/s]
81%|████████ | 7896/9770 [1:30:47<20:36, 1.52it/s]
81%|████████ | 7897/9770 [1:30:48<20:21, 1.53it/s]
81%|████████ | 7898/9770 [1:30:48<20:21, 1.53it/s]
81%|████████ | 7899/9770 [1:30:49<20:20, 1.53it/s]
81%|████████ | 7900/9770 [1:30:50<20:27, 1.52it/s]
81%|████████ | 7900/9770 [1:30:50<20:27, 1.52it/s]
81%|████████ | 7901/9770 [1:30:50<20:34, 1.51it/s]
81%|████████ | 7902/9770 [1:30:51<20:31, 1.52it/s]
81%|████████ | 7903/9770 [1:30:52<20:46, 1.50it/s]
81%|████████ | 7904/9770
+0: {'loss': 0.6505, 'grad_norm': 0.6262193887659668, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.81}
+0: [1:30:52<20:25, 1.52it/s]
81%|████████ | 7905/9770 [1:30:53<20:25, 1.52it/s]
81%|████████ | 7906/9770 [1:30:53<20:26, 1.52it/s]
81%|████████ | 7907/9770 [1:30:54<20:24, 1.52it/s]
81%|████████ | 7908/9770 [1:30:55<20:20, 1.53it/s]
81%|████████ | 7909/9770 [1:30:55<20:34, 1.51it/s]
81%|████████ | 7910/9770 [1:30:56<20:25, 1.52it/s]
81%|████████ | 7910/9770 [1:30:56<20:25, 1.52it/s]
81%|████████ | 7911/9770 [1:30:57<20:21, 1.52it/s]
81%|████████ | 7912/9770 [1:30:57<20:38, 1.50it/s]
81%|████████ | 7913/9770 [1:30:58<20:24, 1.52it/s]
81%|████████ | 7914/9770 [1:30:59<20:20, 1.52it/s]
81%|████████ | 7915/9770 [1:30:59<20:21, 1.52it/s]
81%|████████ | 7916/9770 [1:31:00<20:04, 1.54it/s]
81%|████████
+0: {'loss': 0.6481, 'grad_norm': 0.5938254866180448, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.81}
+0: | 7917/9770 [1:31:01<20:04, 1.54it/s]
81%|████████ | 7918/9770 [1:31:01<20:21, 1.52it/s]
81%|████████ | 7919/9770 [1:31:02<20:16, 1.52it/s]
81%|████████ | 7920/9770 [1:31:03<20:03, 1.54it/s]
81%|████████ | 7920/9770 [1:31:03<20:03, 1.54it/s]
81%|████████ | 7921/9770 [1:31:03<19:51, 1.55it/s]
81%|████████ | 7922/9770 [1:31:04<20:03, 1.54it/s]
81%|████████ | 7923/9770 [1:31:05<20:03, 1.53it/s]
81%|████████ | 7924/9770 [1:31:05<20:29, 1.50it/s]
81%|████████ | 7925/9770 [1:31:06<20:21, 1.51it/s]
81%|████████ | 7926/9770 [1:31:07<20:32, 1.50it/s]
81%|████████ | 7927/9770 [1:31:07<20:47, 1.48it/s]
81%|████████ | 7928/9770 [1:31:08<20:26, 1.50it/s]
81%|████████ | 7929/9770 [1:31:09<20:27, 1.50it/s]
81%|█████
+0: {'loss': 0.6498, 'grad_norm': 0.6020907690551343, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.81}
+0: {'loss': 0.6434, 'grad_norm': 0.6060348582271227, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.81}
+0: ███ | 7930/9770 [1:31:09<20:21, 1.51it/s]
81%|████████ | 7930/9770 [1:31:09<20:21, 1.51it/s]
81%|████████ | 7931/9770 [1:31:10<20:23, 1.50it/s]
81%|████████ | 7932/9770 [1:31:11<20:16, 1.51it/s]
81%|████████ | 7933/9770 [1:31:11<19:56, 1.53it/s]
81%|████████ | 7934/9770 [1:31:12<19:57, 1.53it/s]
81%|████████ | 7935/9770 [1:31:13<19:55, 1.54it/s]
81%|████████ | 7936/9770 [1:31:13<19:59, 1.53it/s]
81%|████████ | 7937/9770 [1:31:14<19:46, 1.54it/s]
81%|████████ | 7938/9770 [1:31:15<20:07, 1.52it/s]
81%|████████▏ | 7939/9770 [1:31:15<20:05, 1.52it/s]
81%|████████▏ | 7940/9770 [1:31:16<20:08, 1.51it/s]
81%|████████▏ | 7940/9770 [1:31:16<20:08, 1.51it/s]
81%|████�
+0: {'loss': 0.6425, 'grad_norm': 0.6216693844810112, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.81}
+0: ��███▏ | 7941/9770 [1:31:16<19:43, 1.55it/s]
81%|████████▏ | 7942/9770 [1:31:17<19:40, 1.55it/s]
81%|████████▏ | 7943/9770 [1:31:18<19:47, 1.54it/s]
81%|████████▏ | 7944/9770 [1:31:18<19:46, 1.54it/s]
81%|████████▏ | 7945/9770 [1:31:19<19:58, 1.52it/s]
81%|████████▏ | 7946/9770 [1:31:20<19:36, 1.55it/s]
81%|████████▏ | 7947/9770 [1:31:20<19:49, 1.53it/s]
81%|████████▏ | 7948/9770 [1:31:21<19:46, 1.54it/s]
81%|████████▏ | 7949/9770 [1:31:22<19:26, 1.56it/s]
81%|████████▏ | 7950/9770 [1:31:22<19:40, 1.54it/s]
81%|████████▏ | 7950/9770 [1:31:22<19:40, 1.54it/s]
81%|████████▏ | 7951/9770 [1:31:23<19:47, 1.53it/s]
81%|████████▏ | 7952/9770 [1:31:24<19:45, 1.53it/s]
81%|████████▏ | 7953/9770 [1:31:
+0: {'loss': 0.6635, 'grad_norm': 0.582854611367015, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.81}
+0: 24<19:41, 1.54it/s]
81%|████████▏ | 7954/9770 [1:31:25<19:48, 1.53it/s]
81%|████████▏ | 7955/9770 [1:31:26<19:43, 1.53it/s]
81%|████████▏ | 7956/9770 [1:31:26<19:30, 1.55it/s]
81%|████████▏ | 7957/9770 [1:31:27<19:30, 1.55it/s]
81%|████████▏ | 7958/9770 [1:31:28<19:39, 1.54it/s]
81%|████████▏ | 7959/9770 [1:31:28<19:41, 1.53it/s]
81%|████████▏ | 7960/9770 [1:31:29<19:19, 1.56it/s]
81%|████████▏ | 7960/9770 [1:31:29<19:19, 1.56it/s]
81%|████████▏ | 7961/9770 [1:31:29<19:24, 1.55it/s]
81%|████████▏ | 7962/9770 [1:31:30<19:50, 1.52it/s]
82%|████████▏ | 7963/9770 [1:31:31<19:46, 1.52it/s]
82%|████████▏ | 7964/9770 [1:31:31<19:48, 1.52it/s]
82%|████████▏ | 7965/9770 [1:31:32<19:30, 1.54it/s]
82%|██�
+0: {'loss': 0.6339, 'grad_norm': 0.5840065675906253, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.82}
+0: ��█████▏ | 7966/9770 [1:31:33<19:24, 1.55it/s]
82%|████████▏ | 7967/9770 [1:31:33<19:21, 1.55it/s]
82%|████████▏ | 7968/9770 [1:31:34<19:17, 1.56it/s]
82%|████████▏ | 7969/9770 [1:31:35<19:22, 1.55it/s]
82%|████████▏ | 7970/9770 [1:31:35<19:20, 1.55it/s]
82%|████████▏ | 7970/9770 [1:31:35<19:20, 1.55it/s]
82%|████████▏ | 7971/9770 [1:31:36<19:26, 1.54it/s]
82%|████████▏ | 7972/9770 [1:31:37<19:41, 1.52it/s]
82%|████████▏ | 7973/9770 [1:31:37<20:01, 1.50it/s]
82%|████████▏ | 7974/9770 [1:31:38<19:42, 1.52it/s]
82%|████████▏ | 7975/9770 [1:31:39<19:42, 1.52it/s]
82%|████████▏ | 7976/9770 [1:31:39<19:46, 1.51it/s]
82%|████████▏ | 7977/9770 [1:31:40<19:34, 1.53it/s]
82%|████████▏ | 7978/9770
+0: {'loss': 0.6368, 'grad_norm': 0.6193311371538059, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.82}
+0: [1:31:41<19:35, 1.52it/s]
82%|████████▏ | 7979/9770 [1:31:41<19:25, 1.54it/s]
82%|████████▏ | 7980/9770 [1:31:42<19:24, 1.54it/s]
82%|████████▏ | 7980/9770 [1:31:42<19:24, 1.54it/s]
82%|████████▏ | 7981/9770 [1:31:43<19:32, 1.53it/s]
82%|████████▏ | 7982/9770 [1:31:43<19:34, 1.52it/s]
82%|████████▏ | 7983/9770 [1:31:44<20:01, 1.49it/s]
82%|████████▏ | 7984/9770 [1:31:45<19:29, 1.53it/s]
82%|████████▏ | 7985/9770 [1:31:45<19:22, 1.53it/s]
82%|████████▏ | 7986/9770 [1:31:46<19:29, 1.53it/s]
82%|████████▏ | 7987/9770 [1:31:46<19:20, 1.54it/s]
82%|████████▏ | 7988/9770 [1:31:47<19:19, 1.54it/s]
82%|████████▏ | 7989/9770 [1:31:48<19:33, 1.52it/s]
82%|████████▏ | 7990/9770 [1:31:48<19:28, 1.52it/s]
+0: {'loss': 0.6741, 'grad_norm': 0.6208613350071387, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.82}
+0: {'loss': 0.6432, 'grad_norm': 0.5729237702257056, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.82}
+0:
82%|████████▏ | 7990/9770 [1:31:48<19:28, 1.52it/s]
82%|████████▏ | 7991/9770 [1:31:49<19:11, 1.55it/s]
82%|████████▏ | 7992/9770 [1:31:50<19:19, 1.53it/s]
82%|████████▏ | 7993/9770 [1:31:50<19:19, 1.53it/s]
82%|████████▏ | 7994/9770 [1:31:51<19:15, 1.54it/s]
82%|████████▏ | 7995/9770 [1:31:52<19:20, 1.53it/s]
82%|████████▏ | 7996/9770 [1:31:52<19:25, 1.52it/s]
82%|████████▏ | 7997/9770 [1:31:53<19:14, 1.54it/s]
82%|████████▏ | 7998/9770 [1:31:54<19:06, 1.55it/s]
82%|████████▏ | 7999/9770 [1:31:54<19:09, 1.54it/s]
82%|████████▏ | 8000/9770 [1:31:55<19:14, 1.53it/s]
82%|████████▏ | 8000/9770 [1:31:55<19:14, 1.53it/s]
82%|████████▏ | 8001/9770 [1:31:56<19
+0: {'loss': 0.6555, 'grad_norm': 0.6171835596278724, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.82}
+0: :20, 1.52it/s]
82%|████████▏ | 8002/9770 [1:31:56<19:31, 1.51it/s]
82%|████████▏ | 8003/9770 [1:31:57<19:19, 1.52it/s]
82%|████████▏ | 8004/9770 [1:31:58<19:03, 1.54it/s]
82%|████████▏ | 8005/9770 [1:31:58<19:07, 1.54it/s]
82%|████████▏ | 8006/9770 [1:31:59<19:04, 1.54it/s]
82%|████████▏ | 8007/9770 [1:32:00<19:04, 1.54it/s]
82%|████████▏ | 8008/9770 [1:32:00<19:08, 1.53it/s]
82%|████████▏ | 8009/9770 [1:32:01<19:03, 1.54it/s]
82%|████████▏ | 8010/9770 [1:32:01<19:04, 1.54it/s]
82%|████████▏ | 8010/9770 [1:32:01<19:04, 1.54it/s]
82%|████████▏ | 8011/9770 [1:32:02<19:12, 1.53it/s]
82%|████████▏ | 8012/9770 [1:32:03<19:12, 1.53it/s]
82%|████████▏ | 8013/9770 [1:32:04<19:36, 1.49it/s]
82%|████
+0: {'loss': 0.6548, 'grad_norm': 0.6677592893497909, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.82}
+0: ████▏ | 8014/9770 [1:32:04<19:47, 1.48it/s]
82%|████████▏ | 8015/9770 [1:32:05<19:40, 1.49it/s]
82%|████████▏ | 8016/9770 [1:32:06<19:37, 1.49it/s]
82%|████████▏ | 8017/9770 [1:32:06<19:23, 1.51it/s]
82%|████████▏ | 8018/9770 [1:32:07<19:23, 1.51it/s]
82%|████████▏ | 8019/9770 [1:32:08<19:35, 1.49it/s]
82%|████████▏ | 8020/9770 [1:32:08<19:38, 1.48it/s]
82%|████████▏ | 8020/9770 [1:32:08<19:38, 1.48it/s]
82%|████████▏ | 8021/9770 [1:32:09<19:31, 1.49it/s]
82%|████████▏ | 8022/9770 [1:32:10<19:25, 1.50it/s]
82%|████████▏ | 8023/9770 [1:32:10<19:10, 1.52it/s]
82%|████████▏ | 8024/9770 [1:32:11<19:07, 1.52it/s]
82%|████████▏ | 8025/9770 [1:32:11<19:07, 1.52it/s]
82%|████████▏ | 8026/9770 [1:32
+0: {'loss': 0.6447, 'grad_norm': 0.5980020406673359, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.82}
+0: :12<19:23, 1.50it/s]
82%|████████▏ | 8027/9770 [1:32:13<19:10, 1.51it/s]
82%|████████▏ | 8028/9770 [1:32:13<19:09, 1.52it/s]
82%|████████▏ | 8029/9770 [1:32:14<19:29, 1.49it/s]
82%|████████▏ | 8030/9770 [1:32:15<19:33, 1.48it/s]
82%|████████▏ | 8030/9770 [1:32:15<19:33, 1.48it/s]
82%|████████▏ | 8031/9770 [1:32:16<19:17, 1.50it/s]
82%|████████▏ | 8032/9770 [1:32:16<19:23, 1.49it/s]
82%|████████▏ | 8033/9770 [1:32:17<19:03, 1.52it/s]
82%|████████▏ | 8034/9770 [1:32:17<18:53, 1.53it/s]
82%|████████▏ | 8035/9770 [1:32:18<18:37, 1.55it/s]
82%|████████▏ | 8036/9770 [1:32:19<18:43, 1.54it/s]
82%|████████▏ | 8037/9770 [1:32:19<18:47, 1.54it/s]
82%|████████▏ | 8038/9770 [1:32:20<18:45, 1.54it/s]
82%|██
+0: {'loss': 0.6621, 'grad_norm': 0.6144292215970258, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.82}
+0: ██████▏ | 8039/9770 [1:32:21<18:45, 1.54it/s]
82%|████████▏ | 8040/9770 [1:32:21<18:45, 1.54it/s]
82%|████████▏ | 8040/9770 [1:32:21<18:45, 1.54it/s]
82%|████████▏ | 8041/9770 [1:32:22<18:48, 1.53it/s]
82%|████████▏ | 8042/9770 [1:32:23<18:46, 1.53it/s]
82%|████████▏ | 8043/9770 [1:32:23<18:28, 1.56it/s]
82%|████████▏ | 8044/9770 [1:32:24<18:24, 1.56it/s]
82%|████████▏ | 8045/9770 [1:32:25<18:26, 1.56it/s]
82%|████████▏ | 8046/9770 [1:32:25<18:30, 1.55it/s]
82%|████████▏ | 8047/9770 [1:32:26<18:35, 1.54it/s]
82%|████████▏ | 8048/9770 [1:32:27<18:41, 1.54it/s]
82%|████████▏ | 8049/9770 [1:32:27<18:49, 1.52it/s]
82%|████████▏ | 8050/9770 [1:32:28<18:51, 1.52it/s]
+0: {'loss': 0.6477, 'grad_norm': 0.588304783120434, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.82}
+0: {'loss': 0.6428, 'grad_norm': 0.6089214420843809, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.82}
+0:
82%|████████▏ | 8050/9770 [1:32:28<18:51, 1.52it/s]
82%|████████▏ | 8051/9770 [1:32:28<18:33, 1.54it/s]
82%|████████▏ | 8052/9770 [1:32:29<18:16, 1.57it/s]
82%|████████▏ | 8053/9770 [1:32:30<18:38, 1.53it/s]
82%|████████▏ | 8054/9770 [1:32:30<18:43, 1.53it/s]
82%|████████▏ | 8055/9770 [1:32:31<18:43, 1.53it/s]
82%|████████▏ | 8056/9770 [1:32:32<18:52, 1.51it/s]
82%|████████▏ | 8057/9770 [1:32:32<18:50, 1.52it/s]
82%|████████▏ | 8058/9770 [1:32:33<18:47, 1.52it/s]
82%|████████▏ | 8059/9770 [1:32:34<18:46, 1.52it/s]
82%|████████▏ | 8060/9770 [1:32:34<19:13, 1.48it/s]
82%|████████▏ | 8060/9770 [1:32:34<19:13, 1.48it/s]
83%|████████▎ | 8061/9770 [1:32:35<18:47, 1.52it/s]
83%|█████�
+0: {'loss': 0.6646, 'grad_norm': 0.6159007486217206, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.83}
+0: �██▎ | 8062/9770 [1:32:36<18:45, 1.52it/s]
83%|████████▎ | 8063/9770 [1:32:36<18:39, 1.52it/s]
83%|████████▎ | 8064/9770 [1:32:37<18:42, 1.52it/s]
83%|████████▎ | 8065/9770 [1:32:38<18:44, 1.52it/s]
83%|████████▎ | 8066/9770 [1:32:38<18:32, 1.53it/s]
83%|████████▎ | 8067/9770 [1:32:39<18:29, 1.54it/s]
83%|████████▎ | 8068/9770 [1:32:40<18:52, 1.50it/s]
83%|████████▎ | 8069/9770 [1:32:40<18:42, 1.51it/s]
83%|████████▎ | 8070/9770 [1:32:41<18:37, 1.52it/s]
83%|████████▎ | 8070/9770 [1:32:41<18:37, 1.52it/s]
83%|████████▎ | 8071/9770 [1:32:42<18:32, 1.53it/s]
83%|████████▎ | 8072/9770 [1:32:42<18:28, 1.53it/s]
83%|████████▎ | 8073/9770 [1:32:43<18:45, 1.51it/s]
83%|████████▎ | 8074/9770 [1:32:44<1
+0: {'loss': 0.6649, 'grad_norm': 0.6164654085364054, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.83}
+0: 8:38, 1.52it/s]
83%|████████▎ | 8075/9770 [1:32:44<18:36, 1.52it/s]
83%|████████▎ | 8076/9770 [1:32:45<18:37, 1.52it/s]
83%|████████▎ | 8077/9770 [1:32:46<18:31, 1.52it/s]
83%|████████▎ | 8078/9770 [1:32:46<18:38, 1.51it/s]
83%|████████▎ | 8079/9770 [1:32:47<18:24, 1.53it/s]
83%|████████▎ | 8080/9770 [1:32:48<18:33, 1.52it/s]
83%|████████▎ | 8080/9770 [1:32:48<18:33, 1.52it/s]
83%|████████▎ | 8081/9770 [1:32:48<18:30, 1.52it/s]
83%|████████▎ | 8082/9770 [1:32:49<18:31, 1.52it/s]
83%|████████▎ | 8083/9770 [1:32:50<18:12, 1.54it/s]
83%|████████▎ | 8084/9770 [1:32:50<18:21, 1.53it/s]
83%|████████▎ | 8085/9770 [1:32:51<18:18, 1.53it/s]
83%|████████▎ | 8086/9770 [1:32:51<18:25, 1.52it/s]
83%|███�
+0: {'loss': 0.6418, 'grad_norm': 0.6094421314964482, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.83}
+0: �████▎ | 8087/9770 [1:32:52<18:08, 1.55it/s]
83%|████████▎ | 8088/9770 [1:32:53<18:06, 1.55it/s]
83%|████████▎ | 8089/9770 [1:32:53<17:56, 1.56it/s]
83%|████████▎ | 8090/9770 [1:32:54<18:02, 1.55it/s]
83%|████████▎ | 8090/9770 [1:32:54<18:02, 1.55it/s]
83%|████████▎ | 8091/9770 [1:32:55<18:04, 1.55it/s]
83%|████████▎ | 8092/9770 [1:32:55<18:05, 1.55it/s]
83%|████████▎ | 8093/9770 [1:32:56<18:04, 1.55it/s]
83%|████████▎ | 8094/9770 [1:32:57<18:09, 1.54it/s]
83%|████████▎ | 8095/9770 [1:32:57<18:17, 1.53it/s]
83%|████████▎ | 8096/9770 [1:32:58<18:15, 1.53it/s]
83%|████████▎ | 8097/9770 [1:32:59<18:00, 1.55it/s]
83%|████████▎ | 8098/9770 [1:32:59<17:58, 1.55it/s]
83%|████████▎ | 8099/9770 [1:3
+0: {'loss': 0.656, 'grad_norm': 0.60547380148355, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.83}
+0: {'loss': 0.6508, 'grad_norm': 0.5841923562627888, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.83}
+0: 3:00<17:50, 1.56it/s]
83%|████████▎ | 8100/9770 [1:33:01<17:58, 1.55it/s]
83%|████████▎ | 8100/9770 [1:33:01<17:58, 1.55it/s]
83%|████████▎ | 8101/9770 [1:33:01<18:24, 1.51it/s]
83%|████████▎ | 8102/9770 [1:33:02<18:46, 1.48it/s]
83%|████████▎ | 8103/9770 [1:33:03<18:57, 1.47it/s]
83%|████████▎ | 8104/9770 [1:33:03<18:33, 1.50it/s]
83%|████████▎ | 8105/9770 [1:33:04<18:11, 1.53it/s]
83%|████████▎ | 8106/9770 [1:33:05<18:01, 1.54it/s]
83%|████████▎ | 8107/9770 [1:33:05<17:57, 1.54it/s]
83%|████████▎ | 8108/9770 [1:33:06<17:39, 1.57it/s]
83%|████████▎ | 8109/9770 [1:33:06<17:48, 1.55it/s]
83%|████████▎ | 8110/9770 [1:33:07<17:52, 1.55it/s]
83%|███████�
+0: {'loss': 0.6696, 'grad_norm': 0.5854473509277579, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.83}
+0: ��▎ | 8110/9770 [1:33:07<17:52, 1.55it/s]
83%|████████▎ | 8111/9770 [1:33:08<18:01, 1.53it/s]
83%|████████▎ | 8112/9770 [1:33:08<17:59, 1.54it/s]
83%|████████▎ | 8113/9770 [1:33:09<18:03, 1.53it/s]
83%|████████▎ | 8114/9770 [1:33:10<17:54, 1.54it/s]
83%|████████▎ | 8115/9770 [1:33:10<18:00, 1.53it/s]
83%|████████▎ | 8116/9770 [1:33:11<17:47, 1.55it/s]
83%|████████▎ | 8117/9770 [1:33:12<17:38, 1.56it/s]
83%|████████▎ | 8118/9770 [1:33:12<17:40, 1.56it/s]
83%|████████▎ | 8119/9770 [1:33:13<17:30, 1.57it/s]
83%|████████▎ | 8120/9770 [1:33:14<17:33, 1.57it/s]
83%|████████▎ | 8120/9770 [1:33:14<17:33, 1.57it/s]
83%|████████▎ | 8121/9770 [1:33:14<17:44, 1.55it/s]
83%|████████▎ | 8122/9770 [1:33:15<17:53,
+0: {'loss': 0.6508, 'grad_norm': 0.6101386612387996, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.83}
+0: 1.53it/s]
83%|████████▎ | 8123/9770 [1:33:16<17:53, 1.53it/s]
83%|████████▎ | 8124/9770 [1:33:16<18:06, 1.52it/s]
83%|████████▎ | 8125/9770 [1:33:17<17:50, 1.54it/s]
83%|████████▎ | 8126/9770 [1:33:17<17:50, 1.54it/s]
83%|████████▎ | 8127/9770 [1:33:18<17:40, 1.55it/s]
83%|████████▎ | 8128/9770 [1:33:19<17:30, 1.56it/s]
83%|████████▎ | 8129/9770 [1:33:19<17:33, 1.56it/s]
83%|████████▎ | 8130/9770 [1:33:20<17:28, 1.56it/s]
83%|████████▎ | 8130/9770 [1:33:20<17:28, 1.56it/s]
83%|████████▎ | 8131/9770 [1:33:21<17:43, 1.54it/s]
83%|████████▎ | 8132/9770 [1:33:21<18:09, 1.50it/s]
83%|████████▎ | 8133/9770 [1:33:22<18:02, 1.51it/s]
83%|████████▎ | 8134/9770 [1:33:23<18:00, 1.51it/s]
83%|█████�
+0: {'loss': 0.6607, 'grad_norm': 0.5825941876726741, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.83}
+0: ��██▎ | 8135/9770 [1:33:23<17:52, 1.52it/s]
83%|████████▎ | 8136/9770 [1:33:24<17:49, 1.53it/s]
83%|████████▎ | 8137/9770 [1:33:25<17:39, 1.54it/s]
83%|████████▎ | 8138/9770 [1:33:25<17:33, 1.55it/s]
83%|████████▎ | 8139/9770 [1:33:26<17:47, 1.53it/s]
83%|████████▎ | 8140/9770 [1:33:27<17:37, 1.54it/s]
83%|████████▎ | 8140/9770 [1:33:27<17:37, 1.54it/s]
83%|████████▎ | 8141/9770 [1:33:27<17:39, 1.54it/s]
83%|████████▎ | 8142/9770 [1:33:28<17:48, 1.52it/s]
83%|████████▎ | 8143/9770 [1:33:29<17:44, 1.53it/s]
83%|████████▎ | 8144/9770 [1:33:29<17:36, 1.54it/s]
83%|████████▎ | 8145/9770 [1:33:30<17:34, 1.54it/s]
83%|████████▎ | 8146/9770 [1:33:30<17:29, 1.55it/s]
83%|████████▎ | 8147/9770 [1:33:31<
+0: {'loss': 0.643, 'grad_norm': 0.5545415505236951, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.83}
+0: 17:32, 1.54it/s]
83%|████████▎ | 8148/9770 [1:33:32<17:24, 1.55it/s]
83%|████████▎ | 8149/9770 [1:33:32<17:26, 1.55it/s]
83%|████████▎ | 8150/9770 [1:33:33<17:58, 1.50it/s]
83%|████████▎ | 8150/9770 [1:33:33<17:58, 1.50it/s]
83%|████████▎ | 8151/9770 [1:33:34<18:07, 1.49it/s]
83%|████████▎ | 8152/9770 [1:33:34<17:55, 1.50it/s]
83%|████████▎ | 8153/9770 [1:33:35<17:52, 1.51it/s]
83%|████████▎ | 8154/9770 [1:33:36<17:36, 1.53it/s]
83%|████████▎ | 8155/9770 [1:33:36<17:38, 1.53it/s]
83%|████████▎ | 8156/9770 [1:33:37<17:33, 1.53it/s]
83%|████████▎ | 8157/9770 [1:33:38<17:43, 1.52it/s]
84%|████████▎ | 8158/9770 [1:33:38<17:40, 1.52it/s]
84%|████████▎ | 8159/9770 [1:33:39<17:55, 1.50it/s]
84%|███�
+0: {'loss': 0.6298, 'grad_norm': 0.5720221603385147, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.83}
+0: {'loss': 0.6508, 'grad_norm': 0.5954875717516195, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.84}
+0: ��████▎ | 8160/9770 [1:33:40<18:02, 1.49it/s]
84%|████████▎ | 8160/9770 [1:33:40<18:02, 1.49it/s]
84%|████████▎ | 8161/9770 [1:33:40<18:04, 1.48it/s]
84%|████████▎ | 8162/9770 [1:33:41<17:50, 1.50it/s]
84%|████████▎ | 8163/9770 [1:33:42<17:43, 1.51it/s]
84%|████████▎ | 8164/9770 [1:33:42<17:41, 1.51it/s]
84%|████████▎ | 8165/9770 [1:33:43<17:53, 1.49it/s]
84%|████████▎ | 8166/9770 [1:33:44<18:02, 1.48it/s]
84%|████████▎ | 8167/9770 [1:33:44<18:00, 1.48it/s]
84%|████████▎ | 8168/9770 [1:33:45<17:53, 1.49it/s]
84%|████████▎ | 8169/9770 [1:33:46<17:42, 1.51it/s]
84%|████████▎ | 8170/9770 [1:33:46<17:48, 1.50it/s]
84%|████████▎ | 8170/9770 [1:33:46<17:48, 1.5
+0: {'loss': 0.6379, 'grad_norm': 0.5937285383603257, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.84}
+0: 0it/s]
84%|████████▎ | 8171/9770 [1:33:47<17:28, 1.52it/s]
84%|████████▎ | 8172/9770 [1:33:48<17:48, 1.50it/s]
84%|████████▎ | 8173/9770 [1:33:48<17:45, 1.50it/s]
84%|████████▎ | 8174/9770 [1:33:49<17:39, 1.51it/s]
84%|████████▎ | 8175/9770 [1:33:50<17:48, 1.49it/s]
84%|████████▎ | 8176/9770 [1:33:50<17:36, 1.51it/s]
84%|████████▎ | 8177/9770 [1:33:51<17:22, 1.53it/s]
84%|████████▎ | 8178/9770 [1:33:52<17:18, 1.53it/s]
84%|████████▎ | 8179/9770 [1:33:52<17:42, 1.50it/s]
84%|████████▎ | 8180/9770 [1:33:53<17:32, 1.51it/s]
84%|████████▎ | 8180/9770 [1:33:53<17:32, 1.51it/s]
84%|████████▎ | 8181/9770 [1:33:54<17:15, 1.53it/s]
84%|████████▎ | 8182/9770 [1:33:54<17:14, 1.53it/s]
84%|███████
+0: {'loss': 0.651, 'grad_norm': 0.6353387079587396, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.84}
+0: █▍ | 8183/9770 [1:33:55<17:02, 1.55it/s]
84%|████████▍ | 8184/9770 [1:33:56<17:02, 1.55it/s]
84%|████████▍ | 8185/9770 [1:33:56<17:16, 1.53it/s]
84%|████████▍ | 8186/9770 [1:33:57<17:16, 1.53it/s]
84%|████████▍ | 8187/9770 [1:33:58<17:05, 1.54it/s]
84%|████████▍ | 8188/9770 [1:33:58<17:24, 1.51it/s]
84%|████████▍ | 8189/9770 [1:33:59<17:30, 1.50it/s]
84%|████████▍ | 8190/9770 [1:34:00<17:41, 1.49it/s]
84%|████████▍ | 8190/9770 [1:34:00<17:41, 1.49it/s]
84%|████████▍ | 8191/9770 [1:34:00<18:00, 1.46it/s]
84%|████████▍ | 8192/9770 [1:34:01<17:28, 1.50it/s]
84%|████████▍ | 8193/9770 [1:34:02<17:16, 1.52it/s]
84%|████████▍ | 8194/9770 [1:34:02<17:09, 1.53it/s]
84%|████████▍ | 8195/9770 [1:34:03<17:30
+0: {'loss': 0.6311, 'grad_norm': 0.5549069972711465, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.84}
+0: , 1.50it/s]
84%|████████▍ | 8196/9770 [1:34:04<17:19, 1.51it/s]
84%|████████▍ | 8197/9770 [1:34:04<17:15, 1.52it/s]
84%|████████▍ | 8198/9770 [1:34:05<17:30, 1.50it/s]
84%|████████▍ | 8199/9770 [1:34:06<17:18, 1.51it/s]
84%|████████▍ | 8200/9770 [1:34:06<17:33, 1.49it/s]
84%|████████▍ | 8200/9770 [1:34:06<17:33, 1.49it/s]
84%|████████▍ | 8201/9770 [1:34:07<17:19, 1.51it/s]
84%|████████▍ | 8202/9770 [1:34:08<17:11, 1.52it/s]
84%|████████▍ | 8203/9770 [1:34:08<16:57, 1.54it/s]
84%|████████▍ | 8204/9770 [1:34:09<16:44, 1.56it/s]
84%|████████▍ | 8205/9770 [1:34:09<16:51, 1.55it/s]
84%|████████▍ | 8206/9770 [1:34:10<16:50, 1.55it/s]
84%|████████▍ | 8207/9770 [1:34:11<16:54, 1.54it/s]
84%|█████
+0: {'loss': 0.6434, 'grad_norm': 0.5836313442357334, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.84}
+0: ███▍ | 8208/9770 [1:34:11<16:55, 1.54it/s]
84%|████████▍ | 8209/9770 [1:34:12<17:01, 1.53it/s]
84%|████████▍ | 8210/9770 [1:34:13<16:44, 1.55it/s]
84%|████████▍ | 8210/9770 [1:34:13<16:44, 1.55it/s]
84%|████████▍ | 8211/9770 [1:34:13<16:53, 1.54it/s]
84%|████████▍ | 8212/9770 [1:34:14<16:42, 1.55it/s]
84%|████████▍ | 8213/9770 [1:34:15<16:44, 1.55it/s]
84%|████████▍ | 8214/9770 [1:34:15<16:41, 1.55it/s]
84%|████████▍ | 8215/9770 [1:34:16<17:12, 1.51it/s]
84%|████████▍ | 8216/9770 [1:34:17<16:52, 1.53it/s]
84%|████████▍ | 8217/9770 [1:34:17<16:50, 1.54it/s]
84%|████████▍ | 8218/9770 [1:34:18<16:43, 1.55it/s]
84%|████████▍ | 8219/9770 [1:34:19<16:46, 1.54it/s]
84%|████████▍ | 8220/9770 [1:34:19
+0: {'loss': 0.6591, 'grad_norm': 0.5948877497922387, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.84}
+0: {'loss': 0.6424, 'grad_norm': 0.5763685160472533, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.84}
+0: <16:47, 1.54it/s]
84%|████████▍ | 8220/9770 [1:34:19<16:47, 1.54it/s]
84%|████████▍ | 8221/9770 [1:34:20<16:51, 1.53it/s]
84%|████████▍ | 8222/9770 [1:34:21<16:51, 1.53it/s]
84%|████████▍ | 8223/9770 [1:34:21<16:50, 1.53it/s]
84%|████████▍ | 8224/9770 [1:34:22<16:53, 1.53it/s]
84%|████████▍ | 8225/9770 [1:34:22<16:46, 1.54it/s]
84%|████████▍ | 8226/9770 [1:34:23<17:09, 1.50it/s]
84%|████████▍ | 8227/9770 [1:34:24<16:48, 1.53it/s]
84%|████████▍ | 8228/9770 [1:34:24<16:55, 1.52it/s]
84%|████████▍ | 8229/9770 [1:34:25<16:49, 1.53it/s]
84%|████████▍ | 8230/9770 [1:34:26<16:38, 1.54it/s]
84%|████████▍ | 8230/9770 [1:34:26<16:38, 1.54it/s]
84%|████████�
+0: {'loss': 0.6373, 'grad_norm': 0.6056688816769327, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.84}
+0: � | 8231/9770 [1:34:26<16:46, 1.53it/s]
84%|████████▍ | 8232/9770 [1:34:27<16:43, 1.53it/s]
84%|████████▍ | 8233/9770 [1:34:28<16:45, 1.53it/s]
84%|████████▍ | 8234/9770 [1:34:28<16:44, 1.53it/s]
84%|████████▍ | 8235/9770 [1:34:29<16:34, 1.54it/s]
84%|████████▍ | 8236/9770 [1:34:30<16:49, 1.52it/s]
84%|████████▍ | 8237/9770 [1:34:30<16:39, 1.53it/s]
84%|████████▍ | 8238/9770 [1:34:31<17:04, 1.50it/s]
84%|████████▍ | 8239/9770 [1:34:32<16:49, 1.52it/s]
84%|████████▍ | 8240/9770 [1:34:32<16:49, 1.52it/s]
84%|████████▍ | 8240/9770 [1:34:32<16:49, 1.52it/s]
84%|████████▍ | 8241/9770 [1:34:33<16:43, 1.52it/s]
84%|████████▍ | 8242/9770 [1:34:34<16:49, 1.51it/s]
84%|████████▍ | 8243/9770 [1:34:34<16:45, 1.
+0: {'loss': 0.6457, 'grad_norm': 0.6003665729718766, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.84}
+0: 52it/s]
84%|████████▍ | 8244/9770 [1:34:35<16:41, 1.52it/s]
84%|████████▍ | 8245/9770 [1:34:36<16:39, 1.53it/s]
84%|████████▍ | 8246/9770 [1:34:36<16:37, 1.53it/s]
84%|████████▍ | 8247/9770 [1:34:37<16:35, 1.53it/s]
84%|████████▍ | 8248/9770 [1:34:38<16:30, 1.54it/s]
84%|████████▍ | 8249/9770 [1:34:38<16:57, 1.50it/s]
84%|████████▍ | 8250/9770 [1:34:39<16:40, 1.52it/s]
84%|████████▍ | 8250/9770 [1:34:39<16:40, 1.52it/s]
84%|████████▍ | 8251/9770 [1:34:40<16:31, 1.53it/s]
84%|████████▍ | 8252/9770 [1:34:40<16:20, 1.55it/s]
84%|████████▍ | 8253/9770 [1:34:41<16:27, 1.54it/s]
84%|████████▍ | 8254/9770 [1:34:41<16:32, 1.53it/s]
84%|████████▍ | 8255/9770 [1:34:42<16:36, 1.52it/s]
85%|██████�
+0: {'loss': 0.6338, 'grad_norm': 0.607244218041695, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.85}
+0: �█▍ | 8256/9770 [1:34:43<16:19, 1.55it/s]
85%|████████▍ | 8257/9770 [1:34:43<16:20, 1.54it/s]
85%|████████▍ | 8258/9770 [1:34:44<16:26, 1.53it/s]
85%|████████▍ | 8259/9770 [1:34:45<16:27, 1.53it/s]
85%|████████▍ | 8260/9770 [1:34:45<16:25, 1.53it/s]
85%|████████▍ | 8260/9770 [1:34:45<16:25, 1.53it/s]
85%|████████▍ | 8261/9770 [1:34:46<16:18, 1.54it/s]
85%|████████▍ | 8262/9770 [1:34:47<16:14, 1.55it/s]
85%|████████▍ | 8263/9770 [1:34:47<16:12, 1.55it/s]
85%|████████▍ | 8264/9770 [1:34:48<16:18, 1.54it/s]
85%|████████▍ | 8265/9770 [1:34:49<16:05, 1.56it/s]
85%|████████▍ | 8266/9770 [1:34:49<16:00, 1.57it/s]
85%|████████▍ | 8267/9770 [1:34:50<16:05, 1.56it/s]
85%|████████▍ | 8268/9770 [1:34:51<16:0
+0: {'loss': 0.644, 'grad_norm': 0.6186205128490853, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.85}
+0: 7, 1.55it/s]
85%|████████▍ | 8269/9770 [1:34:51<16:00, 1.56it/s]
85%|████████▍ | 8270/9770 [1:34:52<16:04, 1.56it/s]
85%|████████▍ | 8270/9770 [1:34:52<16:04, 1.56it/s]
85%|████████▍ | 8271/9770 [1:34:52<16:11, 1.54it/s]
85%|████████▍ | 8272/9770 [1:34:53<16:32, 1.51it/s]
85%|████████▍ | 8273/9770 [1:34:54<16:25, 1.52it/s]
85%|████████▍ | 8274/9770 [1:34:54<16:33, 1.51it/s]
85%|████████▍ | 8275/9770 [1:34:55<16:26, 1.52it/s]
85%|████████▍ | 8276/9770 [1:34:56<16:28, 1.51it/s]
85%|████████▍ | 8277/9770 [1:34:56<16:23, 1.52it/s]
85%|████████▍ | 8278/9770 [1:34:57<16:19, 1.52it/s]
85%|████████▍ | 8279/9770 [1:34:58<16:14, 1.53it/s]
85%|████████▍ | 8280/9770 [1:34:58<16:11, 1.53it/s]
+0: {'loss': 0.6577, 'grad_norm': 0.6554491228832788, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.85}
+0: {'loss': 0.6665, 'grad_norm': 0.596526928701001, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.85}
+0:
85%|████████▍ | 8280/9770 [1:34:58<16:11, 1.53it/s]
85%|████████▍ | 8281/9770 [1:34:59<16:05, 1.54it/s]
85%|████████▍ | 8282/9770 [1:35:00<16:07, 1.54it/s]
85%|████████▍ | 8283/9770 [1:35:00<15:58, 1.55it/s]
85%|████████▍ | 8284/9770 [1:35:01<16:13, 1.53it/s]
85%|████████▍ | 8285/9770 [1:35:02<16:19, 1.52it/s]
85%|████████▍ | 8286/9770 [1:35:02<16:15, 1.52it/s]
85%|████████▍ | 8287/9770 [1:35:03<16:07, 1.53it/s]
85%|████████▍ | 8288/9770 [1:35:04<16:06, 1.53it/s]
85%|████████▍ | 8289/9770 [1:35:04<16:01, 1.54it/s]
85%|████████▍ | 8290/9770 [1:35:05<16:02, 1.54it/s]
85%|████████▍ | 8290/9770 [1:35:05<16:02, 1.54it/s]
85%|████████▍ | 8291/9770 [1:35:06<15:56, 1.55it/
+0: {'loss': 0.6173, 'grad_norm': 0.5895966949035891, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.85}
+0: s]
85%|████████▍ | 8292/9770 [1:35:06<15:57, 1.54it/s]
85%|████████▍ | 8293/9770 [1:35:07<16:07, 1.53it/s]
85%|████████▍ | 8294/9770 [1:35:08<16:13, 1.52it/s]
85%|████████▍ | 8295/9770 [1:35:08<16:05, 1.53it/s]
85%|████████▍ | 8296/9770 [1:35:09<16:06, 1.52it/s]
85%|████████▍ | 8297/9770 [1:35:10<16:03, 1.53it/s]
85%|████████▍ | 8298/9770 [1:35:10<15:59, 1.53it/s]
85%|████████▍ | 8299/9770 [1:35:11<16:01, 1.53it/s]
85%|████████▍ | 8300/9770 [1:35:11<15:56, 1.54it/s]
85%|████████▍ | 8300/9770 [1:35:11<15:56, 1.54it/s]
85%|████████▍ | 8301/9770 [1:35:12<16:20, 1.50it/s]
85%|████████▍ | 8302/9770 [1:35:13<16:13, 1.51it/s]
85%|████████▍ | 8303/9770 [1:35:13<16:07, 1.52it/s]
85%|████████�
+0: {'loss': 0.639, 'grad_norm': 0.598837078887474, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.85}
+0: �� | 8304/9770 [1:35:14<15:56, 1.53it/s]
85%|████████▌ | 8305/9770 [1:35:15<15:53, 1.54it/s]
85%|████████▌ | 8306/9770 [1:35:15<15:50, 1.54it/s]
85%|████████▌ | 8307/9770 [1:35:16<15:41, 1.55it/s]
85%|████████▌ | 8308/9770 [1:35:17<15:41, 1.55it/s]
85%|████████▌ | 8309/9770 [1:35:17<15:36, 1.56it/s]
85%|████████▌ | 8310/9770 [1:35:18<15:44, 1.55it/s]
85%|████████▌ | 8310/9770 [1:35:18<15:44, 1.55it/s]
85%|████████▌ | 8311/9770 [1:35:19<15:47, 1.54it/s]
85%|████████▌ | 8312/9770 [1:35:19<15:41, 1.55it/s]
85%|████████▌ | 8313/9770 [1:35:20<15:40, 1.55it/s]
85%|████████▌ | 8314/9770 [1:35:21<15:44, 1.54it/s]
85%|████████▌ | 8315/9770 [1:35:21<15:37, 1.55it/s]
85%|████████▌ | 8316/9770 [1:35:22<15:46, 1
+0: {'loss': 0.6523, 'grad_norm': 0.6128223636084873, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.85}
+0: .54it/s]
85%|████████▌ | 8317/9770 [1:35:23<15:49, 1.53it/s]
85%|████████▌ | 8318/9770 [1:35:23<15:52, 1.52it/s]
85%|████████▌ | 8319/9770 [1:35:24<15:52, 1.52it/s]
85%|████████▌ | 8320/9770 [1:35:24<15:45, 1.53it/s]
85%|████████▌ | 8320/9770 [1:35:24<15:45, 1.53it/s]
85%|████████▌ | 8321/9770 [1:35:25<15:41, 1.54it/s]
85%|████████▌ | 8322/9770 [1:35:26<15:32, 1.55it/s]
85%|████████▌ | 8323/9770 [1:35:26<15:34, 1.55it/s]
85%|████████▌ | 8324/9770 [1:35:27<15:32, 1.55it/s]
85%|████████▌ | 8325/9770 [1:35:28<15:32, 1.55it/s]
85%|████████▌ | 8326/9770 [1:35:28<15:33, 1.55it/s]
85%|████████▌ | 8327/9770 [1:35:29<15:25, 1.56it/s]
85%|████████▌ | 8328/9770 [1:35:30<15:21, 1.57it/s]
85%|██████�
+0: {'loss': 0.6527, 'grad_norm': 0.598424214726235, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.85}
+0: {'loss': 0.6397, 'grad_norm': 0.5988903949903229, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.85}
+0: ��█▌ | 8329/9770 [1:35:30<15:27, 1.55it/s]
85%|████████▌ | 8330/9770 [1:35:31<15:21, 1.56it/s]
85%|████████▌ | 8330/9770 [1:35:31<15:21, 1.56it/s]
85%|████████▌ | 8331/9770 [1:35:32<15:29, 1.55it/s]
85%|████████▌ | 8332/9770 [1:35:32<15:22, 1.56it/s]
85%|████████▌ | 8333/9770 [1:35:33<15:32, 1.54it/s]
85%|████████▌ | 8334/9770 [1:35:34<15:37, 1.53it/s]
85%|████████▌ | 8335/9770 [1:35:34<15:33, 1.54it/s]
85%|████████▌ | 8336/9770 [1:35:35<15:31, 1.54it/s]
85%|████████▌ | 8337/9770 [1:35:35<15:31, 1.54it/s]
85%|████████▌ | 8338/9770 [1:35:36<15:53, 1.50it/s]
85%|████████▌ | 8339/9770 [1:35:37<15:46, 1.51it/s]
85%|████████▌ | 8340/9770 [1:35:37<15:42, 1.52it/s]
8
+0: {'loss': 0.6412, 'grad_norm': 0.5483067057822244, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.85}
+0: 5%|████████▌ | 8340/9770 [1:35:37<15:42, 1.52it/s]
85%|████████▌ | 8341/9770 [1:35:38<15:39, 1.52it/s]
85%|████████▌ | 8342/9770 [1:35:39<15:33, 1.53it/s]
85%|████████▌ | 8343/9770 [1:35:39<15:22, 1.55it/s]
85%|████████▌ | 8344/9770 [1:35:40<15:22, 1.55it/s]
85%|████████▌ | 8345/9770 [1:35:41<15:42, 1.51it/s]
85%|████████▌ | 8346/9770 [1:35:41<15:40, 1.51it/s]
85%|████████▌ | 8347/9770 [1:35:42<15:31, 1.53it/s]
85%|████████▌ | 8348/9770 [1:35:43<15:32, 1.52it/s]
85%|████████▌ | 8349/9770 [1:35:43<15:25, 1.53it/s]
85%|████████▌ | 8350/9770 [1:35:44<15:30, 1.53it/s]
85%|████████▌ | 8350/9770 [1:35:44<15:30, 1.53it/s]
85%|████████▌ | 8351/9770 [1:35:45<15:20, 1.54it/s]
85%|████████▌ |
+0: {'loss': 0.6421, 'grad_norm': 0.6657337970824535, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.86}
+0: 8352/9770 [1:35:45<15:20, 1.54it/s]
85%|████████▌ | 8353/9770 [1:35:46<15:13, 1.55it/s]
86%|████████▌ | 8354/9770 [1:35:47<15:15, 1.55it/s]
86%|████████▌ | 8355/9770 [1:35:47<15:11, 1.55it/s]
86%|████████▌ | 8356/9770 [1:35:48<15:12, 1.55it/s]
86%|████████▌ | 8357/9770 [1:35:48<15:12, 1.55it/s]
86%|████████▌ | 8358/9770 [1:35:49<15:16, 1.54it/s]
86%|████████▌ | 8359/9770 [1:35:50<15:08, 1.55it/s]
86%|████████▌ | 8360/9770 [1:35:50<15:04, 1.56it/s]
86%|████████▌ | 8360/9770 [1:35:50<15:04, 1.56it/s]
86%|████████▌ | 8361/9770 [1:35:51<15:10, 1.55it/s]
86%|████████▌ | 8362/9770 [1:35:52<15:17, 1.54it/s]
86%|████████▌ | 8363/9770 [1:35:52<15:17, 1.53it/s]
86%|████████▌ | 8364/9770 [1:35:53<15:18, 1.53it
+0: {'loss': 0.6536, 'grad_norm': 0.5832815917266596, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.86}
+0: /s]
86%|████████▌ | 8365/9770 [1:35:54<15:00, 1.56it/s]
86%|████████▌ | 8366/9770 [1:35:54<15:01, 1.56it/s]
86%|████████▌ | 8367/9770 [1:35:55<14:58, 1.56it/s]
86%|████████▌ | 8368/9770 [1:35:56<14:54, 1.57it/s]
86%|████████▌ | 8369/9770 [1:35:56<15:01, 1.55it/s]
86%|████████▌ | 8370/9770 [1:35:57<15:05, 1.55it/s]
86%|████████▌ | 8370/9770 [1:35:57<15:05, 1.55it/s]
86%|████████▌ | 8371/9770 [1:35:58<15:10, 1.54it/s]
86%|████████▌ | 8372/9770 [1:35:58<15:05, 1.54it/s]
86%|████████▌ | 8373/9770 [1:35:59<14:55, 1.56it/s]
86%|████████▌ | 8374/9770 [1:35:59<14:56, 1.56it/s]
86%|████████▌ | 8375/9770 [1:36:00<15:00, 1.55it/s]
86%|████████▌ | 8376/9770 [1:36:01<15:04, 1.54it/s]
86%|████████
+0: {'loss': 0.6329, 'grad_norm': 0.5801911628826856, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.86}
+0: ▌ | 8377/9770 [1:36:01<15:02, 1.54it/s]
86%|████████▌ | 8378/9770 [1:36:02<14:54, 1.56it/s]
86%|████████▌ | 8379/9770 [1:36:03<14:54, 1.56it/s]
86%|████████▌ | 8380/9770 [1:36:03<15:08, 1.53it/s]
86%|████████▌ | 8380/9770 [1:36:03<15:08, 1.53it/s]
86%|████████▌ | 8381/9770 [1:36:04<15:06, 1.53it/s]
86%|████████▌ | 8382/9770 [1:36:05<15:11, 1.52it/s]
86%|████████▌ | 8383/9770 [1:36:05<15:15, 1.51it/s]
86%|████████▌ | 8384/9770 [1:36:06<15:28, 1.49it/s]
86%|████████▌ | 8385/9770 [1:36:07<15:19, 1.51it/s]
86%|████████▌ | 8386/9770 [1:36:07<15:13, 1.52it/s]
86%|████████▌ | 8387/9770 [1:36:08<15:29, 1.49it/s]
86%|████████▌ | 8388/9770 [1:36:09<15:22, 1.50it/s]
86%|████████▌ | 8389/9770 [1:36:09<15:07,
+0: {'loss': 0.6596, 'grad_norm': 0.6350260297397763, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.86}
+0: {'loss': 0.6508, 'grad_norm': 0.6124821373067496, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.86}
+0: 1.52it/s]
86%|████████▌ | 8390/9770 [1:36:10<15:05, 1.52it/s]
86%|████████▌ | 8390/9770 [1:36:10<15:05, 1.52it/s]
86%|████████▌ | 8391/9770 [1:36:11<15:05, 1.52it/s]
86%|████████▌ | 8392/9770 [1:36:11<15:01, 1.53it/s]
86%|████████▌ | 8393/9770 [1:36:12<14:56, 1.54it/s]
86%|████████▌ | 8394/9770 [1:36:13<15:02, 1.52it/s]
86%|████████▌ | 8395/9770 [1:36:13<14:59, 1.53it/s]
86%|████████▌ | 8396/9770 [1:36:14<14:54, 1.54it/s]
86%|████████▌ | 8397/9770 [1:36:15<14:50, 1.54it/s]
86%|████████▌ | 8398/9770 [1:36:15<14:48, 1.54it/s]
86%|████████▌ | 8399/9770 [1:36:16<14:57, 1.53it/s]
86%|████████▌ | 8400/9770 [1:36:16<14:50, 1.54it/s]
86%|████████▌ | 8400/
+0: {'loss': 0.663, 'grad_norm': 0.5928170836107521, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.86}
+0: 9770 [1:36:17<14:50, 1.54it/s]
86%|████████▌ | 8401/9770 [1:36:17<14:50, 1.54it/s]
86%|████████▌ | 8402/9770 [1:36:18<14:43, 1.55it/s]
86%|████████▌ | 8403/9770 [1:36:18<14:45, 1.54it/s]
86%|████████▌ | 8404/9770 [1:36:19<14:51, 1.53it/s]
86%|████████▌ | 8405/9770 [1:36:20<15:07, 1.50it/s]
86%|████████▌ | 8406/9770 [1:36:20<15:02, 1.51it/s]
86%|████████▌ | 8407/9770 [1:36:21<14:59, 1.52it/s]
86%|████████▌ | 8408/9770 [1:36:22<14:49, 1.53it/s]
86%|████████▌ | 8409/9770 [1:36:22<14:44, 1.54it/s]
86%|████████▌ | 8410/9770 [1:36:23<14:50, 1.53it/s]
86%|████████▌ | 8410/9770 [1:36:23<14:50, 1.53it/s]
86%|████████▌ | 8411/9770 [1:36:24<15:00, 1.51it/s]
86%|████████▌ | 8412/9770 [1:36:24<15:01, 1.51it/s]
+0: {'loss': 0.641, 'grad_norm': 0.5619216197416638, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.86}
+0: 86%|████████▌ | 8413/9770 [1:36:25<14:53, 1.52it/s]
86%|████████▌ | 8414/9770 [1:36:26<15:04, 1.50it/s]
86%|████████▌ | 8415/9770 [1:36:26<14:50, 1.52it/s]
86%|████████▌ | 8416/9770 [1:36:27<15:14, 1.48it/s]
86%|████████▌ | 8417/9770 [1:36:28<15:00, 1.50it/s]
86%|████████▌ | 8418/9770 [1:36:28<14:49, 1.52it/s]
86%|████████▌ | 8419/9770 [1:36:29<14:40, 1.53it/s]
86%|████████▌ | 8420/9770 [1:36:30<14:50, 1.52it/s]
86%|████████▌ | 8420/9770 [1:36:30<14:50, 1.52it/s]
86%|████████▌ | 8421/9770 [1:36:30<14:47, 1.52it/s]
86%|████████▌ | 8422/9770 [1:36:31<14:46, 1.52it/s]
86%|████████▌ | 8423/9770 [1:36:32<14:45, 1.52it/s]
86%|████████▌ | 8424/9770 [1:36:32<14:40, 1.53it/s]
86%|████████▌ |
+0: {'loss': 0.6358, 'grad_norm': 0.6026142219192688, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.86}
+0: 8425/9770 [1:36:33<14:28, 1.55it/s]
86%|████████▌ | 8426/9770 [1:36:34<14:23, 1.56it/s]
86%|████████▋ | 8427/9770 [1:36:34<14:10, 1.58it/s]
86%|████████▋ | 8428/9770 [1:36:35<14:08, 1.58it/s]
86%|████████▋ | 8429/9770 [1:36:35<14:21, 1.56it/s]
86%|████████▋ | 8430/9770 [1:36:36<14:24, 1.55it/s]
86%|████████▋ | 8430/9770 [1:36:36<14:24, 1.55it/s]
86%|████████▋ | 8431/9770 [1:36:37<14:26, 1.55it/s]
86%|████████▋ | 8432/9770 [1:36:37<14:17, 1.56it/s]
86%|████████▋ | 8433/9770 [1:36:38<14:17, 1.56it/s]
86%|████████▋ | 8434/9770 [1:36:39<14:24, 1.55it/s]
86%|████████▋ | 8435/9770 [1:36:39<14:21, 1.55it/s]
86%|████████▋ | 8436/9770 [1:36:40<14:14, 1.56it/s]
86%|████████▋ | 8437/9770 [1:36:41<14:13, 1.56i
+0: {'loss': 0.6555, 'grad_norm': 0.6065234321260474, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.86}
+0: t/s]
86%|████████▋ | 8438/9770 [1:36:41<14:14, 1.56it/s]
86%|████████▋ | 8439/9770 [1:36:42<14:12, 1.56it/s]
86%|████████▋ | 8440/9770 [1:36:43<14:15, 1.55it/s]
86%|████████▋ | 8440/9770 [1:36:43<14:15, 1.55it/s]
86%|████████▋ | 8441/9770 [1:36:43<14:08, 1.57it/s]
86%|████████▋ | 8442/9770 [1:36:44<14:01, 1.58it/s]
86%|████████▋ | 8443/9770 [1:36:44<14:09, 1.56it/s]
86%|████████▋ | 8444/9770 [1:36:45<14:36, 1.51it/s]
86%|████████▋ | 8445/9770 [1:36:46<14:39, 1.51it/s]
86%|████████▋ | 8446/9770 [1:36:46<14:42, 1.50it/s]
86%|████████▋ | 8447/9770 [1:36:47<14:33, 1.51it/s]
86%|████████▋ | 8448/9770 [1:36:48<14:25, 1.53it/s]
86%|████████▋ | 8449/9770 [1:36:48<14:23, 1.53it/s]
86%|███████�
+0: {'loss': 0.6429, 'grad_norm': 0.635609117972797, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.86}
+0: {'loss': 0.6471, 'grad_norm': 0.6496045878158897, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.87}
+0: �▋ | 8450/9770 [1:36:49<14:16, 1.54it/s]
86%|████████▋ | 8450/9770 [1:36:49<14:16, 1.54it/s]
86%|████████▋ | 8451/9770 [1:36:50<14:16, 1.54it/s]
87%|████████▋ | 8452/9770 [1:36:50<14:06, 1.56it/s]
87%|████████▋ | 8453/9770 [1:36:51<14:12, 1.54it/s]
87%|████████▋ | 8454/9770 [1:36:52<14:08, 1.55it/s]
87%|████████▋ | 8455/9770 [1:36:52<14:07, 1.55it/s]
87%|████████▋ | 8456/9770 [1:36:53<14:12, 1.54it/s]
87%|████████▋ | 8457/9770 [1:36:54<14:16, 1.53it/s]
87%|████████▋ | 8458/9770 [1:36:54<14:17, 1.53it/s]
87%|████████▋ | 8459/9770 [1:36:55<14:10, 1.54it/s]
87%|████████▋ | 8460/9770 [1:36:56<14:06, 1.55it/s]
87%|████████▋ | 8460/9770 [1:36:56<14:06, 1.55it/s]
87%|�
+0: {'loss': 0.6682, 'grad_norm': 0.618038144296049, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.87}
+0: ��███████▋ | 8461/9770 [1:36:56<14:05, 1.55it/s]
87%|████████▋ | 8462/9770 [1:36:57<14:21, 1.52it/s]
87%|████████▋ | 8463/9770 [1:36:58<14:20, 1.52it/s]
87%|████████▋ | 8464/9770 [1:36:58<14:17, 1.52it/s]
87%|████████▋ | 8465/9770 [1:36:59<14:08, 1.54it/s]
87%|████████▋ | 8466/9770 [1:37:00<14:28, 1.50it/s]
87%|████████▋ | 8467/9770 [1:37:00<14:21, 1.51it/s]
87%|████████▋ | 8468/9770 [1:37:01<14:11, 1.53it/s]
87%|████████▋ | 8469/9770 [1:37:01<14:09, 1.53it/s]
87%|████████▋ | 8470/9770 [1:37:02<14:12, 1.52it/s]
87%|████████▋ | 8470/9770 [1:37:02<14:12, 1.52it/s]
87%|████████▋ | 8471/9770 [1:37:03<14:13, 1.52it/s]
87%|████████▋ | 8472/9770 [1:37:03<14:11, 1.52it/s]
87%|████████▋ | 8473
+0: {'loss': 0.6373, 'grad_norm': 0.5795847678857926, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.87}
+0: /9770 [1:37:04<14:34, 1.48it/s]
87%|████████▋ | 8474/9770 [1:37:05<14:24, 1.50it/s]
87%|████████▋ | 8475/9770 [1:37:05<14:15, 1.51it/s]
87%|████████▋ | 8476/9770 [1:37:06<14:15, 1.51it/s]
87%|████████▋ | 8477/9770 [1:37:07<14:20, 1.50it/s]
87%|████████▋ | 8478/9770 [1:37:07<14:13, 1.51it/s]
87%|████████▋ | 8479/9770 [1:37:08<14:05, 1.53it/s]
87%|████████▋ | 8480/9770 [1:37:09<13:59, 1.54it/s]
87%|████████▋ | 8480/9770 [1:37:09<13:59, 1.54it/s]
87%|████████▋ | 8481/9770 [1:37:09<13:58, 1.54it/s]
87%|████████▋ | 8482/9770 [1:37:10<13:55, 1.54it/s]
87%|████████▋ | 8483/9770 [1:37:11<13:46, 1.56it/s]
87%|████████▋ | 8484/9770 [1:37:11<13:44, 1.56it/s]
87%|████████▋ | 8485/9770 [1:37:12<13:41, 1.56it/s]
+0: {'loss': 0.6583, 'grad_norm': 0.5873279654115177, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.87}
+0: 87%|████████▋ | 8486/9770 [1:37:13<13:35, 1.57it/s]
87%|████████▋ | 8487/9770 [1:37:13<13:47, 1.55it/s]
87%|████████▋ | 8488/9770 [1:37:14<13:44, 1.55it/s]
87%|████████▋ | 8489/9770 [1:37:15<13:51, 1.54it/s]
87%|████████▋ | 8490/9770 [1:37:15<13:50, 1.54it/s]
87%|████████▋ | 8490/9770 [1:37:15<13:50, 1.54it/s]
87%|████████▋ | 8491/9770 [1:37:16<13:55, 1.53it/s]
87%|████████▋ | 8492/9770 [1:37:16<13:59, 1.52it/s]
87%|████████▋ | 8493/9770 [1:37:17<14:20, 1.48it/s]
87%|████████▋ | 8494/9770 [1:37:18<14:16, 1.49it/s]
87%|████████▋ | 8495/9770 [1:37:19<14:17, 1.49it/s]
87%|████████▋ | 8496/9770 [1:37:19<14:00, 1.52it/s]
87%|████████▋ | 8497/9770 [1:37:20<13:55, 1.52it/s]
87%|████████▋
+0: {'loss': 0.6404, 'grad_norm': 0.6160615730378544, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.87}
+0: | 8498/9770 [1:37:20<13:51, 1.53it/s]
87%|████████▋ | 8499/9770 [1:37:21<13:40, 1.55it/s]
87%|████████▋ | 8500/9770 [1:37:22<13:45, 1.54it/s]
87%|████████▋ | 8500/9770 [1:37:22<13:45, 1.54it/s]
87%|████████▋ | 8501/9770 [1:37:22<14:02, 1.51it/s]
87%|████████▋ | 8502/9770 [1:37:23<14:11, 1.49it/s]
87%|████████▋ | 8503/9770 [1:37:24<14:02, 1.50it/s]
87%|████████▋ | 8504/9770 [1:37:25<14:16, 1.48it/s]
87%|████████▋ | 8505/9770 [1:37:25<14:04, 1.50it/s]
87%|████████▋ | 8506/9770 [1:37:26<13:50, 1.52it/s]
87%|████████▋ | 8507/9770 [1:37:26<14:08, 1.49it/s]
87%|████████▋ | 8508/9770 [1:37:27<14:06, 1.49it/s]
87%|████████▋ | 8509/9770 [1:37:28<13:54, 1.51it/s]
87%|████████▋ | 8510/9770 [1:37:28<13:42, 1.53
+0: {'loss': 0.6409, 'grad_norm': 0.6189802003648246, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.87}
+0: {'loss': 0.6745, 'grad_norm': 0.6240342347917001, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.87}
+0: it/s]
87%|████████▋ | 8510/9770 [1:37:28<13:42, 1.53it/s]
87%|████████▋ | 8511/9770 [1:37:29<13:47, 1.52it/s]
87%|████████▋ | 8512/9770 [1:37:30<13:45, 1.52it/s]
87%|████████▋ | 8513/9770 [1:37:30<13:30, 1.55it/s]
87%|████████▋ | 8514/9770 [1:37:31<13:43, 1.53it/s]
87%|████████▋ | 8515/9770 [1:37:32<13:42, 1.53it/s]
87%|████████▋ | 8516/9770 [1:37:32<13:29, 1.55it/s]
87%|████████▋ | 8517/9770 [1:37:33<13:40, 1.53it/s]
87%|████████▋ | 8518/9770 [1:37:34<13:40, 1.53it/s]
87%|████████▋ | 8519/9770 [1:37:34<13:44, 1.52it/s]
87%|████████▋ | 8520/9770 [1:37:35<13:39, 1.53it/s]
87%|████████▋ | 8520/9770 [1:37:35<13:39, 1.53it/s]
87%|████████▋ | 8521/9770
+0: {'loss': 0.6431, 'grad_norm': 0.6112937163509875, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.87}
+0: [1:37:36<13:46, 1.51it/s]
87%|████████▋ | 8522/9770 [1:37:36<13:59, 1.49it/s]
87%|████████▋ | 8523/9770 [1:37:37<13:53, 1.50it/s]
87%|████████▋ | 8524/9770 [1:37:38<13:49, 1.50it/s]
87%|████████▋ | 8525/9770 [1:37:38<13:42, 1.51it/s]
87%|████████▋ | 8526/9770 [1:37:39<13:35, 1.53it/s]
87%|████████▋ | 8527/9770 [1:37:40<13:31, 1.53it/s]
87%|████████▋ | 8528/9770 [1:37:40<13:35, 1.52it/s]
87%|████████▋ | 8529/9770 [1:37:41<13:30, 1.53it/s]
87%|████████▋ | 8530/9770 [1:37:42<13:26, 1.54it/s]
87%|████████▋ | 8530/9770 [1:37:42<13:26, 1.54it/s]
87%|████████▋ | 8531/9770 [1:37:42<13:21, 1.55it/s]
87%|████████▋ | 8532/9770 [1:37:43<13:15, 1.56it/s]
87%|████████▋ | 8533/9770 [1:37:43<13:19, 1.55it/s]
87%|
+0: {'loss': 0.637, 'grad_norm': 0.6454752552430932, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.87}
+0: ████████▋ | 8534/9770 [1:37:44<13:18, 1.55it/s]
87%|████████▋ | 8535/9770 [1:37:45<13:12, 1.56it/s]
87%|████████▋ | 8536/9770 [1:37:45<13:10, 1.56it/s]
87%|████████▋ | 8537/9770 [1:37:46<13:19, 1.54it/s]
87%|████████▋ | 8538/9770 [1:37:47<13:17, 1.54it/s]
87%|████████▋ | 8539/9770 [1:37:47<13:36, 1.51it/s]
87%|████████▋ | 8540/9770 [1:37:48<13:34, 1.51it/s]
87%|████████▋ | 8540/9770 [1:37:48<13:34, 1.51it/s]
87%|████████▋ | 8541/9770 [1:37:49<13:35, 1.51it/s]
87%|████████▋ | 8542/9770 [1:37:49<13:26, 1.52it/s]
87%|████████▋ | 8543/9770 [1:37:50<13:45, 1.49it/s]
87%|████████▋ | 8544/9770 [1:37:51<13:34, 1.50it/s]
87%|████████▋ | 8545/9770 [1:37:51<13:41, 1.49it/s]
87%|████████▋ | 854
+0: {'loss': 0.6773, 'grad_norm': 0.5885302883964239, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.87}
+0: 6/9770 [1:37:52<13:36, 1.50it/s]
87%|████████▋ | 8547/9770 [1:37:53<13:37, 1.50it/s]
87%|████████▋ | 8548/9770 [1:37:53<13:31, 1.51it/s]
88%|████████▊ | 8549/9770 [1:37:54<13:20, 1.52it/s]
88%|████████▊ | 8550/9770 [1:37:55<13:17, 1.53it/s]
88%|████████▊ | 8550/9770 [1:37:55<13:17, 1.53it/s]
88%|████████▊ | 8551/9770 [1:37:55<13:33, 1.50it/s]
88%|████████▊ | 8552/9770 [1:37:56<13:26, 1.51it/s]
88%|████████▊ | 8553/9770 [1:37:57<13:16, 1.53it/s]
88%|████████▊ | 8554/9770 [1:37:57<13:09, 1.54it/s]
88%|████████▊ | 8555/9770 [1:37:58<13:28, 1.50it/s]
88%|████████▊ | 8556/9770 [1:37:59<13:25, 1.51it/s]
88%|████████▊ | 8557/9770 [1:37:59<13:12, 1.53it/s]
88%|████████▊ | 8558/9770 [1:38:00<13:04, 1.54it/s]
+0: {'loss': 0.6639, 'grad_norm': 0.6060617851454885, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.88}
+0:
88%|████████▊ | 8559/9770 [1:38:01<13:12, 1.53it/s]
88%|████████▊ | 8560/9770 [1:38:01<13:21, 1.51it/s]
88%|████████▊ | 8560/9770 [1:38:01<13:21, 1.51it/s]
88%|████████▊ | 8561/9770 [1:38:02<13:20, 1.51it/s]
88%|████████▊ | 8562/9770 [1:38:03<13:10, 1.53it/s]
88%|████████▊ | 8563/9770 [1:38:03<13:21, 1.51it/s]
88%|████████▊ | 8564/9770 [1:38:04<13:15, 1.52it/s]
88%|████████▊ | 8565/9770 [1:38:05<13:22, 1.50it/s]
88%|████████▊ | 8566/9770 [1:38:05<13:13, 1.52it/s]
88%|████████▊ | 8567/9770 [1:38:06<13:05, 1.53it/s]
88%|████████▊ | 8568/9770 [1:38:07<13:06, 1.53it/s]
88%|████████▊ | 8569/9770 [1:38:07<13:03, 1.53it/s]
88%|████████▊ | 8570/9770 [1:38:08<12:54, 1.55it/s]
+0: {'loss': 0.6391, 'grad_norm': 0.5760706070176715, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.88}
+0: {'loss': 0.6508, 'grad_norm': 0.5965601065925983, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.88}
+0:
88%|████████▊ | 8570/9770 [1:38:08<12:54, 1.55it/s]
88%|████████▊ | 8571/9770 [1:38:08<12:59, 1.54it/s]
88%|████████▊ | 8572/9770 [1:38:09<12:56, 1.54it/s]
88%|████████▊ | 8573/9770 [1:38:10<12:54, 1.55it/s]
88%|████████▊ | 8574/9770 [1:38:10<12:58, 1.54it/s]
88%|████████▊ | 8575/9770 [1:38:11<13:08, 1.52it/s]
88%|████████▊ | 8576/9770 [1:38:12<13:09, 1.51it/s]
88%|████████▊ | 8577/9770 [1:38:12<13:04, 1.52it/s]
88%|████████▊ | 8578/9770 [1:38:13<13:09, 1.51it/s]
88%|████████▊ | 8579/9770 [1:38:14<12:55, 1.54it/s]
88%|████████▊ | 8580/9770 [1:38:14<12:55, 1.53it/s]
88%|████████▊ | 8580/9770 [1:38:14<12:55, 1.53it/s]
88%|████████▊ | 8581/9770 [1:38:15<13:00, 1.52it/s]
88%|█�
+0: {'loss': 0.6488, 'grad_norm': 0.6168248671049509, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.88}
+0: �██████▊ | 8582/9770 [1:38:16<12:57, 1.53it/s]
88%|████████▊ | 8583/9770 [1:38:16<12:50, 1.54it/s]
88%|████████▊ | 8584/9770 [1:38:17<13:07, 1.51it/s]
88%|████████▊ | 8585/9770 [1:38:18<13:14, 1.49it/s]
88%|████████▊ | 8586/9770 [1:38:18<13:11, 1.50it/s]
88%|████████▊ | 8587/9770 [1:38:19<13:01, 1.51it/s]
88%|████████▊ | 8588/9770 [1:38:20<12:46, 1.54it/s]
88%|████████▊ | 8589/9770 [1:38:20<12:55, 1.52it/s]
88%|████████▊ | 8590/9770 [1:38:21<12:49, 1.53it/s]
88%|████████▊ | 8590/9770 [1:38:21<12:49, 1.53it/s]
88%|████████▊ | 8591/9770 [1:38:22<13:08, 1.50it/s]
88%|████████▊ | 8592/9770 [1:38:22<13:03, 1.50it/s]
88%|████████▊ | 8593/9770 [1:38:23<12:53, 1.52it/s]
88%|████████▊ | 8594/977
+0: {'loss': 0.6366, 'grad_norm': 0.6032865679992432, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.88}
+0: 0 [1:38:24<12:48, 1.53it/s]
88%|████████▊ | 8595/9770 [1:38:24<12:53, 1.52it/s]
88%|████████▊ | 8596/9770 [1:38:25<12:48, 1.53it/s]
88%|████████▊ | 8597/9770 [1:38:26<12:41, 1.54it/s]
88%|████████▊ | 8598/9770 [1:38:26<12:44, 1.53it/s]
88%|████████▊ | 8599/9770 [1:38:27<12:34, 1.55it/s]
88%|████████▊ | 8600/9770 [1:38:27<12:31, 1.56it/s]
88%|████████▊ | 8600/9770 [1:38:27<12:31, 1.56it/s]
88%|████████▊ | 8601/9770 [1:38:28<12:33, 1.55it/s]
88%|████████▊ | 8602/9770 [1:38:29<12:34, 1.55it/s]
88%|████████▊ | 8603/9770 [1:38:29<12:34, 1.55it/s]
88%|████████▊ | 8604/9770 [1:38:30<12:22, 1.57it/s]
88%|████████▊ | 8605/9770 [1:38:31<12:45, 1.52it/s]
88%|████████▊ | 8606/9770 [1:38:31<12:34, 1.54it/s]
88%
+0: {'loss': 0.652, 'grad_norm': 0.6168589508591895, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.88}
+0: |████████▊ | 8607/9770 [1:38:32<12:38, 1.53it/s]
88%|████████▊ | 8608/9770 [1:38:33<12:35, 1.54it/s]
88%|████████▊ | 8609/9770 [1:38:33<12:38, 1.53it/s]
88%|████████▊ | 8610/9770 [1:38:34<12:37, 1.53it/s]
88%|████████▊ | 8610/9770 [1:38:34<12:37, 1.53it/s]
88%|████████▊ | 8611/9770 [1:38:35<12:45, 1.51it/s]
88%|████████▊ | 8612/9770 [1:38:35<12:40, 1.52it/s]
88%|████████▊ | 8613/9770 [1:38:36<12:33, 1.54it/s]
88%|████████▊ | 8614/9770 [1:38:37<12:34, 1.53it/s]
88%|████████▊ | 8615/9770 [1:38:37<12:39, 1.52it/s]
88%|████████▊ | 8616/9770 [1:38:38<12:34, 1.53it/s]
88%|████████▊ | 8617/9770 [1:38:39<12:24, 1.55it/s]
88%|████████▊ | 8618/9770 [1:38:39<12:25, 1.55it/s]
88%|████████▊ | 86
+0: {'loss': 0.6516, 'grad_norm': 0.595497472824902, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.88}
+0: {'loss': 0.6628, 'grad_norm': 0.6049395103594175, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.88}
+0: 19/9770 [1:38:40<12:14, 1.57it/s]
88%|████████▊ | 8620/9770 [1:38:40<12:19, 1.56it/s]
88%|████████▊ | 8620/9770 [1:38:40<12:19, 1.56it/s]
88%|████████▊ | 8621/9770 [1:38:41<12:29, 1.53it/s]
88%|████████▊ | 8622/9770 [1:38:42<12:29, 1.53it/s]
88%|████████▊ | 8623/9770 [1:38:42<12:36, 1.52it/s]
88%|████████▊ | 8624/9770 [1:38:43<12:22, 1.54it/s]
88%|████████▊ | 8625/9770 [1:38:44<12:26, 1.53it/s]
88%|████████▊ | 8626/9770 [1:38:44<12:21, 1.54it/s]
88%|████████▊ | 8627/9770 [1:38:45<12:10, 1.57it/s]
88%|████████▊ | 8628/9770 [1:38:46<12:29, 1.52it/s]
88%|████████▊ | 8629/9770 [1:38:46<12:47, 1.49it/s]
88%|████████▊ | 8630/9770 [1:38:47<12:31, 1.52it/s]
88%|███�
+0: {'loss': 0.6444, 'grad_norm': 0.6330025249925728, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.88}
+0: ��████▊ | 8630/9770 [1:38:47<12:31, 1.52it/s]
88%|████████▊ | 8631/9770 [1:38:48<12:35, 1.51it/s]
88%|████████▊ | 8632/9770 [1:38:48<12:30, 1.52it/s]
88%|████████▊ | 8633/9770 [1:38:49<12:24, 1.53it/s]
88%|████████▊ | 8634/9770 [1:38:50<12:29, 1.51it/s]
88%|████████▊ | 8635/9770 [1:38:50<12:28, 1.52it/s]
88%|████████▊ | 8636/9770 [1:38:51<12:24, 1.52it/s]
88%|████████▊ | 8637/9770 [1:38:52<12:19, 1.53it/s]
88%|████████▊ | 8638/9770 [1:38:52<12:34, 1.50it/s]
88%|████████▊ | 8639/9770 [1:38:53<12:31, 1.50it/s]
88%|████████▊ | 8640/9770 [1:38:54<12:27, 1.51it/s]
88%|████████▊ | 8640/9770 [1:38:54<12:27, 1.51it/s]
88%|████████▊ | 8641/9770 [1:38:54<12:17, 1.53it/s]
88%|████████▊ | 8642/9770 [1:
+0: {'loss': 0.646, 'grad_norm': 0.6801030407353096, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.88}
+0: 38:55<12:22, 1.52it/s]
88%|████████▊ | 8643/9770 [1:38:56<12:24, 1.51it/s]
88%|████████▊ | 8644/9770 [1:38:56<12:21, 1.52it/s]
88%|████████▊ | 8645/9770 [1:38:57<12:19, 1.52it/s]
88%|████████▊ | 8646/9770 [1:38:58<12:21, 1.52it/s]
89%|████████▊ | 8647/9770 [1:38:58<12:13, 1.53it/s]
89%|████████▊ | 8648/9770 [1:38:59<12:01, 1.55it/s]
89%|████████▊ | 8649/9770 [1:38:59<11:59, 1.56it/s]
89%|████████▊ | 8650/9770 [1:39:00<11:58, 1.56it/s]
89%|████████▊ | 8650/9770 [1:39:00<11:58, 1.56it/s]
89%|████████▊ | 8651/9770 [1:39:01<11:53, 1.57it/s]
89%|████████▊ | 8652/9770 [1:39:01<12:02, 1.55it/s]
89%|████████▊ | 8653/9770 [1:39:02<12:14, 1.52it/s]
89%|████████▊ | 8654/9770 [1:39:03<12:16, 1.51it/s]
89%|█�
+0: {'loss': 0.6376, 'grad_norm': 0.6148884516601141, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.89}
+0: ��██████▊ | 8655/9770 [1:39:03<12:17, 1.51it/s]
89%|████████▊ | 8656/9770 [1:39:04<12:18, 1.51it/s]
89%|████████▊ | 8657/9770 [1:39:05<12:20, 1.50it/s]
89%|████████▊ | 8658/9770 [1:39:05<12:16, 1.51it/s]
89%|████████▊ | 8659/9770 [1:39:06<12:13, 1.51it/s]
89%|████████▊ | 8660/9770 [1:39:07<12:05, 1.53it/s]
89%|████████▊ | 8660/9770 [1:39:07<12:05, 1.53it/s]
89%|████████▊ | 8661/9770 [1:39:07<12:07, 1.52it/s]
89%|████████▊ | 8662/9770 [1:39:08<12:10, 1.52it/s]
89%|████████▊ | 8663/9770 [1:39:09<12:08, 1.52it/s]
89%|████████▊ | 8664/9770 [1:39:09<11:56, 1.54it/s]
89%|████████▊ | 8665/9770 [1:39:10<11:54, 1.55it/s]
89%|████████▊ | 8666/9770 [1:39:11<11:49, 1.56it/s]
89%|████████▊ | 8667/97
+0: {'loss': 0.6429, 'grad_norm': 0.6115386321301234, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.89}
+0: 70 [1:39:11<11:48, 1.56it/s]
89%|████████▊ | 8668/9770 [1:39:12<11:38, 1.58it/s]
89%|████████▊ | 8669/9770 [1:39:13<11:40, 1.57it/s]
89%|████████▊ | 8670/9770 [1:39:13<11:45, 1.56it/s]
89%|████████▊ | 8670/9770 [1:39:13<11:45, 1.56it/s]
89%|████████▉ | 8671/9770 [1:39:14<11:50, 1.55it/s]
89%|████████▉ | 8672/9770 [1:39:14<11:52, 1.54it/s]
89%|████████▉ | 8673/9770 [1:39:15<11:59, 1.53it/s]
89%|████████▉ | 8674/9770 [1:39:16<11:59, 1.52it/s]
89%|████████▉ | 8675/9770 [1:39:16<11:55, 1.53it/s]
89%|████████▉ | 8676/9770 [1:39:17<11:47, 1.55it/s]
89%|████████▉ | 8677/9770 [1:39:18<11:53, 1.53it/s]
89%|████████▉ | 8678/9770 [1:39:18<11:52, 1.53it/s]
89%|████████▉ | 8679/9770 [1:39:19<11:51, 1.53it/s]
89
+0: {'loss': 0.6483, 'grad_norm': 0.586653385539892, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.89}
+0: {'loss': 0.6592, 'grad_norm': 0.582585014988046, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.89}
+0: %|████████▉ | 8680/9770 [1:39:20<11:45, 1.55it/s]
89%|████████▉ | 8680/9770 [1:39:20<11:45, 1.55it/s]
89%|████████▉ | 8681/9770 [1:39:20<11:51, 1.53it/s]
89%|████████▉ | 8682/9770 [1:39:21<11:50, 1.53it/s]
89%|████████▉ | 8683/9770 [1:39:22<11:47, 1.54it/s]
89%|████████▉ | 8684/9770 [1:39:22<11:59, 1.51it/s]
89%|████████▉ | 8685/9770 [1:39:23<11:51, 1.53it/s]
89%|████████▉ | 8686/9770 [1:39:24<11:43, 1.54it/s]
89%|████████▉ | 8687/9770 [1:39:24<11:38, 1.55it/s]
89%|████████▉ | 8688/9770 [1:39:25<11:44, 1.53it/s]
89%|████████▉ | 8689/9770 [1:39:26<12:10, 1.48it/s]
89%|████████▉ | 8690/9770 [1:39:26<11:55, 1.51it/s]
89%|████████▉ | 8690/9770 [1:39:26
+0: {'loss': 0.6196, 'grad_norm': 0.5853838127434589, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.89}
+0: <11:55, 1.51it/s]
89%|████████▉ | 8691/9770 [1:39:27<11:50, 1.52it/s]
89%|████████▉ | 8692/9770 [1:39:28<11:43, 1.53it/s]
89%|████████▉ | 8693/9770 [1:39:28<11:45, 1.53it/s]
89%|████████▉ | 8694/9770 [1:39:29<11:36, 1.54it/s]
89%|████████▉ | 8695/9770 [1:39:30<11:36, 1.54it/s]
89%|████████▉ | 8696/9770 [1:39:30<11:36, 1.54it/s]
89%|████████▉ | 8697/9770 [1:39:31<11:33, 1.55it/s]
89%|████████▉ | 8698/9770 [1:39:31<11:36, 1.54it/s]
89%|████████▉ | 8699/9770 [1:39:32<11:48, 1.51it/s]
89%|████████▉ | 8700/9770 [1:39:33<11:36, 1.54it/s]
89%|████████▉ | 8700/9770 [1:39:33<11:36, 1.54it/s]
89%|████████▉ | 8701/9770 [1:39:33<11:34, 1.54it/s]
89%|████████▉ | 8702/9770 [1:39:34<11:48, 1.51it/s]
89%|███
+0: {'loss': 0.6464, 'grad_norm': 0.6132056282981102, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.89}
+0: █████▉ | 8703/9770 [1:39:35<11:51, 1.50it/s]
89%|████████▉ | 8704/9770 [1:39:35<11:44, 1.51it/s]
89%|████████▉ | 8705/9770 [1:39:36<11:40, 1.52it/s]
89%|████████▉ | 8706/9770 [1:39:37<11:39, 1.52it/s]
89%|████████▉ | 8707/9770 [1:39:37<11:41, 1.52it/s]
89%|████████▉ | 8708/9770 [1:39:38<11:38, 1.52it/s]
89%|████████▉ | 8709/9770 [1:39:39<11:34, 1.53it/s]
89%|████████▉ | 8710/9770 [1:39:39<11:38, 1.52it/s]
89%|████████▉ | 8710/9770 [1:39:39<11:38, 1.52it/s]
89%|████████▉ | 8711/9770 [1:39:40<11:37, 1.52it/s]
89%|████████▉ | 8712/9770 [1:39:41<11:37, 1.52it/s]
89%|████████▉ | 8713/9770 [1:39:41<11:35, 1.52it/s]
89%|████████▉ | 8714/9770 [1:39:42<11:34, 1.52it/s]
89%|████████▉ | 8715/9770 [1
+0: {'loss': 0.6724, 'grad_norm': 0.5833862736005774, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.89}
+0: :39:43<11:29, 1.53it/s]
89%|████████▉ | 8716/9770 [1:39:43<11:31, 1.53it/s]
89%|████████▉ | 8717/9770 [1:39:44<11:33, 1.52it/s]
89%|████████▉ | 8718/9770 [1:39:45<11:28, 1.53it/s]
89%|████████▉ | 8719/9770 [1:39:45<11:20, 1.55it/s]
89%|████████▉ | 8720/9770 [1:39:46<11:14, 1.56it/s]
89%|████████▉ | 8720/9770 [1:39:46<11:14, 1.56it/s]
89%|████████▉ | 8721/9770 [1:39:47<11:12, 1.56it/s]
89%|████████▉ | 8722/9770 [1:39:47<11:35, 1.51it/s]
89%|████████▉ | 8723/9770 [1:39:48<11:28, 1.52it/s]
89%|████████▉ | 8724/9770 [1:39:49<11:22, 1.53it/s]
89%|████████▉ | 8725/9770 [1:39:49<11:20, 1.53it/s]
89%|████████▉ | 8726/9770 [1:39:50<11:23, 1.53it/s]
89%|████████▉ | 8727/9770 [1:39:50<11:22, 1.53it/s]
89%|█
+0: {'loss': 0.6382, 'grad_norm': 0.5985510225455318, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.89}
+0: ███████▉ | 8728/9770 [1:39:51<11:21, 1.53it/s]
89%|████████▉ | 8729/9770 [1:39:52<11:22, 1.52it/s]
89%|████████▉ | 8730/9770 [1:39:52<11:21, 1.53it/s]
89%|████████▉ | 8730/9770 [1:39:52<11:21, 1.53it/s]
89%|████████▉ | 8731/9770 [1:39:53<11:12, 1.54it/s]
89%|████████▉ | 8732/9770 [1:39:54<11:13, 1.54it/s]
89%|████████▉ | 8733/9770 [1:39:54<11:10, 1.55it/s]
89%|████████▉ | 8734/9770 [1:39:55<11:06, 1.56it/s]
89%|████████▉ | 8735/9770 [1:39:56<11:10, 1.54it/s]
89%|████████▉ | 8736/9770 [1:39:56<11:00, 1.57it/s]
89%|████████▉ | 8737/9770 [1:39:57<11:05, 1.55it/s]
89%|████████▉ | 8738/9770 [1:39:58<11:01, 1.56it/s]
89%|████████▉ | 8739/9770 [1:39:58<11:16, 1.52it/s]
89%|████████▉ | 8740/9
+0: {'loss': 0.6236, 'grad_norm': 0.6356619541275096, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.89}
+0: {'loss': 0.6373, 'grad_norm': 0.5849162574581885, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.9}
+0: 770 [1:39:59<11:10, 1.54it/s]
89%|████████▉ | 8740/9770 [1:39:59<11:10, 1.54it/s]
89%|████████▉ | 8741/9770 [1:40:00<11:09, 1.54it/s]
89%|████████▉ | 8742/9770 [1:40:00<11:01, 1.55it/s]
89%|████████▉ | 8743/9770 [1:40:01<10:53, 1.57it/s]
89%|████████▉ | 8744/9770 [1:40:01<10:53, 1.57it/s]
90%|████████▉ | 8745/9770 [1:40:02<10:54, 1.57it/s]
90%|████████▉ | 8746/9770 [1:40:03<10:53, 1.57it/s]
90%|████████▉ | 8747/9770 [1:40:03<10:53, 1.57it/s]
90%|████████▉ | 8748/9770 [1:40:04<10:55, 1.56it/s]
90%|████████▉ | 8749/9770 [1:40:05<11:00, 1.55it/s]
90%|████████▉ | 8750/9770 [1:40:05<10:56, 1.55it/s]
90%|████████▉ | 8750/9770 [1:40:05<10:56, 1.55it/s]
90%|████�
+0: {'loss': 0.6417, 'grad_norm': 0.5725301793115135, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.9}
+0: �███▉ | 8751/9770 [1:40:06<11:06, 1.53it/s]
90%|████████▉ | 8752/9770 [1:40:07<10:58, 1.55it/s]
90%|████████▉ | 8753/9770 [1:40:07<11:10, 1.52it/s]
90%|████████▉ | 8754/9770 [1:40:08<11:12, 1.51it/s]
90%|████████▉ | 8755/9770 [1:40:09<11:01, 1.53it/s]
90%|████████▉ | 8756/9770 [1:40:09<10:57, 1.54it/s]
90%|████████▉ | 8757/9770 [1:40:10<10:58, 1.54it/s]
90%|████████▉ | 8758/9770 [1:40:11<10:53, 1.55it/s]
90%|████████▉ | 8759/9770 [1:40:11<11:04, 1.52it/s]
90%|████████▉ | 8760/9770 [1:40:12<11:09, 1.51it/s]
90%|████████▉ | 8760/9770 [1:40:12<11:09, 1.51it/s]
90%|████████▉ | 8761/9770 [1:40:13<11:11, 1.50it/s]
90%|████████▉ | 8762/9770 [1:40:13<10:59, 1.53it/s]
90%|████████▉ | 8763/9770 [1:40:1
+0: {'loss': 0.6515, 'grad_norm': 0.6190158774613381, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.9}
+0: 4<11:15, 1.49it/s]
90%|████████▉ | 8764/9770 [1:40:15<10:59, 1.53it/s]
90%|████████▉ | 8765/9770 [1:40:15<11:05, 1.51it/s]
90%|████████▉ | 8766/9770 [1:40:16<10:51, 1.54it/s]
90%|████████▉ | 8767/9770 [1:40:16<10:50, 1.54it/s]
90%|████████▉ | 8768/9770 [1:40:17<10:47, 1.55it/s]
90%|████████▉ | 8769/9770 [1:40:18<10:48, 1.54it/s]
90%|████████▉ | 8770/9770 [1:40:18<10:52, 1.53it/s]
90%|████████▉ | 8770/9770 [1:40:18<10:52, 1.53it/s]
90%|████████▉ | 8771/9770 [1:40:19<10:51, 1.53it/s]
90%|████████▉ | 8772/9770 [1:40:20<10:47, 1.54it/s]
90%|████████▉ | 8773/9770 [1:40:20<10:44, 1.55it/s]
90%|████████▉ | 8774/9770 [1:40:21<10:50, 1.53it/s]
90%|████████▉ | 8775/9770 [1:40:22<10:47, 1.54it/s]
90%|██�
+0: {'loss': 0.6466, 'grad_norm': 0.59364489234436, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.9}
+0: �█████▉ | 8776/9770 [1:40:22<11:01, 1.50it/s]
90%|████████▉ | 8777/9770 [1:40:23<10:58, 1.51it/s]
90%|████████▉ | 8778/9770 [1:40:24<10:59, 1.50it/s]
90%|████████▉ | 8779/9770 [1:40:24<10:49, 1.53it/s]
90%|████████▉ | 8780/9770 [1:40:25<10:41, 1.54it/s]
90%|████████▉ | 8780/9770 [1:40:25<10:41, 1.54it/s]
90%|████████▉ | 8781/9770 [1:40:26<10:57, 1.50it/s]
90%|████████▉ | 8782/9770 [1:40:26<10:55, 1.51it/s]
90%|████████▉ | 8783/9770 [1:40:27<10:53, 1.51it/s]
90%|████████▉ | 8784/9770 [1:40:28<10:48, 1.52it/s]
90%|████████▉ | 8785/9770 [1:40:28<10:45, 1.53it/s]
90%|████████▉ | 8786/9770 [1:40:29<10:36, 1.55it/s]
90%|████████▉ | 8787/9770 [1:40:30<10:34, 1.55it/s]
90%|████████▉ | 8788/9770 [
+0: {'loss': 0.6442, 'grad_norm': 0.6273796777531278, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.9}
+0: 1:40:30<10:37, 1.54it/s]
90%|████████▉ | 8789/9770 [1:40:31<10:40, 1.53it/s]
90%|████████▉ | 8790/9770 [1:40:32<10:35, 1.54it/s]
90%|████████▉ | 8790/9770 [1:40:32<10:35, 1.54it/s]
90%|████████▉ | 8791/9770 [1:40:32<10:39, 1.53it/s]
90%|████████▉ | 8792/9770 [1:40:33<10:53, 1.50it/s]
90%|█████████ | 8793/9770 [1:40:34<10:44, 1.52it/s]
90%|█████████ | 8794/9770 [1:40:34<10:54, 1.49it/s]
90%|█████████ | 8795/9770 [1:40:35<10:39, 1.52it/s]
90%|█████████ | 8796/9770 [1:40:35<10:32, 1.54it/s]
90%|█████████ | 8797/9770 [1:40:36<10:33, 1.53it/s]
90%|█████████ | 8798/9770 [1:40:37<10:32, 1.54it/s]
90%|█████████ | 8799/9770 [1:40:37<10:33, 1.53it/s]
90%|█████████ | 8800/9770 [1:40:38<10:22, 1.56it/s]
+0: {'loss': 0.6573, 'grad_norm': 0.6342980367172633, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.9}
+0: {'loss': 0.6533, 'grad_norm': 0.6165304337266457, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.9}
+0:
90%|█████████ | 8800/9770 [1:40:38<10:22, 1.56it/s]
90%|█████████ | 8801/9770 [1:40:39<10:22, 1.56it/s]
90%|█████████ | 8802/9770 [1:40:39<10:19, 1.56it/s]
90%|█████████ | 8803/9770 [1:40:40<10:23, 1.55it/s]
90%|█████████ | 8804/9770 [1:40:41<10:24, 1.55it/s]
90%|█████████ | 8805/9770 [1:40:41<10:25, 1.54it/s]
90%|█████████ | 8806/9770 [1:40:42<10:24, 1.54it/s]
90%|█████████ | 8807/9770 [1:40:43<10:24, 1.54it/s]
90%|█████████ | 8808/9770 [1:40:43<10:21, 1.55it/s]
90%|█████████ | 8809/9770 [1:40:44<10:25, 1.54it/s]
90%|█████████ | 8810/9770 [1:40:45<10:31, 1.52it/s]
90%|█████████ | 8810/9770 [1:40:45<10:31, 1.52it/s]
90%|█████████ | 8811/9770 [1:40:45<10:
+0: {'loss': 0.6583, 'grad_norm': 0.6445558806507502, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.9}
+0: 25, 1.53it/s]
90%|█████████ | 8812/9770 [1:40:46<10:23, 1.54it/s]
90%|█████████ | 8813/9770 [1:40:46<10:13, 1.56it/s]
90%|█████████ | 8814/9770 [1:40:47<10:12, 1.56it/s]
90%|█████████ | 8815/9770 [1:40:48<10:10, 1.56it/s]
90%|█████████ | 8816/9770 [1:40:48<10:08, 1.57it/s]
90%|█████████ | 8817/9770 [1:40:49<10:10, 1.56it/s]
90%|█████████ | 8818/9770 [1:40:50<10:27, 1.52it/s]
90%|█████████ | 8819/9770 [1:40:50<10:23, 1.52it/s]
90%|█████████ | 8820/9770 [1:40:51<10:29, 1.51it/s]
90%|█████████ | 8820/9770 [1:40:51<10:29, 1.51it/s]
90%|█████████ | 8821/9770 [1:40:52<10:31, 1.50it/s]
90%|█████████ | 8822/9770 [1:40:52<10:24, 1.52it/s]
90%|█████████ | 8823/9770 [1:40:53<10:22, 1.52it/s]
90%|████�
+0: {'loss': 0.6513, 'grad_norm': 0.5907378091949187, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.9}
+0: ��████ | 8824/9770 [1:40:54<10:36, 1.49it/s]
90%|█████████ | 8825/9770 [1:40:54<10:30, 1.50it/s]
90%|█████████ | 8826/9770 [1:40:55<10:22, 1.52it/s]
90%|█████████ | 8827/9770 [1:40:56<10:21, 1.52it/s]
90%|█████████ | 8828/9770 [1:40:56<10:25, 1.51it/s]
90%|█████████ | 8829/9770 [1:40:57<10:17, 1.52it/s]
90%|█████████ | 8830/9770 [1:40:58<10:10, 1.54it/s]
90%|█████████ | 8830/9770 [1:40:58<10:10, 1.54it/s]
90%|█████████ | 8831/9770 [1:40:58<10:11, 1.54it/s]
90%|█████████ | 8832/9770 [1:40:59<10:11, 1.53it/s]
90%|█████████ | 8833/9770 [1:41:00<10:04, 1.55it/s]
90%|█████████ | 8834/9770 [1:41:00<10:16, 1.52it/s]
90%|█████████ | 8835/9770 [1:41:01<10:20, 1.51it/s]
90%|█████████ | 8836/9770 [1:41:
+0: {'loss': 0.6494, 'grad_norm': 0.6164724133022096, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.9}
+0: 02<10:17, 1.51it/s]
90%|█████████ | 8837/9770 [1:41:02<10:06, 1.54it/s]
90%|█████████ | 8838/9770 [1:41:03<10:04, 1.54it/s]
90%|█████████ | 8839/9770 [1:41:03<10:00, 1.55it/s]
90%|█████████ | 8840/9770 [1:41:04<09:56, 1.56it/s]
90%|█████████ | 8840/9770 [1:41:04<09:56, 1.56it/s]
90%|█████████ | 8841/9770 [1:41:05<10:00, 1.55it/s]
91%|█████████ | 8842/9770 [1:41:05<09:56, 1.55it/s]
91%|█████████ | 8843/9770 [1:41:06<10:05, 1.53it/s]
91%|█████████ | 8844/9770 [1:41:07<10:03, 1.53it/s]
91%|█████████ | 8845/9770 [1:41:07<10:06, 1.52it/s]
91%|█████████ | 8846/9770 [1:41:08<10:04, 1.53it/s]
91%|█████████ | 8847/9770 [1:41:09<10:13, 1.50it/s]
91%|█████████ | 8848/9770 [1:41:09<10:04, 1.53it/s]
91%|██�
+0: {'loss': 0.6496, 'grad_norm': 0.6003184546725324, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.91}
+0: ��██████ | 8849/9770 [1:41:10<09:58, 1.54it/s]
91%|█████████ | 8850/9770 [1:41:11<09:56, 1.54it/s]
91%|█████████ | 8850/9770 [1:41:11<09:56, 1.54it/s]
91%|█████████ | 8851/9770 [1:41:11<10:11, 1.50it/s]
91%|█████████ | 8852/9770 [1:41:12<10:18, 1.48it/s]
91%|█████████ | 8853/9770 [1:41:13<10:18, 1.48it/s]
91%|█████████ | 8854/9770 [1:41:13<10:12, 1.50it/s]
91%|█████████ | 8855/9770 [1:41:14<10:05, 1.51it/s]
91%|█████████ | 8856/9770 [1:41:15<09:57, 1.53it/s]
91%|█████████ | 8857/9770 [1:41:15<09:59, 1.52it/s]
91%|█████████ | 8858/9770 [1:41:16<10:12, 1.49it/s]
91%|█████████ | 8859/9770 [1:41:17<10:02, 1.51it/s]
91%|█████████ | 8860/9770 [1:41:17<10:02, 1.51it/s]
+0: {'loss': 0.6548, 'grad_norm': 0.5949722249011528, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.91}
+0: {'loss': 0.6633, 'grad_norm': 0.6079243746631338, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.91}
+0:
91%|█████████ | 8860/9770 [1:41:17<10:02, 1.51it/s]
91%|█████████ | 8861/9770 [1:41:18<09:58, 1.52it/s]
91%|█████████ | 8862/9770 [1:41:19<09:48, 1.54it/s]
91%|█████████ | 8863/9770 [1:41:19<09:50, 1.54it/s]
91%|█████████ | 8864/9770 [1:41:20<09:48, 1.54it/s]
91%|█████████ | 8865/9770 [1:41:21<09:51, 1.53it/s]
91%|█████████ | 8866/9770 [1:41:21<09:50, 1.53it/s]
91%|█████████ | 8867/9770 [1:41:22<09:51, 1.53it/s]
91%|█████████ | 8868/9770 [1:41:23<10:00, 1.50it/s]
91%|█████████ | 8869/9770 [1:41:23<09:49, 1.53it/s]
91%|█████████ | 8870/9770 [1:41:24<09:45, 1.54it/s]
91%|█████████ | 8870/9770 [1:41:24<09:45, 1.54it/s]
91%|█████████ | 8871/9770 [1:41:24<09:37, 1.56it/s]
91%|██████
+0: {'loss': 0.6528, 'grad_norm': 0.6286621798326656, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.91}
+0: ███ | 8872/9770 [1:41:25<09:38, 1.55it/s]
91%|█████████ | 8873/9770 [1:41:26<09:38, 1.55it/s]
91%|█████████ | 8874/9770 [1:41:26<09:37, 1.55it/s]
91%|█████████ | 8875/9770 [1:41:27<09:31, 1.57it/s]
91%|█████████ | 8876/9770 [1:41:28<09:36, 1.55it/s]
91%|█████████ | 8877/9770 [1:41:28<09:38, 1.54it/s]
91%|█████████ | 8878/9770 [1:41:29<09:43, 1.53it/s]
91%|█████████ | 8879/9770 [1:41:30<09:39, 1.54it/s]
91%|█████████ | 8880/9770 [1:41:30<09:36, 1.55it/s]
91%|█████████ | 8880/9770 [1:41:30<09:36, 1.55it/s]
91%|█████████ | 8881/9770 [1:41:31<09:40, 1.53it/s]
91%|█████████ | 8882/9770 [1:41:32<09:39, 1.53it/s]
91%|█████████ | 8883/9770 [1:41:32<09:35, 1.54it/s]
91%|█████████ | 8884/9770 [1:41:33<09
+0: {'loss': 0.6772, 'grad_norm': 0.6469110840141102, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.91}
+0: :34, 1.54it/s]
91%|█████████ | 8885/9770 [1:41:34<09:36, 1.54it/s]
91%|█████████ | 8886/9770 [1:41:34<09:32, 1.54it/s]
91%|█████████ | 8887/9770 [1:41:35<09:36, 1.53it/s]
91%|█████████ | 8888/9770 [1:41:36<09:41, 1.52it/s]
91%|█████████ | 8889/9770 [1:41:36<09:34, 1.53it/s]
91%|█████████ | 8890/9770 [1:41:37<09:32, 1.54it/s]
91%|█████████ | 8890/9770 [1:41:37<09:32, 1.54it/s]
91%|█████████ | 8891/9770 [1:41:37<09:36, 1.53it/s]
91%|█████████ | 8892/9770 [1:41:38<09:40, 1.51it/s]
91%|█████████ | 8893/9770 [1:41:39<09:41, 1.51it/s]
91%|█████████ | 8894/9770 [1:41:39<09:39, 1.51it/s]
91%|█████████ | 8895/9770 [1:41:40<09:31, 1.53it/s]
91%|█████████ | 8896/9770 [1:41:41<09:27, 1.54it/s]
91%|████
+0: {'loss': 0.6383, 'grad_norm': 0.6046227197190288, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.91}
+0: █████ | 8897/9770 [1:41:41<09:27, 1.54it/s]
91%|█████████ | 8898/9770 [1:41:42<09:26, 1.54it/s]
91%|█████████ | 8899/9770 [1:41:43<09:24, 1.54it/s]
91%|█████████ | 8900/9770 [1:41:43<09:18, 1.56it/s]
91%|█████████ | 8900/9770 [1:41:43<09:18, 1.56it/s]
91%|█████████ | 8901/9770 [1:41:44<09:24, 1.54it/s]
91%|█████████ | 8902/9770 [1:41:45<09:23, 1.54it/s]
91%|█████████ | 8903/9770 [1:41:45<09:16, 1.56it/s]
91%|█████████ | 8904/9770 [1:41:46<09:12, 1.57it/s]
91%|█████████ | 8905/9770 [1:41:47<09:15, 1.56it/s]
91%|█████████ | 8906/9770 [1:41:47<09:16, 1.55it/s]
91%|█████████ | 8907/9770 [1:41:48<09:20, 1.54it/s]
91%|█████████ | 8908/9770 [1:41:49<09:21, 1.53it/s]
91%|█████████ | 8909/9770 [1:41
+0: {'loss': 0.6568, 'grad_norm': 0.5658794045844479, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.91}
+0: {'loss': 0.6175, 'grad_norm': 0.582273896832439, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.91}
+0: :49<09:15, 1.55it/s]
91%|█████████ | 8910/9770 [1:41:50<09:11, 1.56it/s]
91%|█████████ | 8910/9770 [1:41:50<09:11, 1.56it/s]
91%|█████████ | 8911/9770 [1:41:50<09:14, 1.55it/s]
91%|█████████ | 8912/9770 [1:41:51<09:12, 1.55it/s]
91%|█████████ | 8913/9770 [1:41:52<09:26, 1.51it/s]
91%|█████████ | 8914/9770 [1:41:52<09:20, 1.53it/s]
91%|█████████ | 8915/9770 [1:41:53<09:18, 1.53it/s]
91%|█████████▏| 8916/9770 [1:41:54<09:23, 1.52it/s]
91%|█████████▏| 8917/9770 [1:41:54<09:30, 1.49it/s]
91%|█████████▏| 8918/9770 [1:41:55<09:38, 1.47it/s]
91%|█████████▏| 8919/9770 [1:41:56<09:26, 1.50it/s]
91%|█████████▏| 8920/9770 [1:41:56<09:20, 1.52it/s]
91%|████�
+0: {'loss': 0.6511, 'grad_norm': 0.6349106806086336, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.91}
+0: ��████▏| 8920/9770 [1:41:56<09:20, 1.52it/s]
91%|█████████▏| 8921/9770 [1:41:57<09:13, 1.53it/s]
91%|█████████▏| 8922/9770 [1:41:58<09:08, 1.55it/s]
91%|█████████▏| 8923/9770 [1:41:58<09:06, 1.55it/s]
91%|█████████▏| 8924/9770 [1:41:59<09:16, 1.52it/s]
91%|█████████▏| 8925/9770 [1:42:00<09:28, 1.49it/s]
91%|█████████▏| 8926/9770 [1:42:00<09:23, 1.50it/s]
91%|█████████▏| 8927/9770 [1:42:01<09:17, 1.51it/s]
91%|█████████▏| 8928/9770 [1:42:02<09:15, 1.52it/s]
91%|█████████▏| 8929/9770 [1:42:02<09:15, 1.51it/s]
91%|█████████▏| 8930/9770 [1:42:03<09:13, 1.52it/s]
91%|█████████▏| 8930/9770 [1:42:03<09:13, 1.52it/s]
91%|█████████▏| 8931/9770 [1:42:04<09:11, 1.52it/s]
91%|██████�
+0: {'loss': 0.6556, 'grad_norm': 0.5848014910791852, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.91}
+0: �██▏| 8932/9770 [1:42:05<10:03, 1.39it/s]
91%|█████████▏| 8933/9770 [1:42:05<09:55, 1.41it/s]
91%|█████████▏| 8934/9770 [1:42:06<09:32, 1.46it/s]
91%|█████████▏| 8935/9770 [1:42:06<09:16, 1.50it/s]
91%|█████████▏| 8936/9770 [1:42:07<09:16, 1.50it/s]
91%|█████████▏| 8937/9770 [1:42:08<09:14, 1.50it/s]
91%|█████████▏| 8938/9770 [1:42:08<09:09, 1.51it/s]
91%|█████████▏| 8939/9770 [1:42:09<09:03, 1.53it/s]
92%|█████████▏| 8940/9770 [1:42:10<09:00, 1.53it/s]
92%|█████████▏| 8940/9770 [1:42:10<09:00, 1.53it/s]
92%|█████████▏| 8941/9770 [1:42:10<09:07, 1.52it/s]
92%|█████████▏| 8942/9770 [1:42:11<09:13, 1.50it/s]
92%|█████████▏| 8943/9770 [1:42:12<09:13, 1.50it/s]
92%|█████████
+0: {'loss': 0.6346, 'grad_norm': 0.6121770653829334, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.92}
+0: ▏| 8944/9770 [1:42:12<09:11, 1.50it/s]
92%|█████████▏| 8945/9770 [1:42:13<09:07, 1.51it/s]
92%|█████████▏| 8946/9770 [1:42:14<09:07, 1.51it/s]
92%|█████████▏| 8947/9770 [1:42:14<09:01, 1.52it/s]
92%|█████████▏| 8948/9770 [1:42:15<08:55, 1.53it/s]
92%|█████████▏| 8949/9770 [1:42:16<09:00, 1.52it/s]
92%|█████████▏| 8950/9770 [1:42:16<08:52, 1.54it/s]
92%|█████████▏| 8950/9770 [1:42:16<08:52, 1.54it/s]
92%|█████████▏| 8951/9770 [1:42:17<09:07, 1.50it/s]
92%|█████████▏| 8952/9770 [1:42:18<09:02, 1.51it/s]
92%|█████████▏| 8953/9770 [1:42:18<08:57, 1.52it/s]
92%|█████████▏| 8954/9770 [1:42:19<08:54, 1.53it/s]
92%|█████████▏| 8955/9770 [1:42:20<08:49, 1.54it/s]
92%|█████████▏| 89
+0: {'loss': 0.6385, 'grad_norm': 0.6648868044028359, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.92}
+0: 56/9770 [1:42:20<08:44, 1.55it/s]
92%|█████████▏| 8957/9770 [1:42:21<08:48, 1.54it/s]
92%|█████████▏| 8958/9770 [1:42:22<08:40, 1.56it/s]
92%|█████████▏| 8959/9770 [1:42:22<08:56, 1.51it/s]
92%|█████████▏| 8960/9770 [1:42:23<08:54, 1.51it/s]
92%|█████████▏| 8960/9770 [1:42:23<08:54, 1.51it/s]
92%|█████████▏| 8961/9770 [1:42:24<08:52, 1.52it/s]
92%|█████████▏| 8962/9770 [1:42:24<08:51, 1.52it/s]
92%|█████████▏| 8963/9770 [1:42:25<08:49, 1.53it/s]
92%|█████████▏| 8964/9770 [1:42:26<08:44, 1.54it/s]
92%|█████████▏| 8965/9770 [1:42:26<08:45, 1.53it/s]
92%|█████████▏| 8966/9770 [1:42:27<08:48, 1.52it/s]
92%|█████████▏| 8967/9770 [1:42:27<08:46, 1.53it/s]
92%|█████████▏| 8968/9770
+0: {'loss': 0.656, 'grad_norm': 0.6048487738666392, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.92}
+0: [1:42:28<08:41, 1.54it/s]
92%|█████████▏| 8969/9770 [1:42:29<08:41, 1.54it/s]
92%|█████████▏| 8970/9770 [1:42:29<08:43, 1.53it/s]
92%|█████████▏| 8970/9770 [1:42:29<08:43, 1.53it/s]
92%|█████████▏| 8971/9770 [1:42:30<08:43, 1.52it/s]
92%|█████████▏| 8972/9770 [1:42:31<08:32, 1.56it/s]
92%|█████████▏| 8973/9770 [1:42:31<08:32, 1.55it/s]
92%|█████████▏| 8974/9770 [1:42:32<08:28, 1.56it/s]
92%|█████████▏| 8975/9770 [1:42:33<08:30, 1.56it/s]
92%|█████████▏| 8976/9770 [1:42:33<08:29, 1.56it/s]
92%|█████████▏| 8977/9770 [1:42:34<08:25, 1.57it/s]
92%|█████████▏| 8978/9770 [1:42:35<08:35, 1.54it/s]
92%|█████████▏| 8979/9770 [1:42:35<08:35, 1.53it/s]
92%|█████████▏| 8980/9770 [1:42:
+0: {'loss': 0.6545, 'grad_norm': 0.6316234817479406, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.92}
+0: {'loss': 0.6541, 'grad_norm': 0.6356930256668423, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.92}
+0: 36<08:30, 1.55it/s]
92%|█████████▏| 8980/9770 [1:42:36<08:30, 1.55it/s]
92%|█████████▏| 8981/9770 [1:42:37<08:33, 1.54it/s]
92%|█████████▏| 8982/9770 [1:42:37<08:33, 1.53it/s]
92%|█████████▏| 8983/9770 [1:42:38<08:26, 1.55it/s]
92%|█████████▏| 8984/9770 [1:42:38<08:32, 1.53it/s]
92%|█████████▏| 8985/9770 [1:42:39<08:33, 1.53it/s]
92%|█████████▏| 8986/9770 [1:42:40<08:31, 1.53it/s]
92%|█████████▏| 8987/9770 [1:42:40<08:29, 1.54it/s]
92%|█████████▏| 8988/9770 [1:42:41<08:28, 1.54it/s]
92%|█████████▏| 8989/9770 [1:42:42<08:31, 1.53it/s]
92%|█████████▏| 8990/9770 [1:42:42<08:23, 1.55it/s]
92%|█████████▏| 8990/9770 [1:42:42<08:23, 1.55it/s]
92%|
+0: {'loss': 0.6454, 'grad_norm': 0.6145826754778619, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.92}
+0: █████████▏| 8991/9770 [1:42:43<08:26, 1.54it/s]
92%|█████████▏| 8992/9770 [1:42:44<08:23, 1.54it/s]
92%|█████████▏| 8993/9770 [1:42:44<08:24, 1.54it/s]
92%|█████████▏| 8994/9770 [1:42:45<08:23, 1.54it/s]
92%|█████████▏| 8995/9770 [1:42:46<08:21, 1.55it/s]
92%|█████████▏| 8996/9770 [1:42:46<08:18, 1.55it/s]
92%|█████████▏| 8997/9770 [1:42:47<08:19, 1.55it/s]
92%|█████████▏| 8998/9770 [1:42:48<08:25, 1.53it/s]
92%|█████████▏| 8999/9770 [1:42:48<08:22, 1.53it/s]
92%|█████████▏| 9000/9770 [1:42:49<08:14, 1.56it/s]
92%|█████████▏| 9000/9770 [1:42:49<08:14, 1.56it/s]
92%|█████████▏| 9001/9770 [1:42:50<08:17, 1.55it/s]
92%|█████████▏| 9002/9770 [1:42:50<08:12, 1.56it/s]
92%|██�
+0: {'loss': 0.6432, 'grad_norm': 0.6090897459389449, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.92}
+0: ��██████▏| 9003/9770 [1:42:51<08:13, 1.56it/s]
92%|█████████▏| 9004/9770 [1:42:51<08:13, 1.55it/s]
92%|█████████▏| 9005/9770 [1:42:52<08:27, 1.51it/s]
92%|█████████▏| 9006/9770 [1:42:53<08:22, 1.52it/s]
92%|█████████▏| 9007/9770 [1:42:53<08:16, 1.54it/s]
92%|█████████▏| 9008/9770 [1:42:54<08:18, 1.53it/s]
92%|█████████▏| 9009/9770 [1:42:55<08:19, 1.52it/s]
92%|█████████▏| 9010/9770 [1:42:55<08:19, 1.52it/s]
92%|█████████▏| 9010/9770 [1:42:55<08:19, 1.52it/s]
92%|█████████▏| 9011/9770 [1:42:56<08:21, 1.51it/s]
92%|█████████▏| 9012/9770 [1:42:57<08:19, 1.52it/s]
92%|█████████▏| 9013/9770 [1:42:57<08:14, 1.53it/s]
92%|█████████▏| 9014/9770 [1:42:58<08:19, 1.51it/s]
92%|████�
+0: {'loss': 0.6448, 'grad_norm': 0.5943273866248485, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.92}
+0: �████▏| 9015/9770 [1:42:59<08:16, 1.52it/s]
92%|█████████▏| 9016/9770 [1:42:59<08:16, 1.52it/s]
92%|█████████▏| 9017/9770 [1:43:00<08:15, 1.52it/s]
92%|█████████▏| 9018/9770 [1:43:01<08:17, 1.51it/s]
92%|█████████▏| 9019/9770 [1:43:01<08:18, 1.51it/s]
92%|█████████▏| 9020/9770 [1:43:02<08:12, 1.52it/s]
92%|█████████▏| 9020/9770 [1:43:02<08:12, 1.52it/s]
92%|█████████▏| 9021/9770 [1:43:03<08:19, 1.50it/s]
92%|█████████▏| 9022/9770 [1:43:03<08:12, 1.52it/s]
92%|█████████▏| 9023/9770 [1:43:04<08:06, 1.54it/s]
92%|█████████▏| 9024/9770 [1:43:05<08:15, 1.51it/s]
92%|█████████▏| 9025/9770 [1:43:05<08:08, 1.53it/s]
92%|█████████▏| 9026/9770 [1:43:06<08:02, 1.54it/s]
92%|███████
+0: {'loss': 0.6458, 'grad_norm': 0.588879941053277, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.92}
+0: ██▏| 9027/9770 [1:43:07<08:01, 1.54it/s]
92%|█████████▏| 9028/9770 [1:43:07<08:00, 1.54it/s]
92%|█████████▏| 9029/9770 [1:43:08<07:57, 1.55it/s]
92%|█████████▏| 9030/9770 [1:43:08<07:55, 1.56it/s]
92%|█████████▏| 9030/9770 [1:43:08<07:55, 1.56it/s]
92%|█████████▏| 9031/9770 [1:43:09<07:59, 1.54it/s]
92%|█████████▏| 9032/9770 [1:43:10<08:01, 1.53it/s]
92%|█████████▏| 9033/9770 [1:43:10<07:58, 1.54it/s]
92%|█████████▏| 9034/9770 [1:43:11<07:56, 1.55it/s]
92%|█████████▏| 9035/9770 [1:43:12<07:54, 1.55it/s]
92%|█████████▏| 9036/9770 [1:43:12<07:50, 1.56it/s]
92%|█████████▏| 9037/9770 [1:43:13<07:47, 1.57it/s]
93%|█████████▎| 9038/9770 [1:43:14<07:53, 1.55it/s]
93%|█████████�
+0: {'loss': 0.6592, 'grad_norm': 0.6252651380238585, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.92}
+0: ��| 9039/9770 [1:43:14<07:50, 1.55it/s]
93%|█████████▎| 9040/9770 [1:43:15<07:56, 1.53it/s]
93%|█████████▎| 9040/9770 [1:43:15<07:56, 1.53it/s]
93%|█████████▎| 9041/9770 [1:43:16<07:56, 1.53it/s]
93%|█████████▎| 9042/9770 [1:43:16<08:00, 1.52it/s]
93%|█████████▎| 9043/9770 [1:43:17<07:58, 1.52it/s]
93%|█████████▎| 9044/9770 [1:43:18<07:59, 1.51it/s]
93%|█████████▎| 9045/9770 [1:43:18<08:04, 1.50it/s]
93%|█████████▎| 9046/9770 [1:43:19<08:01, 1.50it/s]
93%|█████████▎| 9047/9770 [1:43:20<08:05, 1.49it/s]
93%|█████████▎| 9048/9770 [1:43:20<07:54, 1.52it/s]
93%|█████████▎| 9049/9770 [1:43:21<08:03, 1.49it/s]
93%|█████████▎| 9050/9770 [1:43:22<07:58, 1.51it/s]
+0: {'loss': 0.6788, 'grad_norm': 0.5998442737974874, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.93}
+0: {'loss': 0.6543, 'grad_norm': 0.5963629694320636, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.93}
+0:
93%|█████████▎| 9050/9770 [1:43:22<07:58, 1.51it/s]
93%|█████████▎| 9051/9770 [1:43:22<07:56, 1.51it/s]
93%|█████████▎| 9052/9770 [1:43:23<07:58, 1.50it/s]
93%|█████████▎| 9053/9770 [1:43:24<07:55, 1.51it/s]
93%|█████████▎| 9054/9770 [1:43:24<07:53, 1.51it/s]
93%|█████████▎| 9055/9770 [1:43:25<07:50, 1.52it/s]
93%|█████████▎| 9056/9770 [1:43:26<07:43, 1.54it/s]
93%|█████████▎| 9057/9770 [1:43:26<07:46, 1.53it/s]
93%|█████████▎| 9058/9770 [1:43:27<07:44, 1.53it/s]
93%|█████████▎| 9059/9770 [1:43:27<07:38, 1.55it/s]
93%|█████████▎| 9060/9770 [1:43:28<07:39, 1.55it/s]
93%|████████��▎| 9060/9770 [1:43:28<07:39, 1.55it/s]
93%|█████████▎| 9061/9770 [1:43:29<07:41, 1.
+0: {'loss': 0.6454, 'grad_norm': 0.5722944897162896, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.93}
+0: 54it/s]
93%|█████████▎| 9062/9770 [1:43:29<07:40, 1.54it/s]
93%|█████████▎| 9063/9770 [1:43:30<07:37, 1.55it/s]
93%|█████████▎| 9064/9770 [1:43:31<07:37, 1.54it/s]
93%|█████████▎| 9065/9770 [1:43:31<07:36, 1.55it/s]
93%|█████████▎| 9066/9770 [1:43:32<07:41, 1.53it/s]
93%|█████████▎| 9067/9770 [1:43:33<07:36, 1.54it/s]
93%|█████████▎| 9068/9770 [1:43:33<07:34, 1.55it/s]
93%|█████████▎| 9069/9770 [1:43:34<07:33, 1.55it/s]
93%|█████████▎| 9070/9770 [1:43:35<08:11, 1.43it/s]
93%|█████████▎| 9070/9770 [1:43:35<08:11, 1.43it/s]
93%|█████████▎| 9071/9770 [1:43:35<08:01, 1.45it/s]
93%|█████████▎| 9072/9770 [1:43:36<07:48, 1.49it/s]
93%|█████████▎| 9073/9770 [1:43:37<07:46, 1.49it/s]
+0: {'loss': 0.6515, 'grad_norm': 0.5960115992769335, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.93}
+0:
93%|█████████▎| 9074/9770 [1:43:37<07:52, 1.47it/s]
93%|█████████▎| 9075/9770 [1:43:38<07:58, 1.45it/s]
93%|█████████▎| 9076/9770 [1:43:39<07:47, 1.48it/s]
93%|█████████▎| 9077/9770 [1:43:39<07:46, 1.49it/s]
93%|█████████▎| 9078/9770 [1:43:40<07:42, 1.50it/s]
93%|█████████▎| 9079/9770 [1:43:41<07:38, 1.51it/s]
93%|█████████▎| 9080/9770 [1:43:41<07:31, 1.53it/s]
93%|█████████▎| 9080/9770 [1:43:41<07:31, 1.53it/s]
93%|█████████▎| 9081/9770 [1:43:42<07:30, 1.53it/s]
93%|█████████▎| 9082/9770 [1:43:43<07:27, 1.54it/s]
93%|█████████▎| 9083/9770 [1:43:43<07:23, 1.55it/s]
93%|█████████▎| 9084/9770 [1:43:44<07:15, 1.58it/s]
93%|█████████▎| 9085/9770 [1:43:45<07:17, 1.57it/s]
93%|�
+0: {'loss': 0.6481, 'grad_norm': 0.5685386914118158, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.93}
+0: ��████████▎| 9086/9770 [1:43:45<07:20, 1.55it/s]
93%|█████████▎| 9087/9770 [1:43:46<07:15, 1.57it/s]
93%|█████████▎| 9088/9770 [1:43:47<07:20, 1.55it/s]
93%|█████████▎| 9089/9770 [1:43:47<07:17, 1.56it/s]
93%|█████████▎| 9090/9770 [1:43:48<07:18, 1.55it/s]
93%|█████████▎| 9090/9770 [1:43:48<07:18, 1.55it/s]
93%|█████████▎| 9091/9770 [1:43:48<07:17, 1.55it/s]
93%|█████████▎| 9092/9770 [1:43:49<07:15, 1.56it/s]
93%|█████████▎| 9093/9770 [1:43:50<07:12, 1.57it/s]
93%|█████████▎| 9094/9770 [1:43:50<07:09, 1.57it/s]
93%|█████████▎| 9095/9770 [1:43:51<07:10, 1.57it/s]
93%|█████████▎| 9096/9770 [1:43:52<07:12, 1.56it/s]
93%|█████████▎| 9097/9770 [1:43:52<07:18, 1.53it/s]
93%|██�
+0: {'loss': 0.6456, 'grad_norm': 0.5753653422857062, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.93}
+0: �██████▎| 9098/9770 [1:43:53<07:21, 1.52it/s]
93%|█████████▎| 9099/9770 [1:43:54<07:18, 1.53it/s]
93%|█████████▎| 9100/9770 [1:43:54<07:19, 1.52it/s]
93%|█████████▎| 9100/9770 [1:43:54<07:19, 1.52it/s]
93%|█████████▎| 9101/9770 [1:43:55<07:18, 1.53it/s]
93%|█████████▎| 9102/9770 [1:43:56<07:21, 1.51it/s]
93%|█████████▎| 9103/9770 [1:43:56<07:30, 1.48it/s]
93%|█████████▎| 9104/9770 [1:43:57<07:25, 1.50it/s]
93%|█████████▎| 9105/9770 [1:43:58<07:19, 1.51it/s]
93%|█████████▎| 9106/9770 [1:43:58<07:17, 1.52it/s]
93%|█████████▎| 9107/9770 [1:43:59<07:16, 1.52it/s]
93%|█████████▎| 9108/9770 [1:44:00<07:14, 1.52it/s]
93%|█████████▎| 9109/9770 [1:44:00<07:08, 1.54it/s]
93%|█████
+0: {'loss': 0.6493, 'grad_norm': 0.5896975597523512, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.93}
+0: {'loss': 0.6599, 'grad_norm': 0.6162107450486746, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.93}
+0: ████▎| 9110/9770 [1:44:01<07:06, 1.55it/s]
93%|█████████▎| 9110/9770 [1:44:01<07:06, 1.55it/s]
93%|█████████▎| 9111/9770 [1:44:02<07:05, 1.55it/s]
93%|█████████▎| 9112/9770 [1:44:02<07:04, 1.55it/s]
93%|█████████▎| 9113/9770 [1:44:03<06:59, 1.57it/s]
93%|█████████▎| 9114/9770 [1:44:04<07:41, 1.42it/s]
93%|█████████▎| 9115/9770 [1:44:04<07:39, 1.43it/s]
93%|█████████▎| 9116/9770 [1:44:05<07:31, 1.45it/s]
93%|█████████▎| 9117/9770 [1:44:06<07:17, 1.49it/s]
93%|█████████▎| 9118/9770 [1:44:06<07:04, 1.53it/s]
93%|█████████▎| 9119/9770 [1:44:07<07:04, 1.53it/s]
93%|█████████▎| 9120/9770 [1:44:08<07:02, 1.54it/s]
93%|█████████▎| 9120/9770
+0: {'loss': 0.6357, 'grad_norm': 0.5991133982171295, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.93}
+0: [1:44:08<07:02, 1.54it/s]
93%|█████████▎| 9121/9770 [1:44:08<07:05, 1.52it/s]
93%|█████████▎| 9122/9770 [1:44:09<07:01, 1.54it/s]
93%|█████████▎| 9123/9770 [1:44:10<07:06, 1.52it/s]
93%|█████████▎| 9124/9770 [1:44:10<06:59, 1.54it/s]
93%|█████████▎| 9125/9770 [1:44:11<06:53, 1.56it/s]
93%|█████████▎| 9126/9770 [1:44:11<06:54, 1.55it/s]
93%|█████████▎| 9127/9770 [1:44:12<06:57, 1.54it/s]
93%|█████████▎| 9128/9770 [1:44:13<06:51, 1.56it/s]
93%|█████████▎| 9129/9770 [1:44:13<06:51, 1.56it/s]
93%|█████████▎| 9130/9770 [1:44:14<06:54, 1.54it/s]
93%|█████████▎| 9130/9770 [1:44:14<06:54, 1.54it/s]
93%|█████████▎| 9131/9770 [1:44:15<06:53, 1.55it/s]
93%|█████████▎| 9132/9770 [1:44:
+0: {'loss': 0.6574, 'grad_norm': 0.6206182712005641, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.94}
+0: 15<06:53, 1.54it/s]
93%|█████████▎| 9133/9770 [1:44:16<06:58, 1.52it/s]
93%|█████████▎| 9134/9770 [1:44:17<06:55, 1.53it/s]
94%|█████████▎| 9135/9770 [1:44:17<06:57, 1.52it/s]
94%|█████████▎| 9136/9770 [1:44:18<06:53, 1.53it/s]
94%|█████████▎| 9137/9770 [1:44:19<07:01, 1.50it/s]
94%|█████████▎| 9138/9770 [1:44:19<06:53, 1.53it/s]
94%|█████████▎| 9139/9770 [1:44:20<06:50, 1.54it/s]
94%|█████████▎| 9140/9770 [1:44:21<06:47, 1.54it/s]
94%|█████████▎| 9140/9770 [1:44:21<06:47, 1.54it/s]
94%|█████████▎| 9141/9770 [1:44:21<06:47, 1.55it/s]
94%|█████████▎| 9142/9770 [1:44:22<06:45, 1.55it/s]
94%|█████████▎| 9143/9770 [1:44:22<06:44, 1.55it/s]
94%|█████████▎| 9144/9770 [1:44:23<06:5
+0: {'loss': 0.6345, 'grad_norm': 0.6449612752162264, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.94}
+0: 1, 1.52it/s]
94%|█████████▎| 9145/9770 [1:44:24<06:54, 1.51it/s]
94%|█████████▎| 9146/9770 [1:44:25<07:00, 1.48it/s]
94%|█████████▎| 9147/9770 [1:44:25<06:54, 1.50it/s]
94%|█████████▎| 9148/9770 [1:44:26<06:50, 1.51it/s]
94%|█████████▎| 9149/9770 [1:44:27<06:55, 1.49it/s]
94%|█████████▎| 9150/9770 [1:44:27<06:54, 1.50it/s]
94%|█████████▎| 9150/9770 [1:44:27<06:54, 1.50it/s]
94%|█████████▎| 9151/9770 [1:44:28<06:45, 1.53it/s]
94%|█████████▎| 9152/9770 [1:44:28<06:43, 1.53it/s]
94%|█████████▎| 9153/9770 [1:44:29<06:40, 1.54it/s]
94%|█████████▎| 9154/9770 [1:44:30<06:40, 1.54it/s]
94%|█████████▎| 9155/9770 [1:44:30<06:39, 1.54it/s]
94%|█████████▎| 9156/9770 [1:44:31<06:38, 1.5
+0: {'loss': 0.6433, 'grad_norm': 0.6309728032732347, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.94}
+0: 4it/s]
94%|█████████▎| 9157/9770 [1:44:32<06:38, 1.54it/s]
94%|█████████▎| 9158/9770 [1:44:32<06:38, 1.54it/s]
94%|█████████▎| 9159/9770 [1:44:33<06:37, 1.54it/s]
94%|█████████▍| 9160/9770 [1:44:34<06:32, 1.55it/s]
94%|█████████▍| 9160/9770 [1:44:34<06:32, 1.55it/s]
94%|█████████▍| 9161/9770 [1:44:34<06:33, 1.55it/s]
94%|█████████▍| 9162/9770 [1:44:35<06:33, 1.54it/s]
94%|█████████▍| 9163/9770 [1:44:36<06:40, 1.52it/s]
94%|█████████▍| 9164/9770 [1:44:36<06:47, 1.49it/s]
94%|█████████▍| 9165/9770 [1:44:37<06:49, 1.48it/s]
94%|█████████▍| 9166/9770 [1:44:38<06:41, 1.50it/s]
94%|█████████▍| 9167/9770 [1:44:38<06:39, 1.51it/s]
94%|█████████▍| 9168/9770 [1:44:39<06:34, 1.53it/s]
+0: {'loss': 0.6532, 'grad_norm': 0.6295363364371517, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.94}
+0: 94%|█████████▍| 9169/9770 [1:44:40<06:31, 1.54it/s]
94%|█████████▍| 9170/9770 [1:44:40<06:26, 1.55it/s]
94%|█████████▍| 9170/9770 [1:44:40<06:26, 1.55it/s]
94%|█████████▍| 9171/9770 [1:44:41<06:25, 1.55it/s]
94%|█████████▍| 9172/9770 [1:44:42<06:25, 1.55it/s]
94%|█████████▍| 9173/9770 [1:44:42<06:30, 1.53it/s]
94%|█████████▍| 9174/9770 [1:44:43<06:27, 1.54it/s]
94%|█████████▍| 9175/9770 [1:44:44<06:33, 1.51it/s]
94%|█████████▍| 9176/9770 [1:44:44<06:33, 1.51it/s]
94%|█████████▍| 9177/9770 [1:44:45<06:33, 1.51it/s]
94%|█████████▍| 9178/9770 [1:44:45<06:27, 1.53it/s]
94%|█████████▍| 9179/9770 [1:44:46<06:26, 1.53it/s]
94%|█████████▍| 9180/9770 [1:44:47<06:20, 1.55it/s]
+0: {'loss': 0.6506, 'grad_norm': 0.6223546298723089, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.94}
+0: {'loss': 0.6593, 'grad_norm': 0.6518014406327484, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.94}
+0:
94%|█████████▍| 9180/9770 [1:44:47<06:20, 1.55it/s]
94%|█████████▍| 9181/9770 [1:44:47<06:20, 1.55it/s]
94%|█████████▍| 9182/9770 [1:44:48<06:29, 1.51it/s]
94%|█████████▍| 9183/9770 [1:44:49<06:37, 1.48it/s]
94%|█████████▍| 9184/9770 [1:44:49<06:29, 1.50it/s]
94%|█████████▍| 9185/9770 [1:44:50<06:32, 1.49it/s]
94%|█████████▍| 9186/9770 [1:44:51<06:35, 1.47it/s]
94%|█████████▍| 9187/9770 [1:44:51<06:29, 1.50it/s]
94%|█████████▍| 9188/9770 [1:44:52<06:35, 1.47it/s]
94%|█████████▍| 9189/9770 [1:44:53<06:30, 1.49it/s]
94%|█████████▍| 9190/9770 [1:44:54<06:29, 1.49it/s]
94%|█████████▍| 9190/9770 [1:44:54<06:29, 1.49it/s]
94%|█████████�
+0: {'loss': 0.6442, 'grad_norm': 0.5878341105812127, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.94}
+0: ��| 9191/9770 [1:44:54<06:26, 1.50it/s]
94%|█████████▍| 9192/9770 [1:44:55<06:21, 1.52it/s]
94%|█████████▍| 9193/9770 [1:44:55<06:21, 1.51it/s]
94%|█████████▍| 9194/9770 [1:44:56<06:18, 1.52it/s]
94%|█████████▍| 9195/9770 [1:44:57<06:16, 1.53it/s]
94%|█████████▍| 9196/9770 [1:44:57<06:17, 1.52it/s]
94%|█████████▍| 9197/9770 [1:44:58<06:12, 1.54it/s]
94%|█████████▍| 9198/9770 [1:44:59<06:21, 1.50it/s]
94%|█████████▍| 9199/9770 [1:44:59<06:14, 1.52it/s]
94%|█████████▍| 9200/9770 [1:45:00<06:11, 1.54it/s]
94%|█████████▍| 9200/9770 [1:45:00<06:11, 1.54it/s]
94%|█████████▍| 9201/9770 [1:45:01<06:07, 1.55it/s]
94%|█████████▍| 9202/9770 [1:45:01<06:05, 1.56it/s]
94%|█████████▍| 920
+0: {'loss': 0.6617, 'grad_norm': 0.6047952422693393, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.94}
+0: 3/9770 [1:45:02<06:05, 1.55it/s]
94%|█████████▍| 9204/9770 [1:45:03<06:07, 1.54it/s]
94%|█████████▍| 9205/9770 [1:45:03<06:02, 1.56it/s]
94%|█████████▍| 9206/9770 [1:45:04<06:02, 1.56it/s]
94%|█████████▍| 9207/9770 [1:45:05<06:01, 1.56it/s]
94%|█████████▍| 9208/9770 [1:45:05<06:05, 1.54it/s]
94%|█████████▍| 9209/9770 [1:45:06<06:05, 1.54it/s]
94%|█████████▍| 9210/9770 [1:45:07<06:11, 1.51it/s]
94%|█████████▍| 9210/9770 [1:45:07<06:11, 1.51it/s]
94%|█████████▍| 9211/9770 [1:45:07<06:13, 1.50it/s]
94%|█████████▍| 9212/9770 [1:45:08<06:12, 1.50it/s]
94%|█████████▍| 9213/9770 [1:45:09<06:13, 1.49it/s]
94%|█████████▍| 9214/9770 [1:45:09<06:13, 1.49it/s]
94%|█████████▍| 9215/9770
+0: {'loss': 0.6493, 'grad_norm': 0.5983847840084231, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.94}
+0: [1:45:10<06:12, 1.49it/s]
94%|█████████▍| 9216/9770 [1:45:11<06:07, 1.51it/s]
94%|█████████▍| 9217/9770 [1:45:11<06:06, 1.51it/s]
94%|█████████▍| 9218/9770 [1:45:12<05:58, 1.54it/s]
94%|█████████▍| 9219/9770 [1:45:12<05:58, 1.54it/s]
94%|█████████▍| 9220/9770 [1:45:13<05:56, 1.54it/s]
94%|█████████▍| 9220/9770 [1:45:13<05:56, 1.54it/s]
94%|█████████▍| 9221/9770 [1:45:14<05:55, 1.54it/s]
94%|█████████▍| 9222/9770 [1:45:14<06:02, 1.51it/s]
94%|█████████▍| 9223/9770 [1:45:15<06:01, 1.51it/s]
94%|█████████▍| 9224/9770 [1:45:16<05:59, 1.52it/s]
94%|█████████▍| 9225/9770 [1:45:16<06:00, 1.51it/s]
94%|█████████▍| 9226/9770 [1:45:17<06:05, 1.49it/s]
94%|█████████▍| 9227/9770 [1:45:1
+0: {'loss': 0.6635, 'grad_norm': 0.5806640127898456, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.94}
+0: 8<05:57, 1.52it/s]
94%|█████████▍| 9228/9770 [1:45:18<05:56, 1.52it/s]
94%|█████████▍| 9229/9770 [1:45:19<05:56, 1.52it/s]
94%|█████████▍| 9230/9770 [1:45:20<05:51, 1.54it/s]
94%|█████████▍| 9230/9770 [1:45:20<05:51, 1.54it/s]
94%|█████████▍| 9231/9770 [1:45:20<05:47, 1.55it/s]
94%|█████████▍| 9232/9770 [1:45:21<05:41, 1.58it/s]
95%|█████████▍| 9233/9770 [1:45:22<05:44, 1.56it/s]
95%|█████████▍| 9234/9770 [1:45:22<05:45, 1.55it/s]
95%|█████████▍| 9235/9770 [1:45:23<05:45, 1.55it/s]
95%|█████████▍| 9236/9770 [1:45:24<05:46, 1.54it/s]
95%|█████████▍| 9237/9770 [1:45:24<05:50, 1.52it/s]
95%|█████████▍| 9238/9770 [1:45:25<05:51, 1.51it/s]
95%|█████████▍| 9239/9770 [1:45:26<05:47
+0: {'loss': 0.6393, 'grad_norm': 0.6085019158392394, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.95}
+0: {'loss': 0.6393, 'grad_norm': 0.6722992955193487, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.95}
+0: , 1.53it/s]
95%|█████████▍| 9240/9770 [1:45:26<05:46, 1.53it/s]
95%|█████████▍| 9240/9770 [1:45:26<05:46, 1.53it/s]
95%|█████████▍| 9241/9770 [1:45:27<05:51, 1.51it/s]
95%|█████████▍| 9242/9770 [1:45:28<05:52, 1.50it/s]
95%|█████████▍| 9243/9770 [1:45:28<05:48, 1.51it/s]
95%|█████████▍| 9244/9770 [1:45:29<05:43, 1.53it/s]
95%|█████████▍| 9245/9770 [1:45:30<05:43, 1.53it/s]
95%|█████████▍| 9246/9770 [1:45:30<05:50, 1.50it/s]
95%|█████████▍| 9247/9770 [1:45:31<05:47, 1.50it/s]
95%|█████████▍| 9248/9770 [1:45:31<05:37, 1.55it/s]
95%|█████████▍| 9249/9770 [1:45:32<05:35, 1.55it/s]
95%|█████���███▍| 9250/9770 [1:45:33<05:36, 1.55it/s]
95%|██�
+0: {'loss': 0.6574, 'grad_norm': 0.6577717992564499, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.95}
+0: �██████▍| 9250/9770 [1:45:33<05:36, 1.55it/s]
95%|█████████▍| 9251/9770 [1:45:33<05:32, 1.56it/s]
95%|█████████▍| 9252/9770 [1:45:34<05:32, 1.56it/s]
95%|█████████▍| 9253/9770 [1:45:35<05:33, 1.55it/s]
95%|█████████▍| 9254/9770 [1:45:35<05:40, 1.51it/s]
95%|█████████▍| 9255/9770 [1:45:36<05:33, 1.54it/s]
95%|█████████▍| 9256/9770 [1:45:37<05:35, 1.53it/s]
95%|█████████▍| 9257/9770 [1:45:37<05:33, 1.54it/s]
95%|█████████▍| 9258/9770 [1:45:38<05:32, 1.54it/s]
95%|█████████▍| 9259/9770 [1:45:39<05:31, 1.54it/s]
95%|█████████▍| 9260/9770 [1:45:39<05:33, 1.53it/s]
95%|█████████▍| 9260/9770 [1:45:39<05:33, 1.53it/s]
95%|█████████▍| 9261/9770 [1:45:40<05:33, 1.53it/s]
95%|█████
+0: {'loss': 0.635, 'grad_norm': 0.6106677820634747, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.95}
+0: ████▍| 9262/9770 [1:45:41<05:32, 1.53it/s]
95%|█████████▍| 9263/9770 [1:45:41<05:31, 1.53it/s]
95%|█████████▍| 9264/9770 [1:45:42<05:29, 1.53it/s]
95%|█████████▍| 9265/9770 [1:45:43<05:26, 1.55it/s]
95%|█████████▍| 9266/9770 [1:45:43<05:25, 1.55it/s]
95%|█████████▍| 9267/9770 [1:45:44<05:24, 1.55it/s]
95%|█████████▍| 9268/9770 [1:45:44<05:24, 1.55it/s]
95%|█████████▍| 9269/9770 [1:45:45<05:30, 1.52it/s]
95%|█████████▍| 9270/9770 [1:45:46<05:26, 1.53it/s]
95%|█████████▍| 9270/9770 [1:45:46<05:26, 1.53it/s]
95%|█████████▍| 9271/9770 [1:45:46<05:32, 1.50it/s]
95%|█████████▍| 9272/9770 [1:45:47<05:24, 1.54it/s]
95%|█████████▍| 9273/9770 [1:45:48<05:18, 1.56it/s]
95%|███████�
+0: {'loss': 0.6459, 'grad_norm': 0.5892172470406517, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.95}
+0: ��█▍| 9274/9770 [1:45:48<05:15, 1.57it/s]
95%|█████████▍| 9275/9770 [1:45:49<05:17, 1.56it/s]
95%|█████████▍| 9276/9770 [1:45:50<05:15, 1.57it/s]
95%|█████████▍| 9277/9770 [1:45:50<05:16, 1.56it/s]
95%|█████████▍| 9278/9770 [1:45:51<05:26, 1.51it/s]
95%|█████████▍| 9279/9770 [1:45:52<05:21, 1.53it/s]
95%|█████████▍| 9280/9770 [1:45:52<05:24, 1.51it/s]
95%|█████████▍| 9280/9770 [1:45:52<05:24, 1.51it/s]
95%|█████████▍| 9281/9770 [1:45:53<05:23, 1.51it/s]
95%|█████████▌| 9282/9770 [1:45:54<05:19, 1.53it/s]
95%|█████████▌| 9283/9770 [1:45:54<05:15, 1.54it/s]
95%|█████████▌| 9284/9770 [1:45:55<05:12, 1.55it/s]
95%|█████████▌| 9285/9770 [1:45:56<05:10, 1.56it/s]
95%|█████████�
+0: {'loss': 0.6606, 'grad_norm': 0.6250498018994596, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.95}
+0: �| 9286/9770 [1:45:56<05:08, 1.57it/s]
95%|█████████▌| 9287/9770 [1:45:57<05:18, 1.52it/s]
95%|█████████▌| 9288/9770 [1:45:57<05:15, 1.53it/s]
95%|█████████▌| 9289/9770 [1:45:58<05:13, 1.53it/s]
95%|█████████▌| 9290/9770 [1:45:59<05:11, 1.54it/s]
95%|█████████▌| 9290/9770 [1:45:59<05:11, 1.54it/s]
95%|█████████▌| 9291/9770 [1:45:59<05:12, 1.53it/s]
95%|█████████▌| 9292/9770 [1:46:00<05:16, 1.51it/s]
95%|█████████▌| 9293/9770 [1:46:01<05:13, 1.52it/s]
95%|█████████▌| 9294/9770 [1:46:01<05:10, 1.54it/s]
95%|█████████▌| 9295/9770 [1:46:02<05:11, 1.53it/s]
95%|█████████▌| 9296/9770 [1:46:03<05:09, 1.53it/s]
95%|█████████▌| 9297/9770 [1:46:03<05:05, 1.55it/s]
95%|█████████▌| 9298
+0: {'loss': 0.6562, 'grad_norm': 0.6241406870419981, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.95}
+0: /9770 [1:46:04<05:05, 1.54it/s]
95%|█████████▌| 9299/9770 [1:46:05<05:04, 1.55it/s]
95%|█████████▌| 9300/9770 [1:46:05<05:05, 1.54it/s]
95%|█████████▌| 9300/9770 [1:46:05<05:05, 1.54it/s]
95%|█████████▌| 9301/9770 [1:46:06<05:02, 1.55it/s]
95%|█████████▌| 9302/9770 [1:46:07<05:02, 1.55it/s]
95%|█████████▌| 9303/9770 [1:46:07<05:01, 1.55it/s]
95%|█████████▌| 9304/9770 [1:46:08<04:57, 1.56it/s]
95%|█████████▌| 9305/9770 [1:46:09<05:04, 1.53it/s]
95%|█████████▌| 9306/9770 [1:46:09<05:03, 1.53it/s]
95%|█████████▌| 9307/9770 [1:46:10<04:58, 1.55it/s]
95%|█████████▌| 9308/9770 [1:46:10<05:02, 1.53it/s]
95%|█████████▌| 9309/9770 [1:46:11<05:03, 1.52it/s]
95%|█████████▌| 9310/9770 [
+0: {'loss': 0.6557, 'grad_norm': 0.6075309381489284, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.95}
+0: {'loss': 0.6499, 'grad_norm': 0.6565779546056522, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.95}
+0: 1:46:12<04:59, 1.54it/s]
95%|█████████▌| 9310/9770 [1:46:12<04:59, 1.54it/s]
95%|█████████▌| 9311/9770 [1:46:12<04:58, 1.54it/s]
95%|█████████▌| 9312/9770 [1:46:13<04:56, 1.55it/s]
95%|█████████▌| 9313/9770 [1:46:14<04:53, 1.56it/s]
95%|█████████▌| 9314/9770 [1:46:14<04:50, 1.57it/s]
95%|█████████▌| 9315/9770 [1:46:15<04:50, 1.57it/s]
95%|█████████▌| 9316/9770 [1:46:16<04:51, 1.56it/s]
95%|█████████▌| 9317/9770 [1:46:16<04:46, 1.58it/s]
95%|█████████▌| 9318/9770 [1:46:17<04:46, 1.58it/s]
95%|█████████▌| 9319/9770 [1:46:18<04:49, 1.56it/s]
95%|█████████▌| 9320/9770 [1:46:18<04:49, 1.55it/s]
95%|█████████▌| 9320/9770 [1:46:18<04:49, 1.55it/s]
+0: {'loss': 0.6326, 'grad_norm': 0.6006315586133305, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.95}
+0: 95%|█████████▌| 9321/9770 [1:46:19<04:54, 1.52it/s]
95%|█████████▌| 9322/9770 [1:46:20<04:52, 1.53it/s]
95%|█████████▌| 9323/9770 [1:46:20<04:49, 1.55it/s]
95%|█████████▌| 9324/9770 [1:46:21<04:48, 1.55it/s]
95%|█████████▌| 9325/9770 [1:46:21<04:49, 1.53it/s]
95%|█████████▌| 9326/9770 [1:46:22<04:47, 1.54it/s]
95%|█████████▌| 9327/9770 [1:46:23<04:43, 1.56it/s]
95%|█████████▌| 9328/9770 [1:46:23<04:43, 1.56it/s]
95%|█████████▌| 9329/9770 [1:46:24<04:48, 1.53it/s]
95%|█████████▌| 9330/9770 [1:46:25<04:46, 1.54it/s]
95%|█████████▌| 9330/9770 [1:46:25<04:46, 1.54it/s]
96%|█████████▌| 9331/9770 [1:46:25<04:42, 1.55it/s]
96%|█████████▌| 9332/9770 [1:46:26<04:38, 1.57it/s]
96%|�
+0: {'loss': 0.6617, 'grad_norm': 0.6010492938213644, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.96}
+0: �████████▌| 9333/9770 [1:46:27<04:41, 1.55it/s]
96%|█████████▌| 9334/9770 [1:46:27<04:48, 1.51it/s]
96%|█████████▌| 9335/9770 [1:46:28<04:44, 1.53it/s]
96%|█████████▌| 9336/9770 [1:46:29<04:41, 1.54it/s]
96%|█████████▌| 9337/9770 [1:46:29<04:40, 1.54it/s]
96%|█████████▌| 9338/9770 [1:46:30<04:37, 1.56it/s]
96%|█████████▌| 9339/9770 [1:46:30<04:36, 1.56it/s]
96%|█████████▌| 9340/9770 [1:46:31<04:35, 1.56it/s]
96%|█████████▌| 9340/9770 [1:46:31<04:35, 1.56it/s]
96%|█████████▌| 9341/9770 [1:46:32<04:41, 1.52it/s]
96%|█████████▌| 9342/9770 [1:46:32<04:40, 1.53it/s]
96%|█████████▌| 9343/9770 [1:46:33<04:38, 1.53it/s]
96%|█████████▌| 9344/9770 [1:46:34<04:43, 1.50it/s]
96%|███
+0: {'loss': 0.6654, 'grad_norm': 0.5917335559990318, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.96}
+0: ██████▌| 9345/9770 [1:46:34<04:40, 1.51it/s]
96%|█████████▌| 9346/9770 [1:46:35<04:38, 1.52it/s]
96%|█████████▌| 9347/9770 [1:46:36<04:35, 1.53it/s]
96%|█████████▌| 9348/9770 [1:46:36<04:34, 1.54it/s]
96%|█████████▌| 9349/9770 [1:46:37<04:33, 1.54it/s]
96%|█████████▌| 9350/9770 [1:46:38<04:34, 1.53it/s]
96%|█████████▌| 9350/9770 [1:46:38<04:34, 1.53it/s]
96%|█████████▌| 9351/9770 [1:46:38<04:31, 1.55it/s]
96%|█████████▌| 9352/9770 [1:46:39<04:30, 1.54it/s]
96%|█████████▌| 9353/9770 [1:46:40<04:29, 1.55it/s]
96%|█████████▌| 9354/9770 [1:46:40<04:27, 1.56it/s]
96%|█████████▌| 9355/9770 [1:46:41<04:25, 1.57it/s]
96%|█████████▌| 9356/9770 [1:46:42<04:24, 1.56it/s]
96%|█████�
+0: {'loss': 0.6399, 'grad_norm': 0.6185418935928941, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.96}
+0: ��███▌| 9357/9770 [1:46:42<04:25, 1.56it/s]
96%|█████████▌| 9358/9770 [1:46:43<04:23, 1.56it/s]
96%|█████████▌| 9359/9770 [1:46:43<04:23, 1.56it/s]
96%|█████████▌| 9360/9770 [1:46:44<04:21, 1.57it/s]
96%|█████████▌| 9360/9770 [1:46:44<04:21, 1.57it/s]
96%|█████████▌| 9361/9770 [1:46:45<04:21, 1.56it/s]
96%|█████████▌| 9362/9770 [1:46:45<04:20, 1.57it/s]
96%|█████████▌| 9363/9770 [1:46:46<04:17, 1.58it/s]
96%|█████████▌| 9364/9770 [1:46:47<04:19, 1.56it/s]
96%|█████████▌| 9365/9770 [1:46:47<04:24, 1.53it/s]
96%|█████████▌| 9366/9770 [1:46:48<04:21, 1.55it/s]
96%|█████████▌| 9367/9770 [1:46:49<04:18, 1.56it/s]
96%|█████████▌| 9368/9770 [1:46:49<04:18, 1.55it/s]
96%|███████�
+0: {'loss': 0.654, 'grad_norm': 0.5818399760468063, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.96}
+0: �█▌| 9369/9770 [1:46:50<04:18, 1.55it/s]
96%|█████████▌| 9370/9770 [1:46:51<04:18, 1.55it/s]
96%|█████████▌| 9370/9770 [1:46:51<04:18, 1.55it/s]
96%|█████████▌| 9371/9770 [1:46:51<04:18, 1.54it/s]
96%|█████████▌| 9372/9770 [1:46:52<04:20, 1.53it/s]
96%|█████████▌| 9373/9770 [1:46:53<04:17, 1.54it/s]
96%|█████████▌| 9374/9770 [1:46:53<04:17, 1.54it/s]
96%|█████████▌| 9375/9770 [1:46:54<04:12, 1.56it/s]
96%|█████████▌| 9376/9770 [1:46:54<04:15, 1.54it/s]
96%|█████████▌| 9377/9770 [1:46:55<04:14, 1.54it/s]
96%|█████████▌| 9378/9770 [1:46:56<04:13, 1.55it/s]
96%|█████████▌| 9379/9770 [1:46:56<04:17, 1.52it/s]
96%|█████████▌| 9380/9770 [1:46:57<04:14, 1.53it/s]
+0: {'loss': 0.6354, 'grad_norm': 0.5840479474234904, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.96}
+0: {'loss': 0.6384, 'grad_norm': 0.5873396434065948, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.96}
+0:
96%|█████████▌| 9380/9770 [1:46:57<04:14, 1.53it/s]
96%|█████████▌| 9381/9770 [1:46:58<04:10, 1.55it/s]
96%|█████████▌| 9382/9770 [1:46:58<04:08, 1.56it/s]
96%|█████████▌| 9383/9770 [1:46:59<04:06, 1.57it/s]
96%|█████████▌| 9384/9770 [1:47:00<04:05, 1.57it/s]
96%|█████████▌| 9385/9770 [1:47:00<04:06, 1.56it/s]
96%|█████████▌| 9386/9770 [1:47:01<04:06, 1.56it/s]
96%|█████████▌| 9387/9770 [1:47:02<04:06, 1.55it/s]
96%|█████████▌| 9388/9770 [1:47:02<04:02, 1.58it/s]
96%|█████████▌| 9389/9770 [1:47:03<04:01, 1.58it/s]
96%|█████████▌| 9390/9770 [1:47:03<03:59, 1.58it/s]
96%|█████████▌| 9390/9770 [1:47:03<03:59, 1.58it/s]
96%|█████████▌| 9391/9770 [1:47:04<04:03
+0: {'loss': 0.6383, 'grad_norm': 0.6130380983503895, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.96}
+0: , 1.56it/s]
96%|█████████▌| 9392/9770 [1:47:05<04:04, 1.55it/s]
96%|█████████▌| 9393/9770 [1:47:05<04:02, 1.55it/s]
96%|█████████▌| 9394/9770 [1:47:06<03:58, 1.58it/s]
96%|█████████▌| 9395/9770 [1:47:07<03:55, 1.59it/s]
96%|█████████▌| 9396/9770 [1:47:07<03:55, 1.59it/s]
96%|█████████▌| 9397/9770 [1:47:08<03:55, 1.58it/s]
96%|█████████▌| 9398/9770 [1:47:09<03:57, 1.57it/s]
96%|█████████▌| 9399/9770 [1:47:09<03:59, 1.55it/s]
96%|█████████▌| 9400/9770 [1:47:10<04:01, 1.53it/s]
96%|█████████▌| 9400/9770 [1:47:10<04:01, 1.53it/s]
96%|█████████▌| 9401/9770 [1:47:10<03:59, 1.54it/s]
96%|█████████▌| 9402/9770 [1:47:11<04:02, 1.52it/s]
96%|█████████▌| 9403/9770 [1:47:12<03:58, 1.54
+0: {'loss': 0.639, 'grad_norm': 0.621869921293018, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.96}
+0: it/s]
96%|█████████▋| 9404/9770 [1:47:12<04:01, 1.52it/s]
96%|█████████▋| 9405/9770 [1:47:13<03:56, 1.54it/s]
96%|█████████▋| 9406/9770 [1:47:14<03:56, 1.54it/s]
96%|█████████▋| 9407/9770 [1:47:14<03:54, 1.55it/s]
96%|█████████▋| 9408/9770 [1:47:15<03:53, 1.55it/s]
96%|█████████▋| 9409/9770 [1:47:16<03:52, 1.56it/s]
96%|█████████▋| 9410/9770 [1:47:16<03:51, 1.55it/s]
96%|█████████▋| 9410/9770 [1:47:16<03:51, 1.55it/s]
96%|█████████▋| 9411/9770 [1:47:17<03:58, 1.51it/s]
96%|█████████▋| 9412/9770 [1:47:18<03:56, 1.51it/s]
96%|█████████▋| 9413/9770 [1:47:18<03:54, 1.53it/s]
96%|█████████▋| 9414/9770 [1:47:19<03:52, 1.53it/s]
96%|█████████▋| 9415/9770 [1:47:20<03:51, 1.53it/s]
+0: {'loss': 0.6508, 'grad_norm': 0.5863603485788632, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.96}
+0: 96%|█████████▋| 9416/9770 [1:47:20<03:48, 1.55it/s]
96%|█████████▋| 9417/9770 [1:47:21<03:48, 1.55it/s]
96%|█████████▋| 9418/9770 [1:47:22<03:46, 1.55it/s]
96%|█████████▋| 9419/9770 [1:47:22<03:44, 1.56it/s]
96%|█████████▋| 9420/9770 [1:47:23<03:49, 1.52it/s]
96%|█████████▋| 9420/9770 [1:47:23<03:49, 1.52it/s]
96%|█████████▋| 9421/9770 [1:47:24<03:49, 1.52it/s]
96%|█████████▋| 9422/9770 [1:47:24<03:45, 1.54it/s]
96%|█████████▋| 9423/9770 [1:47:25<03:44, 1.54it/s]
96%|█████████▋| 9424/9770 [1:47:25<03:40, 1.57it/s]
96%|█████████▋| 9425/9770 [1:47:26<03:36, 1.59it/s]
96%|█████████▋| 9426/9770 [1:47:27<03:36, 1.59it/s]
96%|█████████▋| 9427/9770 [1:47:27<03:35, 1.59it/s]
96%|█
+0: {'loss': 0.6523, 'grad_norm': 0.6000655950105521, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.96}
+0: ████████▋| 9428/9770 [1:47:28<03:39, 1.56it/s]
97%|█████████▋| 9429/9770 [1:47:29<03:37, 1.57it/s]
97%|█████████▋| 9430/9770 [1:47:29<03:36, 1.57it/s]
97%|█████████▋| 9430/9770 [1:47:29<03:36, 1.57it/s]
97%|█████████▋| 9431/9770 [1:47:30<03:37, 1.56it/s]
97%|█████████▋| 9432/9770 [1:47:30<03:35, 1.57it/s]
97%|█████████▋| 9433/9770 [1:47:31<03:35, 1.56it/s]
97%|█████████▋| 9434/9770 [1:47:32<03:37, 1.55it/s]
97%|█████████▋| 9435/9770 [1:47:32<03:36, 1.55it/s]
97%|█████████▋| 9436/9770 [1:47:33<03:39, 1.52it/s]
97%|█████████▋| 9437/9770 [1:47:34<03:36, 1.54it/s]
97%|█████████▋| 9438/9770 [1:47:34<03:35, 1.54it/s]
97%|█████████▋| 9439/9770 [1:47:35<03:31, 1.57it/s]
97%|███�
+0: {'loss': 0.651, 'grad_norm': 0.5690883540561729, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.97}
+0: {'loss': 0.6423, 'grad_norm': 0.5931820646400616, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.97}
+0: ��█████▋| 9440/9770 [1:47:36<03:31, 1.56it/s]
97%|█████████▋| 9440/9770 [1:47:36<03:31, 1.56it/s]
97%|█████████▋| 9441/9770 [1:47:36<03:32, 1.55it/s]
97%|█████████▋| 9442/9770 [1:47:37<03:31, 1.55it/s]
97%|█████████▋| 9443/9770 [1:47:38<03:30, 1.55it/s]
97%|█████████▋| 9444/9770 [1:47:38<03:31, 1.54it/s]
97%|█████████▋| 9445/9770 [1:47:39<03:33, 1.52it/s]
97%|█████████▋| 9446/9770 [1:47:40<03:31, 1.53it/s]
97%|█████████▋| 9447/9770 [1:47:40<03:30, 1.54it/s]
97%|█████████▋| 9448/9770 [1:47:41<03:28, 1.54it/s]
97%|█████████▋| 9449/9770 [1:47:42<03:27, 1.54it/s]
97%|█████████▋| 9450/9770 [1:47:42<03:26, 1.55it/s]
97%|█████████▋| 9450
+0: {'loss': 0.6494, 'grad_norm': 0.5770314725540777, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.97}
+0: /9770 [1:47:42<03:26, 1.55it/s]
97%|█████████▋| 9451/9770 [1:47:43<03:23, 1.57it/s]
97%|█████████▋| 9452/9770 [1:47:43<03:28, 1.52it/s]
97%|█████████▋| 9453/9770 [1:47:44<03:26, 1.54it/s]
97%|█████████▋| 9454/9770 [1:47:45<03:22, 1.56it/s]
97%|█████████▋| 9455/9770 [1:47:45<03:22, 1.56it/s]
97%|█████████▋| 9456/9770 [1:47:46<03:22, 1.55it/s]
97%|█████████▋| 9457/9770 [1:47:47<03:21, 1.56it/s]
97%|█████████▋| 9458/9770 [1:47:47<03:21, 1.55it/s]
97%|█████████▋| 9459/9770 [1:47:48<03:18, 1.56it/s]
97%|█████████▋| 9460/9770 [1:47:49<03:18, 1.56it/s]
97%|█████████▋| 9460/9770 [1:47:49<03:18, 1.56it/s]
97%|█████████▋| 9461/9770 [1:47:49<03:19, 1.55it/s]
97%|█████████▋| 9462/9770 [
+0: {'loss': 0.6341, 'grad_norm': 0.6117866376643601, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.97}
+0: 1:47:50<03:19, 1.54it/s]
97%|█████████▋| 9463/9770 [1:47:51<03:19, 1.54it/s]
97%|█████████▋| 9464/9770 [1:47:51<03:17, 1.55it/s]
97%|█████████▋| 9465/9770 [1:47:52<03:17, 1.55it/s]
97%|█████████▋| 9466/9770 [1:47:52<03:15, 1.56it/s]
97%|█████████▋| 9467/9770 [1:47:53<03:13, 1.57it/s]
97%|█████████▋| 9468/9770 [1:47:54<03:14, 1.55it/s]
97%|█████████▋| 9469/9770 [1:47:54<03:14, 1.55it/s]
97%|█████████▋| 9470/9770 [1:47:55<03:16, 1.53it/s]
97%|█████████▋| 9470/9770 [1:47:55<03:16, 1.53it/s]
97%|█████████▋| 9471/9770 [1:47:56<03:15, 1.53it/s]
97%|█████████▋| 9472/9770 [1:47:56<03:14, 1.53it/s]
97%|█████████▋| 9473/9770 [1:47:57<03:14, 1.52it/s]
97%|█████████▋| 9474/9770 [1:47:58
+0: {'loss': 0.6642, 'grad_norm': 0.601466373901166, 'learning_rate': 1.996005768142772e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.97}
+0: <03:13, 1.53it/s]
97%|█████████▋| 9475/9770 [1:47:58<03:11, 1.54it/s]
97%|█████████▋| 9476/9770 [1:47:59<03:09, 1.55it/s]
97%|█████████▋| 9477/9770 [1:48:00<03:07, 1.57it/s]
97%|█████████▋| 9478/9770 [1:48:00<03:05, 1.57it/s]
97%|█████████▋| 9479/9770 [1:48:01<03:06, 1.56it/s]
97%|█████████▋| 9480/9770 [1:48:02<03:06, 1.56it/s]
97%|█████████▋| 9480/9770 [1:48:02<03:06, 1.56it/s]
97%|█████████▋| 9481/9770 [1:48:02<03:06, 1.55it/s]
97%|█████████▋| 9482/9770 [1:48:03<03:03, 1.57it/s]
97%|█████████▋| 9483/9770 [1:48:03<03:04, 1.55it/s]
97%|█████████▋| 9484/9770 [1:48:04<03:03, 1.56it/s]
97%|█████████▋| 9485/9770 [1:48:05<03:03, 1.55it/s]
97%|█████████▋| 9486/9770 [1:48:05<03:05,
+0: {'loss': 0.632, 'grad_norm': 0.6080561729558677, 'learning_rate': 1.98224405715955e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.97}
+0: 1.53it/s]
97%|█████████▋| 9487/9770 [1:48:06<03:02, 1.55it/s]
97%|█████████▋| 9488/9770 [1:48:07<03:02, 1.55it/s]
97%|█████████▋| 9489/9770 [1:48:07<02:58, 1.57it/s]
97%|█████████▋| 9490/9770 [1:48:08<02:58, 1.56it/s]
97%|█████████▋| 9490/9770 [1:48:08<02:58, 1.56it/s]
97%|█████████▋| 9491/9770 [1:48:09<03:02, 1.53it/s]
97%|█████████▋| 9492/9770 [1:48:09<03:04, 1.51it/s]
97%|█████████▋| 9493/9770 [1:48:10<03:01, 1.52it/s]
97%|█████████▋| 9494/9770 [1:48:11<02:59, 1.54it/s]
97%|█████████▋| 9495/9770 [1:48:11<02:57, 1.55it/s]
97%|█████████▋| 9496/9770 [1:48:12<02:54, 1.57it/s]
97%|█████████▋| 9497/9770 [1:48:13<02:57, 1.54it/s]
97%|█████████▋| 9498/9770 [1:48:13<02:56, 1.54i
+0: {'loss': 0.668, 'grad_norm': 0.6344582302657377, 'learning_rate': 1.958816295664649e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.97}
+0: t/s]
97%|█████████▋| 9499/9770 [1:48:14<02:54, 1.55it/s]
97%|█████████▋| 9500/9770 [1:48:15<02:59, 1.50it/s]
97%|█████████▋| 9500/9770 [1:48:15<02:59, 1.50it/s]
97%|█████████▋| 9501/9770 [1:48:15<02:57, 1.52it/s]
97%|█████████▋| 9502/9770 [1:48:16<02:53, 1.55it/s]
97%|█████████▋| 9503/9770 [1:48:16<02:51, 1.55it/s]
97%|█████████▋| 9504/9770 [1:48:17<02:49, 1.57it/s]
97%|█████████▋| 9505/9770 [1:48:18<02:48, 1.57it/s]
97%|█████████▋| 9506/9770 [1:48:18<02:48, 1.57it/s]
97%|█████████▋| 9507/9770 [1:48:19<02:48, 1.56it/s]
97%|█████████▋| 9508/9770 [1:48:20<02:48, 1.56it/s]
97%|█████████▋| 9509/9770 [1:48:20<02:52, 1.51it/s]
97%|█████████▋| 9510/9770 [1:48:21<02:51, 1.52it/s]
+0: {'loss': 0.6471, 'grad_norm': 0.5789882942483003, 'learning_rate': 1.925979163115583e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.97}
+0: {'loss': 0.6565, 'grad_norm': 0.601459843793583, 'learning_rate': 1.8840924300081706e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.97}
+0:
97%|█████████▋| 9510/9770 [1:48:21<02:51, 1.52it/s]
97%|█████████▋| 9511/9770 [1:48:22<02:48, 1.54it/s]
97%|█████████▋| 9512/9770 [1:48:22<02:47, 1.54it/s]
97%|█████████▋| 9513/9770 [1:48:23<02:46, 1.54it/s]
97%|█████████▋| 9514/9770 [1:48:24<02:44, 1.55it/s]
97%|█████████▋| 9515/9770 [1:48:24<02:44, 1.55it/s]
97%|█████████▋| 9516/9770 [1:48:25<02:42, 1.56it/s]
97%|█████████▋| 9517/9770 [1:48:25<02:44, 1.54it/s]
97%|█████████▋| 9518/9770 [1:48:26<02:43, 1.54it/s]
97%|█████████▋| 9519/9770 [1:48:27<02:41, 1.55it/s]
97%|█████████▋| 9520/9770 [1:48:27<02:40, 1.56it/s]
97%|█████████▋| 9520/9770 [1:48:27<02:40, 1.56it/s]
97%|███████�
+0: {'loss': 0.6519, 'grad_norm': 0.5386981168101596, 'learning_rate': 1.833615016155699e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.97}
+0: �█▋| 9521/9770 [1:48:28<02:40, 1.55it/s]
97%|█████████▋| 9522/9770 [1:48:29<02:39, 1.55it/s]
97%|█████████▋| 9523/9770 [1:48:29<02:37, 1.57it/s]
97%|█████████▋| 9524/9770 [1:48:30<02:37, 1.56it/s]
97%|█████████▋| 9525/9770 [1:48:31<02:36, 1.56it/s]
98%|█████████▊| 9526/9770 [1:48:31<02:34, 1.58it/s]
98%|█████████▊| 9527/9770 [1:48:32<02:34, 1.57it/s]
98%|█████████▊| 9528/9770 [1:48:32<02:33, 1.57it/s]
98%|█████████▊| 9529/9770 [1:48:33<02:33, 1.57it/s]
98%|█████████▊| 9530/9770 [1:48:34<02:32, 1.57it/s]
98%|█████████▊| 9530/9770 [1:48:34<02:32, 1.57it/s]
98%|█████████▊| 9531/9770 [1:48:34<02:37, 1.52it/s]
98%|█████████▊| 9532/9770 [1:48:35<02:40, 1.49it/s]
98%|█████████▊
+0: {'loss': 0.6647, 'grad_norm': 0.5970480385736316, 'learning_rate': 1.775099962667414e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.98}
+0: | 9533/9770 [1:48:36<02:37, 1.51it/s]
98%|█████████▊| 9534/9770 [1:48:36<02:33, 1.53it/s]
98%|█████████▊| 9535/9770 [1:48:37<02:32, 1.54it/s]
98%|█████████▊| 9536/9770 [1:48:38<02:32, 1.53it/s]
98%|█████████▊| 9537/9770 [1:48:38<02:30, 1.55it/s]
98%|█████████▊| 9538/9770 [1:48:39<02:29, 1.55it/s]
98%|█████████▊| 9539/9770 [1:48:40<02:28, 1.56it/s]
98%|█████████▊| 9540/9770 [1:48:40<02:27, 1.55it/s]
98%|█████████▊| 9540/9770 [1:48:40<02:27, 1.55it/s]
98%|█████████▊| 9541/9770 [1:48:41<02:26, 1.56it/s]
98%|█████████▊| 9542/9770 [1:48:42<02:25, 1.57it/s]
98%|█████████▊| 9543/9770 [1:48:42<02:23, 1.58it/s]
98%|█████████▊| 9544/9770 [1:48:43<02:23, 1.58it/s]
98%|█████████▊| 9545/
+0: {'loss': 0.6511, 'grad_norm': 0.6414451513390736, 'learning_rate': 1.7091883727143946e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.98}
+0: 9770 [1:48:43<02:22, 1.57it/s]
98%|█████████▊| 9546/9770 [1:48:44<02:20, 1.59it/s]
98%|█████████▊| 9547/9770 [1:48:45<02:23, 1.56it/s]
98%|█████████▊| 9548/9770 [1:48:45<02:23, 1.55it/s]
98%|█████████▊| 9549/9770 [1:48:46<02:22, 1.55it/s]
98%|█████████▊| 9550/9770 [1:48:47<02:21, 1.55it/s]
98%|█████████▊| 9550/9770 [1:48:47<02:21, 1.55it/s]
98%|█████████▊| 9551/9770 [1:48:47<02:20, 1.56it/s]
98%|█████████▊| 9552/9770 [1:48:48<02:18, 1.57it/s]
98%|█████████▊| 9553/9770 [1:48:49<02:17, 1.57it/s]
98%|█████████▊| 9554/9770 [1:48:49<02:18, 1.56it/s]
98%|█████████▊| 9555/9770 [1:48:50<02:19, 1.55it/s]
98%|█████████▊| 9556/9770 [1:48:51<02:18, 1.55it/s]
98%|█████████▊| 9557/9770 [1
+0: {'loss': 0.6372, 'grad_norm': 0.5805169337332307, 'learning_rate': 1.6366023874690543e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.98}
+0: :48:51<02:17, 1.54it/s]
98%|█████████▊| 9558/9770 [1:48:52<02:16, 1.55it/s]
98%|█████████▊| 9559/9770 [1:48:52<02:16, 1.55it/s]
98%|█████████▊| 9560/9770 [1:48:53<02:16, 1.54it/s]
98%|█████████▊| 9560/9770 [1:48:53<02:16, 1.54it/s]
98%|█████████▊| 9561/9770 [1:48:54<02:15, 1.54it/s]
98%|█████████▊| 9562/9770 [1:48:54<02:15, 1.54it/s]
98%|█████████▊| 9563/9770 [1:48:55<02:15, 1.52it/s]
98%|█████████▊| 9564/9770 [1:48:56<02:13, 1.54it/s]
98%|█████████▊| 9565/9770 [1:48:56<02:12, 1.54it/s]
98%|█████████▊| 9566/9770 [1:48:57<02:12, 1.55it/s]
98%|█████████▊| 9567/9770 [1:48:58<02:12, 1.53it/s]
98%|█████████▊| 9568/9770 [1:48:58<02:10, 1.55it/s]
98%|█████████▊| 9569/9770 [1:48:59<
+0: {'loss': 0.6638, 'grad_norm': 0.5848790352766927, 'learning_rate': 1.558137274175334e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.98}
+0: {'loss': 0.6394, 'grad_norm': 0.5700925984015552, 'learning_rate': 1.4746527130343613e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.98}
+0: 02:11, 1.53it/s]
98%|█████████▊| 9570/9770 [1:49:00<02:10, 1.54it/s]
98%|█████████▊| 9570/9770 [1:49:00<02:10, 1.54it/s]
98%|█████████▊| 9571/9770 [1:49:00<02:11, 1.52it/s]
98%|█████████▊| 9572/9770 [1:49:01<02:09, 1.53it/s]
98%|█████████▊| 9573/9770 [1:49:02<02:07, 1.55it/s]
98%|█████████▊| 9574/9770 [1:49:02<02:04, 1.57it/s]
98%|█████████▊| 9575/9770 [1:49:03<02:03, 1.57it/s]
98%|█████████▊| 9576/9770 [1:49:03<02:02, 1.59it/s]
98%|█████████▊| 9577/9770 [1:49:04<02:00, 1.60it/s]
98%|█████████▊| 9578/9770 [1:49:05<02:02, 1.56it/s]
98%|█████████▊| 9579/9770 [1:49:05<02:01, 1.57it/s]
98%|█████████▊| 9580/9770 [1:49:06<02:01, 1.56it/s]
98%|█
+0: {'loss': 0.6608, 'grad_norm': 0.5846458498232325, 'learning_rate': 1.3870633783682632e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.98}
+0: ████████▊| 9580/9770 [1:49:06<02:01, 1.56it/s]
98%|█████████▊| 9581/9770 [1:49:07<02:01, 1.56it/s]
98%|█████████▊| 9582/9770 [1:49:07<02:01, 1.54it/s]
98%|█████████▊| 9583/9770 [1:49:08<02:01, 1.54it/s]
98%|█████████▊| 9584/9770 [1:49:09<02:00, 1.54it/s]
98%|█████████▊| 9585/9770 [1:49:09<01:59, 1.55it/s]
98%|█████████▊| 9586/9770 [1:49:10<01:58, 1.55it/s]
98%|█████████▊| 9587/9770 [1:49:11<01:56, 1.57it/s]
98%|█████████▊| 9588/9770 [1:49:11<01:56, 1.57it/s]
98%|█████████▊| 9589/9770 [1:49:12<01:54, 1.58it/s]
98%|█████████▊| 9590/9770 [1:49:12<01:53, 1.59it/s]
98%|█████████▊| 9590/9770 [1:49:12<01:53, 1.59it/s]
98%|█████████▊| 9591/9770 [1:49:13<01:52, 1.59it/s]
98%|███�
+0: {'loss': 0.6541, 'grad_norm': 0.5777555439158457, 'learning_rate': 1.2963289172568885e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.98}
+0: ��█████▊| 9592/9770 [1:49:14<01:52, 1.58it/s]
98%|█████████▊| 9593/9770 [1:49:14<01:52, 1.57it/s]
98%|█████████▊| 9594/9770 [1:49:15<01:50, 1.59it/s]
98%|█████████▊| 9595/9770 [1:49:16<01:50, 1.58it/s]
98%|█████████▊| 9596/9770 [1:49:16<01:51, 1.56it/s]
98%|█████████▊| 9597/9770 [1:49:17<01:51, 1.55it/s]
98%|█████████▊| 9598/9770 [1:49:18<01:53, 1.52it/s]
98%|█████████▊| 9599/9770 [1:49:18<01:52, 1.52it/s]
98%|█████████▊| 9600/9770 [1:49:19<01:51, 1.52it/s]
98%|█████████▊| 9600/9770 [1:49:19<01:51, 1.52it/s]
98%|█████████▊| 9601/9770 [1:49:20<01:49, 1.54it/s]
98%|█████████▊| 9602/9770 [1:49:20<01:48, 1.55it/s]
98%|█████████▊| 9603/9770 [1:49:21<01:46, 1.56it/s]
98%|█████�
+0: {'loss': 0.6348, 'grad_norm': 0.5675719910023068, 'learning_rate': 1.20344343544358e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.98}
+0: �███▊| 9604/9770 [1:49:21<01:46, 1.57it/s]
98%|█████████▊| 9605/9770 [1:49:22<01:44, 1.58it/s]
98%|█████████▊| 9606/9770 [1:49:23<01:44, 1.57it/s]
98%|█████████▊| 9607/9770 [1:49:23<01:43, 1.58it/s]
98%|█████████▊| 9608/9770 [1:49:24<01:44, 1.55it/s]
98%|█████████▊| 9609/9770 [1:49:25<01:43, 1.56it/s]
98%|█████████▊| 9610/9770 [1:49:25<01:42, 1.56it/s]
98%|█████████▊| 9610/9770 [1:49:25<01:42, 1.56it/s]
98%|█████████▊| 9611/9770 [1:49:26<01:41, 1.57it/s]
98%|█████████▊| 9612/9770 [1:49:27<01:41, 1.56it/s]
98%|█████████▊| 9613/9770 [1:49:27<01:43, 1.52it/s]
98%|█████████▊| 9614/9770 [1:49:28<01:42, 1.52it/s]
98%|█████████▊| 9615/9770 [1:49:29<01:41, 1.53it/s]
98%|████████
+0: {'loss': 0.6523, 'grad_norm': 0.6151241464979615, 'learning_rate': 1.1094246057046214e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.98}
+0: █▊| 9616/9770 [1:49:29<01:39, 1.54it/s]
98%|█████████▊| 9617/9770 [1:49:30<01:38, 1.55it/s]
98%|█████████▊| 9618/9770 [1:49:30<01:38, 1.54it/s]
98%|█████████▊| 9619/9770 [1:49:31<01:37, 1.55it/s]
98%|█████████▊| 9620/9770 [1:49:32<01:36, 1.55it/s]
98%|█████████▊| 9620/9770 [1:49:32<01:36, 1.55it/s]
98%|█████████▊| 9621/9770 [1:49:32<01:36, 1.55it/s]
98%|█████████▊| 9622/9770 [1:49:33<01:35, 1.55it/s]
98%|█████████▊| 9623/9770 [1:49:34<01:36, 1.52it/s]
99%|█████████▊| 9624/9770 [1:49:34<01:34, 1.55it/s]
99%|█████████▊| 9625/9770 [1:49:35<01:34, 1.53it/s]
99%|███���█████▊| 9626/9770 [1:49:36<01:33, 1.54it/s]
99%|█████████▊| 9627/9770 [1:49:36<01:33, 1.54it/s]
99%|█████████▊|
+0: {'loss': 0.6278, 'grad_norm': 0.550106592947246, 'learning_rate': 1.0153025180133372e-05, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.99}
+0: 9628/9770 [1:49:37<01:31, 1.55it/s]
99%|█████████▊| 9629/9770 [1:49:38<01:30, 1.56it/s]
99%|█████████▊| 9630/9770 [1:49:38<01:29, 1.56it/s]
99%|█████████▊| 9630/9770 [1:49:38<01:29, 1.56it/s]
99%|█████████▊| 9631/9770 [1:49:39<01:37, 1.42it/s]
99%|█████████▊| 9632/9770 [1:49:40<01:34, 1.46it/s]
99%|█████████▊| 9633/9770 [1:49:40<01:34, 1.45it/s]
99%|█████████▊| 9634/9770 [1:49:41<01:33, 1.46it/s]
99%|█████████▊| 9635/9770 [1:49:42<01:31, 1.48it/s]
99%|█████████▊| 9636/9770 [1:49:42<01:29, 1.50it/s]
99%|█████████▊| 9637/9770 [1:49:43<01:28, 1.51it/s]
99%|█████████▊| 9638/9770 [1:49:44<01:26, 1.53it/s]
99%|█████████▊| 9639/9770 [1:49:44<01:26, 1.52it/s]
99%|█████████▊| 9640/9
+0: {'loss': 0.6299, 'grad_norm': 0.5585174558643733, 'learning_rate': 9.221083936587864e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.99}
+0: {'loss': 0.6449, 'grad_norm': 0.5474740219469646, 'learning_rate': 8.308632869695222e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.99}
+0: 770 [1:49:45<01:25, 1.52it/s]
99%|█████████▊| 9640/9770 [1:49:45<01:25, 1.52it/s]
99%|█████████▊| 9641/9770 [1:49:46<01:24, 1.52it/s]
99%|█████████▊| 9642/9770 [1:49:46<01:24, 1.52it/s]
99%|█████████▊| 9643/9770 [1:49:47<01:21, 1.55it/s]
99%|█████████▊| 9644/9770 [1:49:48<01:21, 1.55it/s]
99%|█████████▊| 9645/9770 [1:49:48<01:23, 1.50it/s]
99%|█████████▊| 9646/9770 [1:49:49<01:22, 1.51it/s]
99%|█████████▊| 9647/9770 [1:49:50<01:20, 1.53it/s]
99%|█████████▉| 9648/9770 [1:49:50<01:19, 1.53it/s]
99%|█████████▉| 9649/9770 [1:49:51<01:19, 1.53it/s]
99%|█████████▉| 9650/9770 [1:49:52<01:18, 1.53it/s]
99%|█████████▉| 9650/9770 [1:49:52<01:18, 1.53i
+0: {'loss': 0.6329, 'grad_norm': 0.5328633471127429, 'learning_rate': 7.425668984286976e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.99}
+0: t/s]
99%|█████████▉| 9651/9770 [1:49:52<01:17, 1.53it/s]
99%|█████████▉| 9652/9770 [1:49:53<01:17, 1.51it/s]
99%|█████████▉| 9653/9770 [1:49:54<01:16, 1.53it/s]
99%|█████████▉| 9654/9770 [1:49:54<01:14, 1.56it/s]
99%|█████████▉| 9655/9770 [1:49:55<01:13, 1.56it/s]
99%|█████████▉| 9656/9770 [1:49:55<01:12, 1.56it/s]
99%|█████████▉| 9657/9770 [1:49:56<01:11, 1.57it/s]
99%|█████████▉| 9658/9770 [1:49:57<01:11, 1.56it/s]
99%|█████████▉| 9659/9770 [1:49:57<01:11, 1.56it/s]
99%|█████████▉| 9660/9770 [1:49:58<01:09, 1.57it/s]
99%|█████████▉| 9660/9770 [1:49:58<01:09, 1.57it/s]
99%|█████████▉| 9661/9770 [1:49:59<01:09, 1.58it/s]
99%|█████████▉| 9662/9770 [1:49:59<01:08, 1.57it/s]
9
+0: {'loss': 0.6481, 'grad_norm': 0.5662254303204157, 'learning_rate': 6.581866217463733e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.99}
+0: 9%|█████████▉| 9663/9770 [1:50:00<01:07, 1.58it/s]
99%|█████████▉| 9664/9770 [1:50:00<01:07, 1.58it/s]
99%|█████████▉| 9665/9770 [1:50:01<01:06, 1.59it/s]
99%|█████████▉| 9666/9770 [1:50:02<01:05, 1.59it/s]
99%|█████████▉| 9667/9770 [1:50:02<01:04, 1.59it/s]
99%|█████████▉| 9668/9770 [1:50:03<01:04, 1.59it/s]
99%|█████████▉| 9669/9770 [1:50:04<01:03, 1.58it/s]
99%|█████████▉| 9670/9770 [1:50:04<01:04, 1.56it/s]
99%|█████████▉| 9670/9770 [1:50:04<01:04, 1.56it/s]
99%|█████████▉| 9671/9770 [1:50:05<01:03, 1.55it/s]
99%|█████████▉| 9672/9770 [1:50:06<01:03, 1.55it/s]
99%|█████████▉| 9673/9770 [1:50:06<01:02, 1.56it/s]
99%|█████████▉| 9674/9770 [1:50:07<01:00, 1.58it/s]
99%|█�
+0: {'loss': 0.6296, 'grad_norm': 0.5685001869287369, 'learning_rate': 5.786469448915892e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.99}
+0: ��███████▉| 9675/9770 [1:50:07<00:59, 1.59it/s]
99%|█████████▉| 9676/9770 [1:50:08<00:59, 1.58it/s]
99%|█████████▉| 9677/9770 [1:50:09<01:00, 1.53it/s]
99%|█████████▉| 9678/9770 [1:50:09<00:59, 1.54it/s]
99%|█████████▉| 9679/9770 [1:50:10<00:59, 1.54it/s]
99%|█████████▉| 9680/9770 [1:50:11<00:58, 1.53it/s]
99%|█████████▉| 9680/9770 [1:50:11<00:58, 1.53it/s]
99%|█████████▉| 9681/9770 [1:50:11<00:58, 1.53it/s]
99%|█████████▉| 9682/9770 [1:50:12<00:56, 1.56it/s]
99%|█████████▉| 9683/9770 [1:50:13<00:54, 1.59it/s]
99%|█████████▉| 9684/9770 [1:50:13<00:55, 1.56it/s]
99%|█████████▉| 9685/9770 [1:50:14<00:54, 1.55it/s]
99%|█████████▉| 9686/9770 [1:50:15<00:54, 1.54it/s]
99%|███�
+0: {'loss': 0.6429, 'grad_norm': 0.5625623232656415, 'learning_rate': 5.048193212087136e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.99}
+0: �█████▉| 9687/9770 [1:50:15<00:54, 1.53it/s]
99%|█████████▉| 9688/9770 [1:50:16<00:53, 1.54it/s]
99%|█████████▉| 9689/9770 [1:50:17<00:52, 1.54it/s]
99%|█████████▉| 9690/9770 [1:50:17<00:52, 1.52it/s]
99%|█████████▉| 9690/9770 [1:50:17<00:52, 1.52it/s]
99%|█████████▉| 9691/9770 [1:50:18<00:51, 1.54it/s]
99%|█████████▉| 9692/9770 [1:50:19<00:50, 1.54it/s]
99%|█████████▉| 9693/9770 [1:50:19<00:50, 1.54it/s]
99%|█████████▉| 9694/9770 [1:50:20<00:49, 1.55it/s]
99%|█████████▉| 9695/9770 [1:50:20<00:48, 1.54it/s]
99%|█████████▉| 9696/9770 [1:50:21<00:48, 1.53it/s]
99%|█████████▉| 9697/9770 [1:50:22<00:47, 1.53it/s]
99%|█████████▉| 9698/9770 [1:50:22<00:46, 1.56it/s]
99%|██████
+0: {'loss': 0.661, 'grad_norm': 0.5814069370871611, 'learning_rate': 4.3751262159223915e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.99}
+0: ███▉| 9699/9770 [1:50:23<00:45, 1.56it/s]
99%|█████████▉| 9700/9770 [1:50:24<00:44, 1.56it/s]
99%|█████████▉| 9700/9770 [1:50:24<00:44, 1.56it/s]
99%|█████████▉| 9701/9770 [1:50:24<00:45, 1.53it/s]
99%|█████████▉| 9702/9770 [1:50:25<00:44, 1.52it/s]
99%|█████████▉| 9703/9770 [1:50:26<00:46, 1.44it/s]
99%|█████████▉| 9704/9770 [1:50:26<00:45, 1.44it/s]
99%|█████████▉| 9705/9770 [1:50:27<00:44, 1.45it/s]
99%|█████████▉| 9706/9770 [1:50:28<00:44, 1.43it/s]
99%|█████████▉| 9707/9770 [1:50:29<00:42, 1.47it/s]
99%|█████████▉| 9708/9770 [1:50:29<00:41, 1.50it/s]
99%|█████████▉| 9709/9770 [1:50:30<00:40, 1.52it/s]
99%|█████████▉| 9710/9770 [1:50:30<00:39, 1.52it/s]
+0: {'loss': 0.6271, 'grad_norm': 0.5491855585365405, 'learning_rate': 3.774642723279971e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.99}
+0: {'loss': 0.6408, 'grad_norm': 0.5532423779121038, 'learning_rate': 3.2533217569645088e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 0.99}
+0:
99%|█████████▉| 9710/9770 [1:50:30<00:39, 1.52it/s]
99%|█████████▉| 9711/9770 [1:50:31<00:38, 1.52it/s]
99%|█████████▉| 9712/9770 [1:50:32<00:38, 1.52it/s]
99%|█████████▉| 9713/9770 [1:50:32<00:37, 1.53it/s]
99%|█████████▉| 9714/9770 [1:50:33<00:36, 1.53it/s]
99%|█████████▉| 9715/9770 [1:50:34<00:35, 1.54it/s]
99%|█████████▉| 9716/9770 [1:50:34<00:34, 1.55it/s]
99%|█████████▉| 9717/9770 [1:50:35<00:34, 1.55it/s]
99%|█████████▉| 9718/9770 [1:50:36<00:33, 1.55it/s]
99%|█████████▉| 9719/9770 [1:50:36<00:32, 1.55it/s]
99%|█████████▉| 9720/9770 [1:50:37<00:32, 1.55it/s]
99%|█████████▉| 9720/9770 [1:50:37<00:32, 1.55it/s]
99%|█████████▉| 9721/9770 [1:50:38<
+0: {'loss': 0.6413, 'grad_norm': 0.542795882382674, 'learning_rate': 2.8168750185763838e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 1.0}
+0: 00:31, 1.55it/s]
100%|█████████▉| 9722/9770 [1:50:38<00:30, 1.55it/s]
100%|█████████▉| 9723/9770 [1:50:39<00:30, 1.54it/s]
100%|█████████▉| 9724/9770 [1:50:40<00:29, 1.55it/s]
100%|█████████▉| 9725/9770 [1:50:40<00:28, 1.57it/s]
100%|█████████▉| 9726/9770 [1:50:41<00:28, 1.56it/s]
100%|█████████▉| 9727/9770 [1:50:41<00:27, 1.58it/s]
100%|█████████▉| 9728/9770 [1:50:42<00:27, 1.54it/s]
100%|█████████▉| 9729/9770 [1:50:43<00:26, 1.54it/s]
100%|█████████▉| 9730/9770 [1:50:43<00:26, 1.53it/s]
100%|█████████▉| 9730/9770 [1:50:43<00:26, 1.53it/s]
100%|█████████▉| 9731/9770 [1:50:44<00:25, 1.51it/s]
100%|█████████▉| 9732/9770 [1:50:45<00:25, 1.52it/s]
100%|█████████▉| 9733/9770 [1:50:45<00:24,
+0: {'loss': 0.6331, 'grad_norm': 0.5298572666971668, 'learning_rate': 2.4700843099137305e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 1.0}
+0: 1.52it/s]
100%|█████████▉| 9734/9770 [1:50:46<00:24, 1.48it/s]
100%|█████████▉| 9735/9770 [1:50:47<00:23, 1.49it/s]
100%|█████████▉| 9736/9770 [1:50:47<00:22, 1.49it/s]
100%|█████████▉| 9737/9770 [1:50:48<00:21, 1.52it/s]
100%|█████████▉| 9738/9770 [1:50:49<00:21, 1.52it/s]
100%|█████████▉| 9739/9770 [1:50:49<00:20, 1.50it/s]
100%|█████████▉| 9740/9770 [1:50:50<00:19, 1.50it/s]
100%|█████████▉| 9740/9770 [1:50:50<00:19, 1.50it/s]
100%|█████████▉| 9741/9770 [1:50:51<00:19, 1.51it/s]
100%|█████████▉| 9742/9770 [1:50:51<00:18, 1.53it/s]
100%|█████████▉| 9743/9770 [1:50:52<00:17, 1.53it/s]
100%|█████████▉| 9744/9770 [1:50:53<00:17, 1.47it/s]
100%|█████████▉| 9745/9770 [1:50:53<00:16, 1.49it
+0: {'loss': 0.6584, 'grad_norm': 0.5415119503803693, 'learning_rate': 2.2167491425512732e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 1.0}
+0: /s]
100%|█████████▉| 9746/9770 [1:50:54<00:15, 1.51it/s]
100%|█████████▉| 9747/9770 [1:50:55<00:15, 1.53it/s]
100%|█████████▉| 9748/9770 [1:50:55<00:14, 1.54it/s]
100%|█████████▉| 9749/9770 [1:50:56<00:13, 1.52it/s]
100%|█████████▉| 9750/9770 [1:50:57<00:13, 1.53it/s]
100%|█████████▉| 9750/9770 [1:50:57<00:13, 1.53it/s]
100%|█████████▉| 9751/9770 [1:50:57<00:12, 1.54it/s]
100%|█████████▉| 9752/9770 [1:50:58<00:11, 1.53it/s]
100%|█████████▉| 9753/9770 [1:50:59<00:10, 1.55it/s]
100%|█████████▉| 9754/9770 [1:50:59<00:10, 1.55it/s]
100%|█████████▉| 9755/9770 [1:51:00<00:09, 1.54it/s]
100%|█████████▉| 9756/9770 [1:51:00<00:08, 1.56it/s]
100%|█████████▉| 9757/9770 [1:51:01<00:08, 1.52it/s]
100
+0: {'loss': 0.6587, 'grad_norm': 0.5807606526510161, 'learning_rate': 2.0596451095964325e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 1.0}
+0: %|█████████▉| 9758/9770 [1:51:02<00:07, 1.53it/s]
100%|█████████▉| 9759/9770 [1:51:02<00:07, 1.56it/s]
100%|█████████▉| 9760/9770 [1:51:03<00:06, 1.54it/s]
100%|█████████▉| 9760/9770 [1:51:03<00:06, 1.54it/s]
100%|█████████▉| 9761/9770 [1:51:04<00:05, 1.55it/s]
100%|█████████▉| 9762/9770 [1:51:04<00:05, 1.54it/s]
100%|█████████▉| 9763/9770 [1:51:05<00:04, 1.54it/s]
100%|█████████▉| 9764/9770 [1:51:06<00:03, 1.55it/s]
100%|████████��▉| 9765/9770 [1:51:06<00:03, 1.54it/s]
100%|█████████▉| 9766/9770 [1:51:07<00:02, 1.57it/s]
100%|█████████▉| 9767/9770 [1:51:08<00:01, 1.58it/s]
100%|█████████▉| 9768/9770 [1:51:08<00:01, 1.57it/s]
100%|█████████▉| 9769/9770 [1:51:09<00:00, 1.55it/s]
100%|█�
+0: {'loss': 0.6783, 'grad_norm': 0.6349343704858915, 'learning_rate': 2.000493475710391e-06, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 1.0}
+0: [2025-09-02 21:47:11,463] [INFO] [axolotl.core.trainers.base._save:613] [PID:3622631] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1/checkpoint-9770[39m
+0: [2025-09-02 21:47:12,391] [INFO] [axolotl.core.trainers.base._save:662] [PID:3622631] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m
+0: {'train_runtime': 6672.3863, 'train_samples_per_second': 23.428, 'train_steps_per_second': 1.464, 'train_loss': 0.6798874075605535, 'memory/max_mem_active(gib)': 30.07, 'memory/max_mem_allocated(gib)': 30.05, 'memory/device_mem_reserved(gib)': 35.21, 'epoch': 1.0}
+0: �████████| 9770/9770 [1:51:10<00:00, 1.55it/s]
100%|██████████| 9770/9770 [1:51:10<00:00, 1.55it/s]
100%|██████████| 9770/9770 [1:51:12<00:00, 1.55it/s]
100%|██████████| 9770/9770 [1:51:12<00:00, 1.46it/s]
+0: [2025-09-02 21:47:16,771] [INFO] [axolotl.train.save_trained_model:228] [PID:3622631] [RANK:0] Training completed! Saving trained model to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1.[39m
+0: [2025-09-02 21:47:17,333] [INFO] [axolotl.core.trainers.base._save:613] [PID:3622631] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1[39m
+0: [2025-09-02 21:47:18,240] [INFO] [axolotl.core.trainers.base._save:662] [PID:3622631] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m
+0: [2025-09-02 21:47:18,532] [INFO] [axolotl.train.save_trained_model:350] [PID:3622631] [RANK:0] Model successfully saved to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-0.5B_reasoning/1[39m