jnmrr committed on Aug 14

Commit

3a530fe

verified ·

1 Parent(s): 978a129

Upload RT-DETRv2 voucher classifier

Browse files

Files changed (41) hide show

README.md +14 -14
checkpoint-1600/config.json +129 -0
checkpoint-1600/model.safetensors +3 -0
checkpoint-1600/optimizer.pt +3 -0
checkpoint-1600/preprocessor_config.json +26 -0
checkpoint-1600/rng_state.pth +3 -0
checkpoint-1600/scheduler.pt +3 -0
checkpoint-1600/trainer_state.json +258 -0
checkpoint-1600/training_args.bin +3 -0
checkpoint-1800/config.json +129 -0
checkpoint-1800/model.safetensors +3 -0
checkpoint-1800/optimizer.pt +3 -0
checkpoint-1800/preprocessor_config.json +26 -0
checkpoint-1800/rng_state.pth +3 -0
checkpoint-1800/scheduler.pt +3 -0
checkpoint-1800/trainer_state.json +286 -0
checkpoint-1800/training_args.bin +3 -0
checkpoint-2000/model.safetensors +1 -1
checkpoint-2000/optimizer.pt +1 -1
checkpoint-2000/rng_state.pth +1 -1
checkpoint-2000/trainer_state.json +179 -2699
checkpoint-2000/training_args.bin +1 -1
checkpoint-2200/config.json +129 -0
checkpoint-2200/model.safetensors +3 -0
checkpoint-2200/optimizer.pt +3 -0
checkpoint-2200/preprocessor_config.json +26 -0
checkpoint-2200/rng_state.pth +3 -0
checkpoint-2200/scheduler.pt +3 -0
checkpoint-2200/trainer_state.json +342 -0
checkpoint-2200/training_args.bin +3 -0
checkpoint-2250/config.json +129 -0
checkpoint-2250/model.safetensors +3 -0
checkpoint-2250/optimizer.pt +3 -0
checkpoint-2250/preprocessor_config.json +26 -0
checkpoint-2250/rng_state.pth +3 -0
checkpoint-2250/scheduler.pt +3 -0
checkpoint-2250/trainer_state.json +349 -0
checkpoint-2250/training_args.bin +3 -0
model.safetensors +1 -1
runs/Aug14_17-42-57_2676026c4495/events.out.tfevents.1755193378.2676026c4495.6591.0 +3 -0
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -36,11 +36,11 @@ This model is a fine-tuned version of [PekingU/rtdetr_v2_r101vd](https://hugging
 ### Training Details
 **Training Dataset:**
-- **Total Samples**: 2186
 - **Class Distribution**:
-- **fisico** (id: 1): 1023 samples (46.8%)
-- **digital** (id: 0): 626 samples (28.6%)
-- **tesoreria** (id: 2): 537 samples (24.6%)
 **Training Configuration:**
@@ -91,15 +91,15 @@ This model is a fine-tuned version of [PekingU/rtdetr_v2_r101vd](https://hugging
 - **Tesoreria receipts mean confidence**: 0.0000 (low)
 **Performance by Object Size:**
-- **Small objects**: 0.0000
-- **Medium objects**: -1.0000
 - **Large objects**: 0.0000
 **Evaluation Dataset:**
-- **Digital invoices**: 157 samples (28.5%)
-- **Fisico receipts**: 261 samples (47.4%)
-- **Tesoreria receipts**: 133 samples (24.1%)
-- **Total evaluation samples**: 551
 **Model Configuration:**
 - **Base model**: PekingU/rtdetr_v2_r101vd
@@ -114,16 +114,16 @@ This model is a fine-tuned version of [PekingU/rtdetr_v2_r101vd](https://hugging
 - **RAM**: 83.5 GB
 - **GPU configuration**: A100 optimized
-**Training Time**: 1.19 hours
 **Training Summary:**
-- **Final training loss**: 175.9119
-- **Final learning rate**: 3.00e-07
 ### MLflow Tracking
-- **MLflow Run ID**: 7407ca0e0c584be4988a47523f45fabd
 - **MLflow Experiment**: RT-DETRv2_Voucher_Classification

 ### Training Details
 **Training Dataset:**
+- **Total Samples**: 2178
 - **Class Distribution**:
+- **digital** (id: 0): 626 samples (28.7%)
+- **tesoreria** (id: 2): 537 samples (24.7%)
+- **fisico** (id: 1): 1015 samples (46.6%)
 **Training Configuration:**
 - **Tesoreria receipts mean confidence**: 0.0000 (low)
 **Performance by Object Size:**
+- **Small objects**: -1.0000
+- **Medium objects**: 0.0000
 - **Large objects**: 0.0000
 **Evaluation Dataset:**
+- **Digital invoices**: 157 samples (28.8%)
+- **Fisico receipts**: 255 samples (46.8%)
+- **Tesoreria receipts**: 133 samples (24.4%)
+- **Total evaluation samples**: 545
 **Model Configuration:**
 - **Base model**: PekingU/rtdetr_v2_r101vd
 - **RAM**: 83.5 GB
 - **GPU configuration**: A100 optimized
+**Training Time**: 1.18 hours
 **Training Summary:**
+- **Final training loss**: 196.0164
+- **Final learning rate**: 6.00e-08
 ### MLflow Tracking
+- **MLflow Run ID**: fe7bd26bd1b344c292d9b485139be46c
 - **MLflow Experiment**: RT-DETRv2_Voucher_Classification

checkpoint-1600/config.json ADDED Viewed

	@@ -0,0 +1,129 @@

+{
+  "activation_dropout": 0.0,
+  "activation_function": "silu",
+  "anchor_image_size": null,
+  "architectures": [
+    "RTDetrV2ForObjectDetection"
+  ],
+  "attention_dropout": 0.0,
+  "auxiliary_loss": true,
+  "backbone": null,
+  "backbone_config": {
+    "depths": [
+      3,
+      4,
+      23,
+      3
+    ],
+    "downsample_in_bottleneck": false,
+    "downsample_in_first_stage": false,
+    "embedding_size": 64,
+    "hidden_act": "relu",
+    "hidden_sizes": [
+      256,
+      512,
+      1024,
+      2048
+    ],
+    "layer_type": "bottleneck",
+    "model_type": "rt_detr_resnet",
+    "num_channels": 3,
+    "out_features": [
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "out_indices": [
+      2,
+      3,
+      4
+    ],
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "torch_dtype": "float32"
+  },
+  "backbone_kwargs": null,
+  "batch_norm_eps": 1e-05,
+  "box_noise_scale": 1.0,
+  "d_model": 256,
+  "decoder_activation_function": "relu",
+  "decoder_attention_heads": 8,
+  "decoder_ffn_dim": 1024,
+  "decoder_in_channels": [
+    384,
+    384,
+    384
+  ],
+  "decoder_layers": 6,
+  "decoder_method": "default",
+  "decoder_n_levels": 3,
+  "decoder_n_points": 4,
+  "decoder_offset_scale": 0.5,
+  "disable_custom_kernels": true,
+  "dropout": 0.0,
+  "encode_proj_layers": [
+    2
+  ],
+  "encoder_activation_function": "gelu",
+  "encoder_attention_heads": 8,
+  "encoder_ffn_dim": 2048,
+  "encoder_hidden_dim": 384,
+  "encoder_in_channels": [
+    512,
+    1024,
+    2048
+  ],
+  "encoder_layers": 1,
+  "eos_coefficient": 0.0001,
+  "eval_size": null,
+  "feat_strides": [
+    8,
+    16,
+    32
+  ],
+  "focal_loss_alpha": 0.75,
+  "focal_loss_gamma": 2.0,
+  "freeze_backbone_batch_norms": true,
+  "hidden_expansion": 1.0,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "initializer_bias_prior_prob": null,
+  "initializer_range": 0.01,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "label_noise_ratio": 0.5,
+  "layer_norm_eps": 1e-05,
+  "learn_initial_query": false,
+  "matcher_alpha": 0.25,
+  "matcher_bbox_cost": 5.0,
+  "matcher_class_cost": 2.0,
+  "matcher_gamma": 2.0,
+  "matcher_giou_cost": 2.0,
+  "model_type": "rt_detr_v2",
+  "normalize_before": false,
+  "num_denoising": 100,
+  "num_feature_levels": 3,
+  "num_queries": 300,
+  "positional_encoding_temperature": 10000,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.0",
+  "use_focal_loss": true,
+  "use_pretrained_backbone": false,
+  "use_timm_backbone": false,
+  "weight_loss_bbox": 5.0,
+  "weight_loss_giou": 2.0,
+  "weight_loss_vfl": 1.0,
+  "with_box_refine": true
+}

checkpoint-1600/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:226b3771abee1cf8215eee8ff3a6c866930ba69be4144e554d6ad8d45317d5a4
+size 306699044

checkpoint-1600/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e429f0d5be726eb26d197c034148dc43ca3de0a5498a36ae73ffb251656d55e0
+size 611580433

checkpoint-1600/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "do_convert_annotations": true,
+  "do_normalize": false,
+  "do_pad": false,
+  "do_rescale": true,
+  "do_resize": true,
+  "format": "coco_detection",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "RTDetrImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "pad_size": null,
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 640,
+    "width": 640
+  }
+}

checkpoint-1600/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ec603acfcf0594fe5e0f4a5b622df7eaf620ca0be75f37087ee07cf6a8bc746
+size 14244

checkpoint-1600/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:294a2ee5d41cd38781c829986780523b3ca2c9ddcff37ad7f87c59ebe379a29a
+size 1064

checkpoint-1600/trainer_state.json ADDED Viewed

	@@ -0,0 +1,258 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 35.55555555555556,
+  "eval_steps": 500,
+  "global_step": 1600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 3663.33642578125,
+      "learning_rate": 3.6750000000000003e-07,
+      "loss": 1356.1239,
+      "step": 50
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": 3973.032470703125,
+      "learning_rate": 7.425000000000001e-07,
+      "loss": 1275.5178,
+      "step": 100
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 2667.0068359375,
+      "learning_rate": 1.1174999999999999e-06,
+      "loss": 1123.6059,
+      "step": 150
+    },
+    {
+      "epoch": 4.444444444444445,
+      "grad_norm": 2320.48486328125,
+      "learning_rate": 1.4925000000000001e-06,
+      "loss": 922.8917,
+      "step": 200
+    },
+    {
+      "epoch": 5.555555555555555,
+      "grad_norm": 1611.3345947265625,
+      "learning_rate": 1.8675000000000001e-06,
+      "loss": 714.5638,
+      "step": 250
+    },
+    {
+      "epoch": 6.666666666666667,
+      "grad_norm": 1395.7064208984375,
+      "learning_rate": 2.2425e-06,
+      "loss": 542.1251,
+      "step": 300
+    },
+    {
+      "epoch": 7.777777777777778,
+      "grad_norm": 1019.4860229492188,
+      "learning_rate": 2.6175e-06,
+      "loss": 411.1387,
+      "step": 350
+    },
+    {
+      "epoch": 8.88888888888889,
+      "grad_norm": 888.315185546875,
+      "learning_rate": 2.9925e-06,
+      "loss": 318.4567,
+      "step": 400
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 2590.113525390625,
+      "learning_rate": 3.3675000000000004e-06,
+      "loss": 261.6995,
+      "step": 450
+    },
+    {
+      "epoch": 11.11111111111111,
+      "grad_norm": 711.677734375,
+      "learning_rate": 3.7425e-06,
+      "loss": 220.2936,
+      "step": 500
+    },
+    {
+      "epoch": 12.222222222222221,
+      "grad_norm": 548.9371948242188,
+      "learning_rate": 4.117500000000001e-06,
+      "loss": 187.9833,
+      "step": 550
+    },
+    {
+      "epoch": 13.333333333333334,
+      "grad_norm": 1127.1611328125,
+      "learning_rate": 4.4925e-06,
+      "loss": 159.1351,
+      "step": 600
+    },
+    {
+      "epoch": 14.444444444444445,
+      "grad_norm": 426.074951171875,
+      "learning_rate": 4.8675e-06,
+      "loss": 137.1092,
+      "step": 650
+    },
+    {
+      "epoch": 15.555555555555555,
+      "grad_norm": 348.842529296875,
+      "learning_rate": 5.2425e-06,
+      "loss": 119.822,
+      "step": 700
+    },
+    {
+      "epoch": 16.666666666666668,
+      "grad_norm": 373.2169189453125,
+      "learning_rate": 5.6175e-06,
+      "loss": 104.3366,
+      "step": 750
+    },
+    {
+      "epoch": 17.77777777777778,
+      "grad_norm": 324.51702880859375,
+      "learning_rate": 5.992500000000001e-06,
+      "loss": 90.8788,
+      "step": 800
+    },
+    {
+      "epoch": 18.88888888888889,
+      "grad_norm": 269.91827392578125,
+      "learning_rate": 6.3675e-06,
+      "loss": 78.4644,
+      "step": 850
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 1744.54052734375,
+      "learning_rate": 6.7425e-06,
+      "loss": 70.3526,
+      "step": 900
+    },
+    {
+      "epoch": 21.11111111111111,
+      "grad_norm": 369.39837646484375,
+      "learning_rate": 7.1175e-06,
+      "loss": 63.9417,
+      "step": 950
+    },
+    {
+      "epoch": 22.22222222222222,
+      "grad_norm": 303.95977783203125,
+      "learning_rate": 7.4925e-06,
+      "loss": 63.4575,
+      "step": 1000
+    },
+    {
+      "epoch": 23.333333333333332,
+      "grad_norm": 187.462890625,
+      "learning_rate": 7.8675e-06,
+      "loss": 54.7417,
+      "step": 1050
+    },
+    {
+      "epoch": 24.444444444444443,
+      "grad_norm": 165.56666564941406,
+      "learning_rate": 8.2425e-06,
+      "loss": 49.6842,
+      "step": 1100
+    },
+    {
+      "epoch": 25.555555555555557,
+      "grad_norm": 147.3148193359375,
+      "learning_rate": 8.6175e-06,
+      "loss": 43.027,
+      "step": 1150
+    },
+    {
+      "epoch": 26.666666666666668,
+      "grad_norm": 125.53775024414062,
+      "learning_rate": 8.9925e-06,
+      "loss": 38.2579,
+      "step": 1200
+    },
+    {
+      "epoch": 27.77777777777778,
+      "grad_norm": 109.85810089111328,
+      "learning_rate": 9.367500000000001e-06,
+      "loss": 34.3957,
+      "step": 1250
+    },
+    {
+      "epoch": 28.88888888888889,
+      "grad_norm": 98.328369140625,
+      "learning_rate": 9.7425e-06,
+      "loss": 31.4378,
+      "step": 1300
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 109.77750396728516,
+      "learning_rate": 1.01175e-05,
+      "loss": 28.5084,
+      "step": 1350
+    },
+    {
+      "epoch": 31.11111111111111,
+      "grad_norm": 112.47483825683594,
+      "learning_rate": 1.04925e-05,
+      "loss": 26.1671,
+      "step": 1400
+    },
+    {
+      "epoch": 32.22222222222222,
+      "grad_norm": 85.60242462158203,
+      "learning_rate": 1.08675e-05,
+      "loss": 24.2309,
+      "step": 1450
+    },
+    {
+      "epoch": 33.333333333333336,
+      "grad_norm": 73.19799041748047,
+      "learning_rate": 1.1242500000000001e-05,
+      "loss": 22.6248,
+      "step": 1500
+    },
+    {
+      "epoch": 34.44444444444444,
+      "grad_norm": 80.70884704589844,
+      "learning_rate": 1.16175e-05,
+      "loss": 21.1187,
+      "step": 1550
+    },
+    {
+      "epoch": 35.55555555555556,
+      "grad_norm": 110.98326110839844,
+      "learning_rate": 1.19925e-05,
+      "loss": 20.4828,
+      "step": 1600
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 2250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.31006450959319e+19,
+  "train_batch_size": 24,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1600/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
+size 5368

checkpoint-1800/config.json ADDED Viewed

	@@ -0,0 +1,129 @@

+{
+  "activation_dropout": 0.0,
+  "activation_function": "silu",
+  "anchor_image_size": null,
+  "architectures": [
+    "RTDetrV2ForObjectDetection"
+  ],
+  "attention_dropout": 0.0,
+  "auxiliary_loss": true,
+  "backbone": null,
+  "backbone_config": {
+    "depths": [
+      3,
+      4,
+      23,
+      3
+    ],
+    "downsample_in_bottleneck": false,
+    "downsample_in_first_stage": false,
+    "embedding_size": 64,
+    "hidden_act": "relu",
+    "hidden_sizes": [
+      256,
+      512,
+      1024,
+      2048
+    ],
+    "layer_type": "bottleneck",
+    "model_type": "rt_detr_resnet",
+    "num_channels": 3,
+    "out_features": [
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "out_indices": [
+      2,
+      3,
+      4
+    ],
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "torch_dtype": "float32"
+  },
+  "backbone_kwargs": null,
+  "batch_norm_eps": 1e-05,
+  "box_noise_scale": 1.0,
+  "d_model": 256,
+  "decoder_activation_function": "relu",
+  "decoder_attention_heads": 8,
+  "decoder_ffn_dim": 1024,
+  "decoder_in_channels": [
+    384,
+    384,
+    384
+  ],
+  "decoder_layers": 6,
+  "decoder_method": "default",
+  "decoder_n_levels": 3,
+  "decoder_n_points": 4,
+  "decoder_offset_scale": 0.5,
+  "disable_custom_kernels": true,
+  "dropout": 0.0,
+  "encode_proj_layers": [
+    2
+  ],
+  "encoder_activation_function": "gelu",
+  "encoder_attention_heads": 8,
+  "encoder_ffn_dim": 2048,
+  "encoder_hidden_dim": 384,
+  "encoder_in_channels": [
+    512,
+    1024,
+    2048
+  ],
+  "encoder_layers": 1,
+  "eos_coefficient": 0.0001,
+  "eval_size": null,
+  "feat_strides": [
+    8,
+    16,
+    32
+  ],
+  "focal_loss_alpha": 0.75,
+  "focal_loss_gamma": 2.0,
+  "freeze_backbone_batch_norms": true,
+  "hidden_expansion": 1.0,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "initializer_bias_prior_prob": null,
+  "initializer_range": 0.01,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "label_noise_ratio": 0.5,
+  "layer_norm_eps": 1e-05,
+  "learn_initial_query": false,
+  "matcher_alpha": 0.25,
+  "matcher_bbox_cost": 5.0,
+  "matcher_class_cost": 2.0,
+  "matcher_gamma": 2.0,
+  "matcher_giou_cost": 2.0,
+  "model_type": "rt_detr_v2",
+  "normalize_before": false,
+  "num_denoising": 100,
+  "num_feature_levels": 3,
+  "num_queries": 300,
+  "positional_encoding_temperature": 10000,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.0",
+  "use_focal_loss": true,
+  "use_pretrained_backbone": false,
+  "use_timm_backbone": false,
+  "weight_loss_bbox": 5.0,
+  "weight_loss_giou": 2.0,
+  "weight_loss_vfl": 1.0,
+  "with_box_refine": true
+}

checkpoint-1800/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07ca2b0b335e81ed89a2f7f930b576eee925506e057c93002392f1e2376180f0
+size 306699044

checkpoint-1800/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9bd5f81a1ac758d99c2a50f00a75bf78c82cda77f464f12cdda567e7a65306d7
+size 611580433

checkpoint-1800/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "do_convert_annotations": true,
+  "do_normalize": false,
+  "do_pad": false,
+  "do_rescale": true,
+  "do_resize": true,
+  "format": "coco_detection",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "RTDetrImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "pad_size": null,
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 640,
+    "width": 640
+  }
+}

checkpoint-1800/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0f22c99e3d52b982805eed64c46ed79d1c93982c6433c732f7f4d4a2ab58f93
+size 14244

checkpoint-1800/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7ef919f119174a0bac8b992747548bc0ab5a6be59a5cfc5a6c8633fade536fa
+size 1064

checkpoint-1800/trainer_state.json ADDED Viewed

	@@ -0,0 +1,286 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 40.0,
+  "eval_steps": 500,
+  "global_step": 1800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 3663.33642578125,
+      "learning_rate": 3.6750000000000003e-07,
+      "loss": 1356.1239,
+      "step": 50
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": 3973.032470703125,
+      "learning_rate": 7.425000000000001e-07,
+      "loss": 1275.5178,
+      "step": 100
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 2667.0068359375,
+      "learning_rate": 1.1174999999999999e-06,
+      "loss": 1123.6059,
+      "step": 150
+    },
+    {
+      "epoch": 4.444444444444445,
+      "grad_norm": 2320.48486328125,
+      "learning_rate": 1.4925000000000001e-06,
+      "loss": 922.8917,
+      "step": 200
+    },
+    {
+      "epoch": 5.555555555555555,
+      "grad_norm": 1611.3345947265625,
+      "learning_rate": 1.8675000000000001e-06,
+      "loss": 714.5638,
+      "step": 250
+    },
+    {
+      "epoch": 6.666666666666667,
+      "grad_norm": 1395.7064208984375,
+      "learning_rate": 2.2425e-06,
+      "loss": 542.1251,
+      "step": 300
+    },
+    {
+      "epoch": 7.777777777777778,
+      "grad_norm": 1019.4860229492188,
+      "learning_rate": 2.6175e-06,
+      "loss": 411.1387,
+      "step": 350
+    },
+    {
+      "epoch": 8.88888888888889,
+      "grad_norm": 888.315185546875,
+      "learning_rate": 2.9925e-06,
+      "loss": 318.4567,
+      "step": 400
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 2590.113525390625,
+      "learning_rate": 3.3675000000000004e-06,
+      "loss": 261.6995,
+      "step": 450
+    },
+    {
+      "epoch": 11.11111111111111,
+      "grad_norm": 711.677734375,
+      "learning_rate": 3.7425e-06,
+      "loss": 220.2936,
+      "step": 500
+    },
+    {
+      "epoch": 12.222222222222221,
+      "grad_norm": 548.9371948242188,
+      "learning_rate": 4.117500000000001e-06,
+      "loss": 187.9833,
+      "step": 550
+    },
+    {
+      "epoch": 13.333333333333334,
+      "grad_norm": 1127.1611328125,
+      "learning_rate": 4.4925e-06,
+      "loss": 159.1351,
+      "step": 600
+    },
+    {
+      "epoch": 14.444444444444445,
+      "grad_norm": 426.074951171875,
+      "learning_rate": 4.8675e-06,
+      "loss": 137.1092,
+      "step": 650
+    },
+    {
+      "epoch": 15.555555555555555,
+      "grad_norm": 348.842529296875,
+      "learning_rate": 5.2425e-06,
+      "loss": 119.822,
+      "step": 700
+    },
+    {
+      "epoch": 16.666666666666668,
+      "grad_norm": 373.2169189453125,
+      "learning_rate": 5.6175e-06,
+      "loss": 104.3366,
+      "step": 750
+    },
+    {
+      "epoch": 17.77777777777778,
+      "grad_norm": 324.51702880859375,
+      "learning_rate": 5.992500000000001e-06,
+      "loss": 90.8788,
+      "step": 800
+    },
+    {
+      "epoch": 18.88888888888889,
+      "grad_norm": 269.91827392578125,
+      "learning_rate": 6.3675e-06,
+      "loss": 78.4644,
+      "step": 850
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 1744.54052734375,
+      "learning_rate": 6.7425e-06,
+      "loss": 70.3526,
+      "step": 900
+    },
+    {
+      "epoch": 21.11111111111111,
+      "grad_norm": 369.39837646484375,
+      "learning_rate": 7.1175e-06,
+      "loss": 63.9417,
+      "step": 950
+    },
+    {
+      "epoch": 22.22222222222222,
+      "grad_norm": 303.95977783203125,
+      "learning_rate": 7.4925e-06,
+      "loss": 63.4575,
+      "step": 1000
+    },
+    {
+      "epoch": 23.333333333333332,
+      "grad_norm": 187.462890625,
+      "learning_rate": 7.8675e-06,
+      "loss": 54.7417,
+      "step": 1050
+    },
+    {
+      "epoch": 24.444444444444443,
+      "grad_norm": 165.56666564941406,
+      "learning_rate": 8.2425e-06,
+      "loss": 49.6842,
+      "step": 1100
+    },
+    {
+      "epoch": 25.555555555555557,
+      "grad_norm": 147.3148193359375,
+      "learning_rate": 8.6175e-06,
+      "loss": 43.027,
+      "step": 1150
+    },
+    {
+      "epoch": 26.666666666666668,
+      "grad_norm": 125.53775024414062,
+      "learning_rate": 8.9925e-06,
+      "loss": 38.2579,
+      "step": 1200
+    },
+    {
+      "epoch": 27.77777777777778,
+      "grad_norm": 109.85810089111328,
+      "learning_rate": 9.367500000000001e-06,
+      "loss": 34.3957,
+      "step": 1250
+    },
+    {
+      "epoch": 28.88888888888889,
+      "grad_norm": 98.328369140625,
+      "learning_rate": 9.7425e-06,
+      "loss": 31.4378,
+      "step": 1300
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 109.77750396728516,
+      "learning_rate": 1.01175e-05,
+      "loss": 28.5084,
+      "step": 1350
+    },
+    {
+      "epoch": 31.11111111111111,
+      "grad_norm": 112.47483825683594,
+      "learning_rate": 1.04925e-05,
+      "loss": 26.1671,
+      "step": 1400
+    },
+    {
+      "epoch": 32.22222222222222,
+      "grad_norm": 85.60242462158203,
+      "learning_rate": 1.08675e-05,
+      "loss": 24.2309,
+      "step": 1450
+    },
+    {
+      "epoch": 33.333333333333336,
+      "grad_norm": 73.19799041748047,
+      "learning_rate": 1.1242500000000001e-05,
+      "loss": 22.6248,
+      "step": 1500
+    },
+    {
+      "epoch": 34.44444444444444,
+      "grad_norm": 80.70884704589844,
+      "learning_rate": 1.16175e-05,
+      "loss": 21.1187,
+      "step": 1550
+    },
+    {
+      "epoch": 35.55555555555556,
+      "grad_norm": 110.98326110839844,
+      "learning_rate": 1.19925e-05,
+      "loss": 20.4828,
+      "step": 1600
+    },
+    {
+      "epoch": 36.666666666666664,
+      "grad_norm": 113.65286254882812,
+      "learning_rate": 1.23675e-05,
+      "loss": 19.7366,
+      "step": 1650
+    },
+    {
+      "epoch": 37.77777777777778,
+      "grad_norm": 77.65855407714844,
+      "learning_rate": 1.27425e-05,
+      "loss": 18.6632,
+      "step": 1700
+    },
+    {
+      "epoch": 38.888888888888886,
+      "grad_norm": 88.96723175048828,
+      "learning_rate": 1.3117500000000001e-05,
+      "loss": 18.0793,
+      "step": 1750
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 79.1690902709961,
+      "learning_rate": 1.34925e-05,
+      "loss": 17.0667,
+      "step": 1800
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 2250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.848540943358362e+19,
+  "train_batch_size": 24,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1800/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
+size 5368

checkpoint-2000/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f77f03ccce08e6fc68356cfd96e1e595cd091842dfe691b57b0240f9d0ec534d
 size 306699044

 version https://git-lfs.github.com/spec/v1
+oid sha256:74e332bd7772e2e7ca2e34234d7da49a711abdbfb273e02189615d3a85b39c3b
 size 306699044

checkpoint-2000/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:232a5353d5019158bc112b85a00cc968eabb74598e1ef8a15e7f726735442ca6
 size 611580433

 version https://git-lfs.github.com/spec/v1
+oid sha256:04408b4cb5317b335a1dfa8034af865e68e9535b6db6f797f7454b76a0dbb0fd
 size 611580433

checkpoint-2000/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9b759482d4a8ad0299b9805d5a497036b386a5a5b40224d0e72b701c28b2f5ca
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:bf38b27cb6d82f34ae1038db7a7ddef1794dabfa258a278a90cdfdf859a01777
 size 14244

checkpoint-2000/trainer_state.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 43.505494505494504,
   "eval_steps": 500,
   "global_step": 2000,
   "is_hyper_param_search": false,
@@ -10,2811 +10,291 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.47619047619047616,
-      "grad_norm": 5716.10205078125,
-      "learning_rate": 3.0000000000000004e-08,
-      "loss": 1383.1007,
-      "step": 5
-    },
-    {
-      "epoch": 0.9523809523809523,
-      "grad_norm": 12343.9111328125,
-      "learning_rate": 6.75e-08,
-      "loss": 1349.0135,
-      "step": 10
-    },
-    {
-      "epoch": 1.380952380952381,
-      "grad_norm": 10210.48046875,
-      "learning_rate": 1.05e-07,
-      "loss": 1340.3351,
-      "step": 15
-    },
-    {
-      "epoch": 1.8571428571428572,
-      "grad_norm": 7633.39599609375,
-      "learning_rate": 1.425e-07,
-      "loss": 1369.3844,
-      "step": 20
-    },
-    {
-      "epoch": 0.5494505494505495,
-      "grad_norm": 12217.7294921875,
-      "learning_rate": 1.8e-07,
-      "loss": 1362.663,
-      "step": 25
-    },
-    {
-      "epoch": 0.6593406593406593,
-      "grad_norm": 31348.806640625,
-      "learning_rate": 2.175e-07,
-      "loss": 1361.9012,
-      "step": 30
-    },
-    {
-      "epoch": 0.7692307692307693,
-      "grad_norm": 7385.05322265625,
-      "learning_rate": 2.5500000000000005e-07,
-      "loss": 1365.5517,
-      "step": 35
-    },
-    {
-      "epoch": 0.8791208791208791,
-      "grad_norm": 15594.4677734375,
-      "learning_rate": 2.925e-07,
-      "loss": 1341.6402,
-      "step": 40
-    },
-    {
-      "epoch": 0.989010989010989,
-      "grad_norm": 6404.94775390625,
-      "learning_rate": 3.2999999999999996e-07,
-      "loss": 1343.9832,
-      "step": 45
-    },
-    {
-      "epoch": 1.10989010989011,
-      "grad_norm": 6927.478515625,
       "learning_rate": 3.6750000000000003e-07,
-      "loss": 1611.4768,
       "step": 50
     },
     {
-      "epoch": 1.2197802197802199,
-      "grad_norm": 12092.9287109375,
-      "learning_rate": 4.05e-07,
-      "loss": 1330.508,
-      "step": 55
-    },
-    {
-      "epoch": 1.3296703296703296,
-      "grad_norm": 5830.10693359375,
-      "learning_rate": 4.425e-07,
-      "loss": 1333.5215,
-      "step": 60
-    },
-    {
-      "epoch": 1.4395604395604396,
-      "grad_norm": 5059.5302734375,
-      "learning_rate": 4.800000000000001e-07,
-      "loss": 1305.5257,
-      "step": 65
-    },
-    {
-      "epoch": 1.5494505494505495,
-      "grad_norm": 14086.837890625,
-      "learning_rate": 5.175e-07,
-      "loss": 1293.7212,
-      "step": 70
-    },
-    {
-      "epoch": 1.6593406593406592,
-      "grad_norm": 4606.49462890625,
-      "learning_rate": 5.55e-07,
-      "loss": 1293.7292,
-      "step": 75
-    },
-    {
-      "epoch": 1.7692307692307692,
-      "grad_norm": 7120.25244140625,
-      "learning_rate": 5.925e-07,
-      "loss": 1283.8888,
-      "step": 80
-    },
-    {
-      "epoch": 1.879120879120879,
-      "grad_norm": 4402.51513671875,
-      "learning_rate": 6.3e-07,
-      "loss": 1270.6494,
-      "step": 85
-    },
-    {
-      "epoch": 1.989010989010989,
-      "grad_norm": 4826.724609375,
-      "learning_rate": 6.675e-07,
-      "loss": 1263.9209,
-      "step": 90
-    },
-    {
-      "epoch": 2.087912087912088,
-      "grad_norm": 4356.83056640625,
-      "learning_rate": 7.05e-07,
-      "loss": 1255.111,
-      "step": 95
-    },
-    {
-      "epoch": 2.197802197802198,
-      "grad_norm": 6895.61962890625,
-      "learning_rate": 7.425000000000001e-07,
-      "loss": 1224.4364,
-      "step": 100
-    },
-    {
-      "epoch": 2.3076923076923075,
-      "grad_norm": 4793.59375,
-      "learning_rate": 7.799999999999999e-07,
-      "loss": 1211.3252,
-      "step": 105
-    },
-    {
-      "epoch": 2.4175824175824174,
-      "grad_norm": 6701.6357421875,
-      "learning_rate": 8.175e-07,
-      "loss": 1183.1627,
-      "step": 110
-    },
-    {
-      "epoch": 2.5274725274725274,
-      "grad_norm": 4253.81005859375,
-      "learning_rate": 8.550000000000001e-07,
-      "loss": 1179.29,
-      "step": 115
-    },
-    {
-      "epoch": 2.6373626373626373,
-      "grad_norm": 4965.86181640625,
-      "learning_rate": 8.925e-07,
-      "loss": 1168.3469,
-      "step": 120
-    },
-    {
-      "epoch": 2.7472527472527473,
-      "grad_norm": 3376.816650390625,
-      "learning_rate": 9.3e-07,
-      "loss": 1139.6433,
-      "step": 125
-    },
-    {
-      "epoch": 2.857142857142857,
-      "grad_norm": 3226.073486328125,
-      "learning_rate": 9.675e-07,
-      "loss": 1133.0777,
-      "step": 130
-    },
-    {
-      "epoch": 2.967032967032967,
-      "grad_norm": 3560.005126953125,
-      "learning_rate": 1.0050000000000001e-06,
-      "loss": 1113.3471,
-      "step": 135
-    },
-    {
-      "epoch": 3.065934065934066,
-      "grad_norm": 2917.787109375,
-      "learning_rate": 1.0425000000000002e-06,
-      "loss": 1094.2038,
-      "step": 140
-    },
-    {
-      "epoch": 3.1758241758241756,
-      "grad_norm": 3291.990478515625,
-      "learning_rate": 1.08e-06,
-      "loss": 1069.3087,
-      "step": 145
-    },
-    {
-      "epoch": 3.2857142857142856,
-      "grad_norm": 3082.956298828125,
-      "learning_rate": 1.1174999999999999e-06,
-      "loss": 1043.3319,
-      "step": 150
-    },
-    {
-      "epoch": 3.3956043956043955,
-      "grad_norm": 2947.577392578125,
-      "learning_rate": 1.155e-06,
-      "loss": 1040.2479,
-      "step": 155
-    },
-    {
-      "epoch": 3.5054945054945055,
-      "grad_norm": 2917.072021484375,
-      "learning_rate": 1.1925e-06,
-      "loss": 1014.5039,
-      "step": 160
-    },
-    {
-      "epoch": 3.6153846153846154,
-      "grad_norm": 2572.1064453125,
-      "learning_rate": 1.23e-06,
-      "loss": 980.6475,
-      "step": 165
-    },
-    {
-      "epoch": 3.7252747252747254,
-      "grad_norm": 3038.7041015625,
-      "learning_rate": 1.2675000000000001e-06,
-      "loss": 969.8687,
-      "step": 170
-    },
-    {
-      "epoch": 3.8351648351648353,
-      "grad_norm": 2459.19482421875,
-      "learning_rate": 1.305e-06,
-      "loss": 946.8183,
-      "step": 175
-    },
-    {
-      "epoch": 3.9450549450549453,
-      "grad_norm": 2485.954345703125,
-      "learning_rate": 1.3425e-06,
-      "loss": 923.6708,
-      "step": 180
-    },
-    {
-      "epoch": 4.043956043956044,
-      "grad_norm": 2574.14599609375,
-      "learning_rate": 1.38e-06,
-      "loss": 906.8975,
-      "step": 185
-    },
-    {
-      "epoch": 4.153846153846154,
-      "grad_norm": 2498.670166015625,
-      "learning_rate": 1.4175e-06,
-      "loss": 879.6545,
-      "step": 190
-    },
-    {
-      "epoch": 4.263736263736264,
-      "grad_norm": 2621.924072265625,
-      "learning_rate": 1.455e-06,
-      "loss": 859.6516,
-      "step": 195
-    },
-    {
-      "epoch": 4.373626373626374,
-      "grad_norm": 2298.13671875,
-      "learning_rate": 1.4925000000000001e-06,
-      "loss": 835.168,
-      "step": 200
-    },
-    {
-      "epoch": 4.483516483516484,
-      "grad_norm": 2228.95458984375,
-      "learning_rate": 1.53e-06,
-      "loss": 817.8176,
-      "step": 205
-    },
-    {
-      "epoch": 4.593406593406593,
-      "grad_norm": 2075.57568359375,
-      "learning_rate": 1.5675e-06,
-      "loss": 795.5578,
-      "step": 210
-    },
-    {
-      "epoch": 4.7032967032967035,
-      "grad_norm": 2308.193603515625,
-      "learning_rate": 1.605e-06,
-      "loss": 769.8123,
-      "step": 215
-    },
-    {
-      "epoch": 4.813186813186813,
-      "grad_norm": 2495.91259765625,
-      "learning_rate": 1.6425e-06,
-      "loss": 745.4971,
-      "step": 220
-    },
-    {
-      "epoch": 4.923076923076923,
-      "grad_norm": 2490.6796875,
-      "learning_rate": 1.68e-06,
-      "loss": 724.5927,
-      "step": 225
-    },
-    {
-      "epoch": 5.021978021978022,
-      "grad_norm": 1986.7320556640625,
-      "learning_rate": 1.7175e-06,
-      "loss": 711.5865,
-      "step": 230
-    },
-    {
-      "epoch": 5.131868131868132,
-      "grad_norm": 1901.4664306640625,
-      "learning_rate": 1.7550000000000001e-06,
-      "loss": 692.5993,
-      "step": 235
-    },
-    {
-      "epoch": 5.241758241758242,
-      "grad_norm": 2567.854248046875,
-      "learning_rate": 1.7925e-06,
-      "loss": 665.2945,
-      "step": 240
-    },
-    {
-      "epoch": 5.351648351648351,
-      "grad_norm": 1668.3482666015625,
-      "learning_rate": 1.83e-06,
-      "loss": 648.977,
-      "step": 245
-    },
-    {
-      "epoch": 5.461538461538462,
-      "grad_norm": 1845.254150390625,
-      "learning_rate": 1.8675000000000001e-06,
-      "loss": 627.8005,
-      "step": 250
-    },
-    {
-      "epoch": 5.571428571428571,
-      "grad_norm": 1811.66845703125,
-      "learning_rate": 1.905e-06,
-      "loss": 606.1213,
-      "step": 255
-    },
-    {
-      "epoch": 5.681318681318682,
-      "grad_norm": 1720.802734375,
-      "learning_rate": 1.9425e-06,
-      "loss": 587.5198,
-      "step": 260
-    },
-    {
-      "epoch": 5.791208791208791,
-      "grad_norm": 1535.228759765625,
-      "learning_rate": 1.98e-06,
-      "loss": 570.1765,
-      "step": 265
-    },
-    {
-      "epoch": 5.9010989010989015,
-      "grad_norm": 1655.658447265625,
-      "learning_rate": 2.0175e-06,
-      "loss": 551.0042,
-      "step": 270
-    },
-    {
-      "epoch": 6.0,
-      "grad_norm": 3979.0732421875,
-      "learning_rate": 2.0550000000000002e-06,
-      "loss": 535.4148,
-      "step": 275
-    },
-    {
-      "epoch": 6.1098901098901095,
-      "grad_norm": 1727.4771728515625,
-      "learning_rate": 2.0925000000000003e-06,
-      "loss": 519.8219,
-      "step": 280
-    },
-    {
-      "epoch": 6.21978021978022,
-      "grad_norm": 1350.4666748046875,
-      "learning_rate": 2.13e-06,
-      "loss": 510.5004,
-      "step": 285
-    },
-    {
-      "epoch": 6.329670329670329,
-      "grad_norm": 2242.578857421875,
-      "learning_rate": 2.1675e-06,
-      "loss": 497.497,
-      "step": 290
-    },
-    {
-      "epoch": 6.43956043956044,
-      "grad_norm": 1353.7908935546875,
-      "learning_rate": 2.205e-06,
-      "loss": 476.5748,
-      "step": 295
-    },
-    {
-      "epoch": 6.549450549450549,
-      "grad_norm": 1527.2796630859375,
-      "learning_rate": 2.2425e-06,
-      "loss": 468.7679,
-      "step": 300
-    },
-    {
-      "epoch": 6.65934065934066,
-      "grad_norm": 1145.3853759765625,
-      "learning_rate": 2.28e-06,
-      "loss": 454.7695,
-      "step": 305
-    },
-    {
-      "epoch": 6.769230769230769,
-      "grad_norm": 1233.435302734375,
-      "learning_rate": 2.3175e-06,
-      "loss": 438.3474,
-      "step": 310
-    },
-    {
-      "epoch": 6.8791208791208796,
-      "grad_norm": 1669.4859619140625,
-      "learning_rate": 2.355e-06,
-      "loss": 424.2438,
-      "step": 315
-    },
-    {
-      "epoch": 6.989010989010989,
-      "grad_norm": 1667.7239990234375,
-      "learning_rate": 2.3925e-06,
-      "loss": 412.884,
-      "step": 320
-    },
-    {
-      "epoch": 7.087912087912088,
-      "grad_norm": 1087.6927490234375,
-      "learning_rate": 2.43e-06,
-      "loss": 399.3542,
-      "step": 325
-    },
-    {
-      "epoch": 7.197802197802198,
-      "grad_norm": 1415.59765625,
-      "learning_rate": 2.4675e-06,
-      "loss": 387.3436,
-      "step": 330
-    },
-    {
-      "epoch": 7.3076923076923075,
-      "grad_norm": 1952.5645751953125,
-      "learning_rate": 2.505e-06,
-      "loss": 377.6118,
-      "step": 335
-    },
-    {
-      "epoch": 7.417582417582418,
-      "grad_norm": 1404.5712890625,
-      "learning_rate": 2.5425000000000002e-06,
-      "loss": 371.8316,
-      "step": 340
-    },
-    {
-      "epoch": 7.527472527472527,
-      "grad_norm": 1185.5135498046875,
-      "learning_rate": 2.58e-06,
-      "loss": 363.4571,
-      "step": 345
-    },
-    {
-      "epoch": 7.637362637362637,
-      "grad_norm": 4727.9638671875,
-      "learning_rate": 2.6175e-06,
-      "loss": 354.3981,
-      "step": 350
-    },
-    {
-      "epoch": 7.747252747252747,
-      "grad_norm": 937.0252075195312,
-      "learning_rate": 2.655e-06,
-      "loss": 344.7615,
-      "step": 355
-    },
-    {
-      "epoch": 7.857142857142857,
-      "grad_norm": 1180.9298095703125,
-      "learning_rate": 2.6925e-06,
-      "loss": 340.5888,
-      "step": 360
-    },
-    {
-      "epoch": 7.967032967032967,
-      "grad_norm": 1405.458984375,
-      "learning_rate": 2.73e-06,
-      "loss": 326.6432,
-      "step": 365
-    },
-    {
-      "epoch": 8.065934065934066,
-      "grad_norm": 936.8318481445312,
-      "learning_rate": 2.7675e-06,
-      "loss": 317.4954,
-      "step": 370
-    },
-    {
-      "epoch": 8.175824175824175,
-      "grad_norm": 888.236328125,
-      "learning_rate": 2.8050000000000002e-06,
-      "loss": 309.4422,
-      "step": 375
-    },
-    {
-      "epoch": 8.285714285714286,
-      "grad_norm": 970.7135620117188,
-      "learning_rate": 2.8425e-06,
-      "loss": 301.2726,
-      "step": 380
-    },
-    {
-      "epoch": 8.395604395604396,
-      "grad_norm": 1607.8035888671875,
-      "learning_rate": 2.88e-06,
-      "loss": 295.7361,
-      "step": 385
-    },
-    {
-      "epoch": 8.505494505494505,
-      "grad_norm": 814.48486328125,
-      "learning_rate": 2.9175e-06,
-      "loss": 290.0745,
-      "step": 390
-    },
-    {
-      "epoch": 8.615384615384615,
-      "grad_norm": 803.1909790039062,
-      "learning_rate": 2.955e-06,
-      "loss": 279.9774,
-      "step": 395
-    },
-    {
-      "epoch": 8.725274725274724,
-      "grad_norm": 1034.652099609375,
-      "learning_rate": 2.9925e-06,
-      "loss": 277.7123,
-      "step": 400
-    },
-    {
-      "epoch": 8.835164835164836,
-      "grad_norm": 993.9649658203125,
-      "learning_rate": 3.0300000000000002e-06,
-      "loss": 267.6938,
-      "step": 405
-    },
-    {
-      "epoch": 8.945054945054945,
-      "grad_norm": 1010.8787841796875,
-      "learning_rate": 3.0675e-06,
-      "loss": 266.7134,
-      "step": 410
-    },
-    {
-      "epoch": 9.043956043956044,
-      "grad_norm": 762.2285766601562,
-      "learning_rate": 3.105e-06,
-      "loss": 264.4567,
-      "step": 415
-    },
-    {
-      "epoch": 9.153846153846153,
-      "grad_norm": 5807.85791015625,
-      "learning_rate": 3.1425e-06,
-      "loss": 254.0208,
-      "step": 420
-    },
-    {
-      "epoch": 9.263736263736265,
-      "grad_norm": 717.2619018554688,
-      "learning_rate": 3.18e-06,
-      "loss": 249.0201,
-      "step": 425
-    },
-    {
-      "epoch": 9.373626373626374,
-      "grad_norm": 725.6996459960938,
-      "learning_rate": 3.2175e-06,
-      "loss": 243.3289,
-      "step": 430
-    },
-    {
-      "epoch": 9.483516483516484,
-      "grad_norm": 761.1790161132812,
-      "learning_rate": 3.255e-06,
-      "loss": 238.8994,
-      "step": 435
-    },
-    {
-      "epoch": 9.593406593406593,
-      "grad_norm": 792.3602905273438,
-      "learning_rate": 3.2925000000000002e-06,
-      "loss": 231.8852,
-      "step": 440
-    },
-    {
-      "epoch": 9.703296703296703,
-      "grad_norm": 744.7413330078125,
-      "learning_rate": 3.3300000000000003e-06,
-      "loss": 229.136,
-      "step": 445
-    },
-    {
-      "epoch": 9.813186813186814,
-      "grad_norm": 673.6207885742188,
-      "learning_rate": 3.3675000000000004e-06,
-      "loss": 221.0368,
-      "step": 450
-    },
-    {
-      "epoch": 9.923076923076923,
-      "grad_norm": 966.46630859375,
-      "learning_rate": 3.405e-06,
-      "loss": 217.8422,
-      "step": 455
-    },
-    {
-      "epoch": 10.021978021978022,
-      "grad_norm": 740.294921875,
-      "learning_rate": 3.4425e-06,
-      "loss": 228.7085,
-      "step": 460
-    },
-    {
-      "epoch": 10.131868131868131,
-      "grad_norm": 629.9981689453125,
-      "learning_rate": 3.48e-06,
-      "loss": 211.0439,
-      "step": 465
-    },
-    {
-      "epoch": 10.241758241758241,
-      "grad_norm": 809.6885375976562,
-      "learning_rate": 3.5174999999999998e-06,
-      "loss": 204.4649,
-      "step": 470
-    },
-    {
-      "epoch": 10.351648351648352,
-      "grad_norm": 1631.996337890625,
-      "learning_rate": 3.555e-06,
-      "loss": 204.0642,
-      "step": 475
-    },
-    {
-      "epoch": 10.461538461538462,
-      "grad_norm": 958.5594482421875,
-      "learning_rate": 3.5925e-06,
-      "loss": 199.6854,
-      "step": 480
-    },
-    {
-      "epoch": 10.571428571428571,
-      "grad_norm": 588.5241088867188,
-      "learning_rate": 3.63e-06,
-      "loss": 196.1547,
-      "step": 485
-    },
-    {
-      "epoch": 10.68131868131868,
-      "grad_norm": 1439.742919921875,
-      "learning_rate": 3.6675e-06,
-      "loss": 194.8075,
-      "step": 490
-    },
-    {
-      "epoch": 10.791208791208792,
-      "grad_norm": 547.2682495117188,
-      "learning_rate": 3.705e-06,
-      "loss": 190.5422,
-      "step": 495
-    },
-    {
-      "epoch": 10.901098901098901,
-      "grad_norm": 550.932373046875,
-      "learning_rate": 3.7425e-06,
-      "loss": 188.0797,
-      "step": 500
-    },
-    {
-      "epoch": 11.0,
-      "grad_norm": 1168.6212158203125,
-      "learning_rate": 3.7800000000000002e-06,
-      "loss": 180.8073,
-      "step": 505
-    },
-    {
-      "epoch": 11.10989010989011,
-      "grad_norm": 613.2390747070312,
-      "learning_rate": 3.8175e-06,
-      "loss": 181.0391,
-      "step": 510
-    },
-    {
-      "epoch": 11.219780219780219,
-      "grad_norm": 589.2745361328125,
-      "learning_rate": 3.855e-06,
-      "loss": 175.6959,
-      "step": 515
-    },
-    {
-      "epoch": 11.32967032967033,
-      "grad_norm": 531.6744995117188,
-      "learning_rate": 3.8925000000000004e-06,
-      "loss": 174.1553,
-      "step": 520
-    },
-    {
-      "epoch": 11.43956043956044,
-      "grad_norm": 514.2449340820312,
-      "learning_rate": 3.9300000000000005e-06,
-      "loss": 172.9302,
-      "step": 525
-    },
-    {
-      "epoch": 11.54945054945055,
-      "grad_norm": 532.4719848632812,
-      "learning_rate": 3.9675000000000006e-06,
-      "loss": 168.4628,
-      "step": 530
-    },
-    {
-      "epoch": 11.659340659340659,
-      "grad_norm": 475.0870361328125,
-      "learning_rate": 4.005000000000001e-06,
-      "loss": 164.1821,
-      "step": 535
-    },
-    {
-      "epoch": 11.76923076923077,
-      "grad_norm": 531.968994140625,
-      "learning_rate": 4.042500000000001e-06,
-      "loss": 160.9798,
-      "step": 540
-    },
-    {
-      "epoch": 11.87912087912088,
-      "grad_norm": 1276.7720947265625,
-      "learning_rate": 4.080000000000001e-06,
-      "loss": 161.732,
-      "step": 545
-    },
-    {
-      "epoch": 11.989010989010989,
-      "grad_norm": 566.782958984375,
-      "learning_rate": 4.117500000000001e-06,
-      "loss": 157.6948,
-      "step": 550
-    },
-    {
-      "epoch": 12.087912087912088,
-      "grad_norm": 749.6525268554688,
-      "learning_rate": 4.155000000000001e-06,
-      "loss": 157.8984,
-      "step": 555
-    },
-    {
-      "epoch": 12.197802197802197,
-      "grad_norm": 465.9924011230469,
-      "learning_rate": 4.1925e-06,
-      "loss": 151.8522,
-      "step": 560
-    },
-    {
-      "epoch": 12.307692307692308,
-      "grad_norm": 508.3736572265625,
-      "learning_rate": 4.229999999999999e-06,
-      "loss": 148.1901,
-      "step": 565
-    },
-    {
-      "epoch": 12.417582417582418,
-      "grad_norm": 1925.7998046875,
-      "learning_rate": 4.267499999999999e-06,
-      "loss": 148.9536,
-      "step": 570
-    },
-    {
-      "epoch": 12.527472527472527,
-      "grad_norm": 528.435546875,
-      "learning_rate": 4.3049999999999994e-06,
-      "loss": 146.6833,
-      "step": 575
-    },
-    {
-      "epoch": 12.637362637362637,
-      "grad_norm": 629.6632690429688,
-      "learning_rate": 4.3424999999999995e-06,
-      "loss": 141.5915,
-      "step": 580
-    },
-    {
-      "epoch": 12.747252747252748,
-      "grad_norm": 475.8123474121094,
-      "learning_rate": 4.3799999999999996e-06,
-      "loss": 139.1265,
-      "step": 585
-    },
-    {
-      "epoch": 12.857142857142858,
-      "grad_norm": 545.7217407226562,
-      "learning_rate": 4.4175e-06,
-      "loss": 135.9861,
-      "step": 590
-    },
-    {
-      "epoch": 12.967032967032967,
-      "grad_norm": 2502.422607421875,
-      "learning_rate": 4.455e-06,
-      "loss": 138.432,
-      "step": 595
-    },
-    {
-      "epoch": 13.065934065934066,
-      "grad_norm": 432.36279296875,
-      "learning_rate": 4.4925e-06,
-      "loss": 144.0422,
-      "step": 600
-    },
-    {
-      "epoch": 13.175824175824175,
-      "grad_norm": 429.08685302734375,
-      "learning_rate": 4.53e-06,
-      "loss": 132.7908,
-      "step": 605
-    },
-    {
-      "epoch": 13.285714285714286,
-      "grad_norm": 467.1208801269531,
-      "learning_rate": 4.5675e-06,
-      "loss": 130.9718,
-      "step": 610
-    },
-    {
-      "epoch": 13.395604395604396,
-      "grad_norm": 1329.150634765625,
-      "learning_rate": 4.605e-06,
-      "loss": 134.7514,
-      "step": 615
-    },
-    {
-      "epoch": 13.505494505494505,
-      "grad_norm": 404.3941650390625,
-      "learning_rate": 4.6425e-06,
-      "loss": 127.4091,
-      "step": 620
-    },
-    {
-      "epoch": 13.615384615384615,
-      "grad_norm": 439.34234619140625,
-      "learning_rate": 4.68e-06,
-      "loss": 123.8929,
-      "step": 625
-    },
-    {
-      "epoch": 13.725274725274724,
-      "grad_norm": 418.9570007324219,
-      "learning_rate": 4.7175e-06,
-      "loss": 122.8744,
-      "step": 630
-    },
-    {
-      "epoch": 13.835164835164836,
-      "grad_norm": 396.3748474121094,
-      "learning_rate": 4.755e-06,
-      "loss": 121.2023,
-      "step": 635
-    },
-    {
-      "epoch": 13.945054945054945,
-      "grad_norm": 617.2871704101562,
-      "learning_rate": 4.7925e-06,
-      "loss": 121.1829,
-      "step": 640
-    },
-    {
-      "epoch": 14.043956043956044,
-      "grad_norm": 409.8064270019531,
-      "learning_rate": 4.83e-06,
-      "loss": 117.2707,
-      "step": 645
-    },
-    {
-      "epoch": 14.153846153846153,
-      "grad_norm": 516.0888671875,
-      "learning_rate": 4.8675e-06,
-      "loss": 118.2377,
-      "step": 650
-    },
-    {
-      "epoch": 14.263736263736265,
-      "grad_norm": 505.36480712890625,
-      "learning_rate": 4.9050000000000005e-06,
-      "loss": 116.6757,
-      "step": 655
-    },
-    {
-      "epoch": 14.373626373626374,
-      "grad_norm": 449.5859680175781,
-      "learning_rate": 4.9425000000000005e-06,
-      "loss": 114.7447,
-      "step": 660
-    },
-    {
-      "epoch": 14.483516483516484,
-      "grad_norm": 420.5121765136719,
-      "learning_rate": 4.980000000000001e-06,
-      "loss": 113.2027,
-      "step": 665
-    },
-    {
-      "epoch": 14.593406593406593,
-      "grad_norm": 345.0196838378906,
-      "learning_rate": 5.017500000000001e-06,
-      "loss": 111.8875,
-      "step": 670
-    },
-    {
-      "epoch": 14.703296703296703,
-      "grad_norm": 377.2486877441406,
-      "learning_rate": 5.055000000000001e-06,
-      "loss": 107.6133,
-      "step": 675
-    },
-    {
-      "epoch": 14.813186813186814,
-      "grad_norm": 384.3365783691406,
-      "learning_rate": 5.092500000000001e-06,
-      "loss": 109.3438,
-      "step": 680
-    },
-    {
-      "epoch": 14.923076923076923,
-      "grad_norm": 385.41998291015625,
-      "learning_rate": 5.130000000000001e-06,
-      "loss": 103.7929,
-      "step": 685
-    },
-    {
-      "epoch": 15.021978021978022,
-      "grad_norm": 362.2362976074219,
-      "learning_rate": 5.1675e-06,
-      "loss": 110.7998,
-      "step": 690
-    },
-    {
-      "epoch": 15.131868131868131,
-      "grad_norm": 351.775146484375,
-      "learning_rate": 5.205e-06,
-      "loss": 100.9369,
-      "step": 695
-    },
-    {
-      "epoch": 15.241758241758241,
-      "grad_norm": 390.7868347167969,
-      "learning_rate": 5.2425e-06,
-      "loss": 103.8922,
-      "step": 700
-    },
-    {
-      "epoch": 15.351648351648352,
-      "grad_norm": 344.78564453125,
-      "learning_rate": 5.279999999999999e-06,
-      "loss": 99.1799,
-      "step": 705
-    },
-    {
-      "epoch": 15.461538461538462,
-      "grad_norm": 378.85675048828125,
-      "learning_rate": 5.3174999999999995e-06,
-      "loss": 98.0914,
-      "step": 710
-    },
-    {
-      "epoch": 15.571428571428571,
-      "grad_norm": 342.2828063964844,
-      "learning_rate": 5.3549999999999996e-06,
-      "loss": 97.6728,
-      "step": 715
-    },
-    {
-      "epoch": 15.68131868131868,
-      "grad_norm": 341.1894226074219,
-      "learning_rate": 5.3925e-06,
-      "loss": 98.8686,
-      "step": 720
-    },
-    {
-      "epoch": 15.791208791208792,
-      "grad_norm": 336.378662109375,
-      "learning_rate": 5.43e-06,
-      "loss": 93.9906,
-      "step": 725
-    },
-    {
-      "epoch": 15.901098901098901,
-      "grad_norm": 334.4036560058594,
-      "learning_rate": 5.4675e-06,
-      "loss": 93.7508,
-      "step": 730
-    },
-    {
-      "epoch": 16.0,
-      "grad_norm": 1449.2354736328125,
-      "learning_rate": 5.505e-06,
-      "loss": 96.8374,
-      "step": 735
-    },
-    {
-      "epoch": 16.10989010989011,
-      "grad_norm": 341.5977478027344,
-      "learning_rate": 5.5425e-06,
-      "loss": 90.1757,
-      "step": 740
-    },
-    {
-      "epoch": 16.21978021978022,
-      "grad_norm": 562.0274658203125,
-      "learning_rate": 5.58e-06,
-      "loss": 89.7033,
-      "step": 745
-    },
-    {
-      "epoch": 16.32967032967033,
-      "grad_norm": 341.64959716796875,
-      "learning_rate": 5.6175e-06,
-      "loss": 90.8916,
-      "step": 750
-    },
-    {
-      "epoch": 16.439560439560438,
-      "grad_norm": 305.3785400390625,
-      "learning_rate": 5.655e-06,
-      "loss": 87.8087,
-      "step": 755
-    },
-    {
-      "epoch": 16.54945054945055,
-      "grad_norm": 317.1134338378906,
-      "learning_rate": 5.6925e-06,
-      "loss": 87.2254,
-      "step": 760
-    },
-    {
-      "epoch": 16.65934065934066,
-      "grad_norm": 305.0978088378906,
-      "learning_rate": 5.73e-06,
-      "loss": 86.9592,
-      "step": 765
-    },
-    {
-      "epoch": 16.76923076923077,
-      "grad_norm": 280.41778564453125,
-      "learning_rate": 5.7675e-06,
-      "loss": 83.7875,
-      "step": 770
-    },
-    {
-      "epoch": 16.87912087912088,
-      "grad_norm": 318.2386169433594,
-      "learning_rate": 5.805e-06,
-      "loss": 87.919,
-      "step": 775
-    },
-    {
-      "epoch": 16.98901098901099,
-      "grad_norm": 278.9162902832031,
-      "learning_rate": 5.8425e-06,
-      "loss": 82.1755,
-      "step": 780
-    },
-    {
-      "epoch": 17.087912087912088,
-      "grad_norm": 267.6657409667969,
-      "learning_rate": 5.8800000000000005e-06,
-      "loss": 84.2234,
-      "step": 785
-    },
-    {
-      "epoch": 17.197802197802197,
-      "grad_norm": 286.16162109375,
-      "learning_rate": 5.9175000000000005e-06,
-      "loss": 81.0474,
-      "step": 790
-    },
-    {
-      "epoch": 17.307692307692307,
-      "grad_norm": 288.74151611328125,
-      "learning_rate": 5.955000000000001e-06,
-      "loss": 80.2032,
-      "step": 795
-    },
-    {
-      "epoch": 17.417582417582416,
-      "grad_norm": 293.4186096191406,
-      "learning_rate": 5.992500000000001e-06,
-      "loss": 79.8146,
-      "step": 800
-    },
-    {
-      "epoch": 17.52747252747253,
-      "grad_norm": 294.4632873535156,
-      "learning_rate": 6.030000000000001e-06,
-      "loss": 78.9017,
-      "step": 805
-    },
-    {
-      "epoch": 17.63736263736264,
-      "grad_norm": 278.7751159667969,
-      "learning_rate": 6.067500000000001e-06,
-      "loss": 74.4998,
-      "step": 810
-    },
-    {
-      "epoch": 17.747252747252748,
-      "grad_norm": 281.1651306152344,
-      "learning_rate": 6.105e-06,
-      "loss": 73.4448,
-      "step": 815
-    },
-    {
-      "epoch": 17.857142857142858,
-      "grad_norm": 253.44654846191406,
-      "learning_rate": 6.1425e-06,
-      "loss": 72.5747,
-      "step": 820
-    },
-    {
-      "epoch": 17.967032967032967,
-      "grad_norm": 270.03265380859375,
-      "learning_rate": 6.18e-06,
-      "loss": 72.7589,
-      "step": 825
-    },
-    {
-      "epoch": 18.065934065934066,
-      "grad_norm": 275.12652587890625,
-      "learning_rate": 6.2175e-06,
-      "loss": 72.1071,
-      "step": 830
-    },
-    {
-      "epoch": 18.175824175824175,
-      "grad_norm": 267.95965576171875,
-      "learning_rate": 6.255e-06,
-      "loss": 71.183,
-      "step": 835
-    },
-    {
-      "epoch": 18.285714285714285,
-      "grad_norm": 250.47987365722656,
-      "learning_rate": 6.2925e-06,
-      "loss": 72.3531,
-      "step": 840
-    },
-    {
-      "epoch": 18.395604395604394,
-      "grad_norm": 284.7598876953125,
-      "learning_rate": 6.3299999999999995e-06,
-      "loss": 75.0338,
-      "step": 845
-    },
-    {
-      "epoch": 18.505494505494504,
-      "grad_norm": 303.2804260253906,
-      "learning_rate": 6.3675e-06,
-      "loss": 73.4008,
-      "step": 850
-    },
-    {
-      "epoch": 18.615384615384617,
-      "grad_norm": 245.9633026123047,
-      "learning_rate": 6.405e-06,
-      "loss": 69.7974,
-      "step": 855
-    },
-    {
-      "epoch": 18.725274725274726,
-      "grad_norm": 243.55807495117188,
-      "learning_rate": 6.4425e-06,
-      "loss": 67.0503,
-      "step": 860
-    },
-    {
-      "epoch": 18.835164835164836,
-      "grad_norm": 274.70330810546875,
-      "learning_rate": 6.48e-06,
-      "loss": 67.1543,
-      "step": 865
-    },
-    {
-      "epoch": 18.945054945054945,
-      "grad_norm": 233.40606689453125,
-      "learning_rate": 6.5175e-06,
-      "loss": 64.5814,
-      "step": 870
-    },
-    {
-      "epoch": 19.043956043956044,
-      "grad_norm": 240.22552490234375,
-      "learning_rate": 6.555e-06,
-      "loss": 79.7664,
-      "step": 875
-    },
-    {
-      "epoch": 19.153846153846153,
-      "grad_norm": 242.1979522705078,
-      "learning_rate": 6.5925e-06,
-      "loss": 64.3813,
-      "step": 880
-    },
-    {
-      "epoch": 19.263736263736263,
-      "grad_norm": 231.37562561035156,
-      "learning_rate": 6.63e-06,
-      "loss": 65.6568,
-      "step": 885
-    },
-    {
-      "epoch": 19.373626373626372,
-      "grad_norm": 235.3353271484375,
-      "learning_rate": 6.6675e-06,
-      "loss": 63.7893,
-      "step": 890
-    },
-    {
-      "epoch": 19.483516483516482,
-      "grad_norm": 322.8166198730469,
-      "learning_rate": 6.705e-06,
-      "loss": 64.4515,
-      "step": 895
-    },
-    {
-      "epoch": 19.593406593406595,
-      "grad_norm": 226.3159637451172,
-      "learning_rate": 6.7425e-06,
-      "loss": 63.5289,
-      "step": 900
-    },
-    {
-      "epoch": 19.703296703296704,
-      "grad_norm": 218.580322265625,
-      "learning_rate": 6.78e-06,
-      "loss": 61.9305,
-      "step": 905
-    },
-    {
-      "epoch": 19.813186813186814,
-      "grad_norm": 263.286376953125,
-      "learning_rate": 6.8175e-06,
-      "loss": 58.6838,
-      "step": 910
-    },
-    {
-      "epoch": 19.923076923076923,
-      "grad_norm": 208.91209411621094,
-      "learning_rate": 6.8550000000000004e-06,
-      "loss": 57.9055,
-      "step": 915
-    },
-    {
-      "epoch": 20.021978021978022,
-      "grad_norm": 211.89749145507812,
-      "learning_rate": 6.8925000000000005e-06,
-      "loss": 59.9611,
-      "step": 920
-    },
-    {
-      "epoch": 20.13186813186813,
-      "grad_norm": 203.60804748535156,
-      "learning_rate": 6.9300000000000006e-06,
-      "loss": 57.4417,
-      "step": 925
-    },
-    {
-      "epoch": 20.24175824175824,
-      "grad_norm": 217.0232696533203,
-      "learning_rate": 6.967500000000001e-06,
-      "loss": 57.6954,
-      "step": 930
-    },
-    {
-      "epoch": 20.35164835164835,
-      "grad_norm": 215.05755615234375,
-      "learning_rate": 7.005000000000001e-06,
-      "loss": 58.4159,
-      "step": 935
-    },
-    {
-      "epoch": 20.46153846153846,
-      "grad_norm": 208.3353729248047,
-      "learning_rate": 7.0425e-06,
-      "loss": 54.717,
-      "step": 940
-    },
-    {
-      "epoch": 20.571428571428573,
-      "grad_norm": 205.00282287597656,
-      "learning_rate": 7.08e-06,
-      "loss": 53.7791,
-      "step": 945
-    },
-    {
-      "epoch": 20.681318681318682,
-      "grad_norm": 197.68243408203125,
-      "learning_rate": 7.1175e-06,
-      "loss": 53.9224,
-      "step": 950
-    },
-    {
-      "epoch": 20.791208791208792,
-      "grad_norm": 183.80923461914062,
-      "learning_rate": 7.155e-06,
-      "loss": 51.6971,
-      "step": 955
-    },
-    {
-      "epoch": 20.9010989010989,
-      "grad_norm": 201.22625732421875,
-      "learning_rate": 7.1925e-06,
-      "loss": 50.1511,
-      "step": 960
-    },
-    {
-      "epoch": 21.0,
-      "grad_norm": 433.5864562988281,
-      "learning_rate": 7.23e-06,
-      "loss": 53.6671,
-      "step": 965
-    },
-    {
-      "epoch": 21.10989010989011,
-      "grad_norm": 180.5747833251953,
-      "learning_rate": 7.2675e-06,
-      "loss": 51.5476,
-      "step": 970
-    },
-    {
-      "epoch": 21.21978021978022,
-      "grad_norm": 187.3274383544922,
-      "learning_rate": 7.305e-06,
-      "loss": 49.8905,
-      "step": 975
-    },
-    {
-      "epoch": 21.32967032967033,
-      "grad_norm": 175.39138793945312,
-      "learning_rate": 7.3425000000000004e-06,
-      "loss": 50.4588,
-      "step": 980
-    },
-    {
-      "epoch": 21.439560439560438,
-      "grad_norm": 173.7222900390625,
-      "learning_rate": 7.3800000000000005e-06,
-      "loss": 48.5218,
-      "step": 985
-    },
-    {
-      "epoch": 21.54945054945055,
-      "grad_norm": 429.6211242675781,
-      "learning_rate": 7.4175e-06,
-      "loss": 48.636,
-      "step": 990
-    },
-    {
-      "epoch": 21.65934065934066,
-      "grad_norm": 178.93069458007812,
-      "learning_rate": 7.455e-06,
-      "loss": 49.8087,
-      "step": 995
-    },
-    {
-      "epoch": 21.76923076923077,
-      "grad_norm": 232.20291137695312,
-      "learning_rate": 7.4925e-06,
-      "loss": 47.1187,
-      "step": 1000
-    },
-    {
-      "epoch": 21.87912087912088,
-      "grad_norm": 181.4752655029297,
-      "learning_rate": 7.53e-06,
-      "loss": 51.2776,
-      "step": 1005
-    },
-    {
-      "epoch": 21.98901098901099,
-      "grad_norm": 167.86483764648438,
-      "learning_rate": 7.567499999999999e-06,
-      "loss": 46.6794,
-      "step": 1010
-    },
-    {
-      "epoch": 22.087912087912088,
-      "grad_norm": 172.2127685546875,
-      "learning_rate": 7.605e-06,
-      "loss": 44.9643,
-      "step": 1015
-    },
-    {
-      "epoch": 22.197802197802197,
-      "grad_norm": 184.90206909179688,
-      "learning_rate": 7.6425e-06,
-      "loss": 45.4152,
-      "step": 1020
-    },
-    {
-      "epoch": 22.307692307692307,
-      "grad_norm": 155.5305938720703,
-      "learning_rate": 7.680000000000001e-06,
-      "loss": 45.4304,
-      "step": 1025
-    },
-    {
-      "epoch": 22.417582417582416,
-      "grad_norm": 670.8698120117188,
-      "learning_rate": 7.7175e-06,
-      "loss": 44.4255,
-      "step": 1030
-    },
-    {
-      "epoch": 22.52747252747253,
-      "grad_norm": 162.875244140625,
-      "learning_rate": 7.755000000000001e-06,
-      "loss": 43.0364,
-      "step": 1035
-    },
-    {
-      "epoch": 22.63736263736264,
-      "grad_norm": 159.903076171875,
-      "learning_rate": 7.7925e-06,
-      "loss": 44.4935,
-      "step": 1040
-    },
-    {
-      "epoch": 22.747252747252748,
-      "grad_norm": 161.0757598876953,
-      "learning_rate": 7.830000000000001e-06,
-      "loss": 43.3238,
-      "step": 1045
-    },
-    {
-      "epoch": 22.857142857142858,
-      "grad_norm": 140.3941650390625,
-      "learning_rate": 7.8675e-06,
-      "loss": 42.2547,
-      "step": 1050
-    },
-    {
-      "epoch": 22.967032967032967,
-      "grad_norm": 164.62034606933594,
-      "learning_rate": 7.905000000000001e-06,
-      "loss": 42.7775,
-      "step": 1055
-    },
-    {
-      "epoch": 23.065934065934066,
-      "grad_norm": 148.73211669921875,
-      "learning_rate": 7.942499999999999e-06,
-      "loss": 42.7699,
-      "step": 1060
-    },
-    {
-      "epoch": 23.175824175824175,
-      "grad_norm": 137.26893615722656,
-      "learning_rate": 7.98e-06,
-      "loss": 40.5411,
-      "step": 1065
-    },
-    {
-      "epoch": 23.285714285714285,
-      "grad_norm": 143.8221893310547,
-      "learning_rate": 8.017499999999999e-06,
-      "loss": 39.9001,
-      "step": 1070
-    },
-    {
-      "epoch": 23.395604395604394,
-      "grad_norm": 160.19586181640625,
-      "learning_rate": 8.055e-06,
-      "loss": 39.8523,
-      "step": 1075
-    },
-    {
-      "epoch": 23.505494505494504,
-      "grad_norm": 146.14981079101562,
-      "learning_rate": 8.0925e-06,
-      "loss": 39.7639,
-      "step": 1080
-    },
-    {
-      "epoch": 23.615384615384617,
-      "grad_norm": 132.55712890625,
-      "learning_rate": 8.13e-06,
-      "loss": 39.7051,
-      "step": 1085
-    },
-    {
-      "epoch": 23.725274725274726,
-      "grad_norm": 137.0689239501953,
-      "learning_rate": 8.1675e-06,
-      "loss": 39.1014,
-      "step": 1090
-    },
-    {
-      "epoch": 23.835164835164836,
-      "grad_norm": 125.66040802001953,
-      "learning_rate": 8.205e-06,
-      "loss": 38.4976,
-      "step": 1095
-    },
-    {
-      "epoch": 23.945054945054945,
-      "grad_norm": 118.39325714111328,
-      "learning_rate": 8.2425e-06,
-      "loss": 39.2203,
-      "step": 1100
-    },
-    {
-      "epoch": 24.043956043956044,
-      "grad_norm": 131.4208221435547,
-      "learning_rate": 8.28e-06,
-      "loss": 37.6028,
-      "step": 1105
-    },
-    {
-      "epoch": 24.153846153846153,
-      "grad_norm": 131.80392456054688,
-      "learning_rate": 8.3175e-06,
-      "loss": 38.8382,
-      "step": 1110
-    },
-    {
-      "epoch": 24.263736263736263,
-      "grad_norm": 120.43126678466797,
-      "learning_rate": 8.355e-06,
-      "loss": 37.7083,
-      "step": 1115
-    },
-    {
-      "epoch": 24.373626373626372,
-      "grad_norm": 147.37368774414062,
-      "learning_rate": 8.3925e-06,
-      "loss": 36.4178,
-      "step": 1120
-    },
-    {
-      "epoch": 24.483516483516482,
-      "grad_norm": 162.19711303710938,
-      "learning_rate": 8.43e-06,
-      "loss": 36.3008,
-      "step": 1125
-    },
-    {
-      "epoch": 24.593406593406595,
-      "grad_norm": 128.9326171875,
-      "learning_rate": 8.4675e-06,
-      "loss": 35.7747,
-      "step": 1130
-    },
-    {
-      "epoch": 24.703296703296704,
-      "grad_norm": 123.9283218383789,
-      "learning_rate": 8.504999999999999e-06,
-      "loss": 33.9809,
-      "step": 1135
-    },
-    {
-      "epoch": 24.813186813186814,
-      "grad_norm": 125.92423248291016,
-      "learning_rate": 8.5425e-06,
-      "loss": 35.4138,
-      "step": 1140
-    },
-    {
-      "epoch": 24.923076923076923,
-      "grad_norm": 129.04722595214844,
-      "learning_rate": 8.58e-06,
-      "loss": 35.177,
-      "step": 1145
-    },
-    {
-      "epoch": 25.021978021978022,
-      "grad_norm": 126.06185913085938,
-      "learning_rate": 8.6175e-06,
-      "loss": 34.5535,
-      "step": 1150
-    },
-    {
-      "epoch": 25.13186813186813,
-      "grad_norm": 122.82085418701172,
-      "learning_rate": 8.655e-06,
-      "loss": 34.716,
-      "step": 1155
-    },
-    {
-      "epoch": 25.24175824175824,
-      "grad_norm": 110.43183898925781,
-      "learning_rate": 8.6925e-06,
-      "loss": 33.8336,
-      "step": 1160
-    },
-    {
-      "epoch": 25.35164835164835,
-      "grad_norm": 111.39581298828125,
-      "learning_rate": 8.73e-06,
-      "loss": 33.2828,
-      "step": 1165
-    },
-    {
-      "epoch": 25.46153846153846,
-      "grad_norm": 133.24420166015625,
-      "learning_rate": 8.7675e-06,
-      "loss": 33.4885,
-      "step": 1170
-    },
-    {
-      "epoch": 25.571428571428573,
-      "grad_norm": 117.64434051513672,
-      "learning_rate": 8.805e-06,
-      "loss": 33.8398,
-      "step": 1175
-    },
-    {
-      "epoch": 25.681318681318682,
-      "grad_norm": 117.87113952636719,
-      "learning_rate": 8.8425e-06,
-      "loss": 31.6979,
-      "step": 1180
-    },
-    {
-      "epoch": 25.791208791208792,
-      "grad_norm": 115.08634948730469,
-      "learning_rate": 8.88e-06,
-      "loss": 32.0385,
-      "step": 1185
-    },
-    {
-      "epoch": 25.9010989010989,
-      "grad_norm": 127.63653564453125,
-      "learning_rate": 8.9175e-06,
-      "loss": 31.6513,
-      "step": 1190
-    },
-    {
-      "epoch": 26.0,
-      "grad_norm": 283.0741271972656,
-      "learning_rate": 8.955e-06,
-      "loss": 33.2661,
-      "step": 1195
-    },
-    {
-      "epoch": 26.10989010989011,
-      "grad_norm": 110.21710968017578,
-      "learning_rate": 8.9925e-06,
-      "loss": 30.7823,
-      "step": 1200
-    },
-    {
-      "epoch": 26.21978021978022,
-      "grad_norm": 100.66688537597656,
-      "learning_rate": 9.03e-06,
-      "loss": 31.5021,
-      "step": 1205
-    },
-    {
-      "epoch": 26.32967032967033,
-      "grad_norm": 105.822021484375,
-      "learning_rate": 9.067500000000001e-06,
-      "loss": 30.0609,
-      "step": 1210
-    },
-    {
-      "epoch": 26.439560439560438,
-      "grad_norm": 99.89389038085938,
-      "learning_rate": 9.105e-06,
-      "loss": 30.5733,
-      "step": 1215
-    },
-    {
-      "epoch": 26.54945054945055,
-      "grad_norm": 96.10974884033203,
-      "learning_rate": 9.142500000000001e-06,
-      "loss": 29.9534,
-      "step": 1220
-    },
-    {
-      "epoch": 26.65934065934066,
-      "grad_norm": 112.5010986328125,
-      "learning_rate": 9.18e-06,
-      "loss": 30.4645,
-      "step": 1225
-    },
-    {
-      "epoch": 26.76923076923077,
-      "grad_norm": 113.7469253540039,
-      "learning_rate": 9.217500000000001e-06,
-      "loss": 29.8336,
-      "step": 1230
-    },
-    {
-      "epoch": 26.87912087912088,
-      "grad_norm": 99.2851791381836,
-      "learning_rate": 9.255e-06,
-      "loss": 28.8012,
-      "step": 1235
-    },
-    {
-      "epoch": 26.98901098901099,
-      "grad_norm": 92.29454040527344,
-      "learning_rate": 9.292500000000001e-06,
-      "loss": 29.5607,
-      "step": 1240
-    },
-    {
-      "epoch": 27.087912087912088,
-      "grad_norm": 368.2291564941406,
-      "learning_rate": 9.33e-06,
-      "loss": 57.1964,
-      "step": 1245
-    },
-    {
-      "epoch": 27.197802197802197,
-      "grad_norm": 87.99185943603516,
-      "learning_rate": 9.367500000000001e-06,
-      "loss": 29.8244,
-      "step": 1250
-    },
-    {
-      "epoch": 27.307692307692307,
-      "grad_norm": 87.86891174316406,
-      "learning_rate": 9.405e-06,
-      "loss": 29.159,
-      "step": 1255
-    },
-    {
-      "epoch": 27.417582417582416,
-      "grad_norm": 106.52743530273438,
-      "learning_rate": 9.4425e-06,
-      "loss": 28.5417,
-      "step": 1260
-    },
-    {
-      "epoch": 27.52747252747253,
-      "grad_norm": 105.37548828125,
-      "learning_rate": 9.48e-06,
-      "loss": 29.6107,
-      "step": 1265
-    },
-    {
-      "epoch": 27.63736263736264,
-      "grad_norm": 126.07006072998047,
-      "learning_rate": 9.5175e-06,
-      "loss": 28.5252,
-      "step": 1270
-    },
-    {
-      "epoch": 27.747252747252748,
-      "grad_norm": 102.43346405029297,
-      "learning_rate": 9.555e-06,
-      "loss": 26.8626,
-      "step": 1275
-    },
-    {
-      "epoch": 27.857142857142858,
-      "grad_norm": 95.87886047363281,
-      "learning_rate": 9.5925e-06,
-      "loss": 27.4449,
-      "step": 1280
-    },
-    {
-      "epoch": 27.967032967032967,
-      "grad_norm": 183.47979736328125,
-      "learning_rate": 9.630000000000001e-06,
-      "loss": 26.4121,
-      "step": 1285
-    },
-    {
-      "epoch": 28.065934065934066,
-      "grad_norm": 95.40010070800781,
-      "learning_rate": 9.6675e-06,
-      "loss": 35.9095,
-      "step": 1290
-    },
-    {
-      "epoch": 28.175824175824175,
-      "grad_norm": 90.59423828125,
-      "learning_rate": 9.705000000000001e-06,
-      "loss": 26.6954,
-      "step": 1295
-    },
-    {
-      "epoch": 28.285714285714285,
-      "grad_norm": 91.06011199951172,
-      "learning_rate": 9.7425e-06,
-      "loss": 26.3503,
-      "step": 1300
-    },
-    {
-      "epoch": 28.395604395604394,
-      "grad_norm": 88.42830657958984,
-      "learning_rate": 9.780000000000001e-06,
-      "loss": 26.0797,
-      "step": 1305
-    },
-    {
-      "epoch": 28.505494505494504,
-      "grad_norm": 80.00355529785156,
-      "learning_rate": 9.8175e-06,
-      "loss": 25.8798,
-      "step": 1310
-    },
-    {
-      "epoch": 28.615384615384617,
-      "grad_norm": 81.79782104492188,
-      "learning_rate": 9.855000000000001e-06,
-      "loss": 26.4297,
-      "step": 1315
     },
     {
-      "epoch": 28.725274725274726,
-      "grad_norm": 75.8064193725586,
-      "learning_rate": 9.8925e-06,
-      "loss": 26.3461,
-      "step": 1320
     },
     {
-      "epoch": 28.835164835164836,
-      "grad_norm": 85.9918212890625,
-      "learning_rate": 9.930000000000001e-06,
-      "loss": 26.0146,
-      "step": 1325
     },
     {
-      "epoch": 28.945054945054945,
-      "grad_norm": 94.3312759399414,
-      "learning_rate": 9.9675e-06,
-      "loss": 25.2894,
-      "step": 1330
     },
     {
-      "epoch": 29.043956043956044,
-      "grad_norm": 126.92018127441406,
-      "learning_rate": 1.0005000000000002e-05,
-      "loss": 24.2071,
-      "step": 1335
     },
     {
-      "epoch": 29.153846153846153,
-      "grad_norm": 81.69698333740234,
-      "learning_rate": 1.00425e-05,
-      "loss": 25.3439,
-      "step": 1340
     },
     {
-      "epoch": 29.263736263736263,
-      "grad_norm": 74.29949188232422,
-      "learning_rate": 1.008e-05,
-      "loss": 25.7623,
-      "step": 1345
     },
     {
-      "epoch": 29.373626373626372,
-      "grad_norm": 93.50883483886719,
-      "learning_rate": 1.01175e-05,
-      "loss": 25.0202,
-      "step": 1350
     },
     {
-      "epoch": 29.483516483516482,
-      "grad_norm": 90.64075469970703,
-      "learning_rate": 1.0155e-05,
-      "loss": 25.1573,
-      "step": 1355
     },
     {
-      "epoch": 29.593406593406595,
-      "grad_norm": 89.82637786865234,
-      "learning_rate": 1.01925e-05,
-      "loss": 24.7463,
-      "step": 1360
     },
     {
-      "epoch": 29.703296703296704,
-      "grad_norm": 83.58129119873047,
-      "learning_rate": 1.023e-05,
-      "loss": 23.3669,
-      "step": 1365
     },
     {
-      "epoch": 29.813186813186814,
-      "grad_norm": 91.34410095214844,
-      "learning_rate": 1.02675e-05,
-      "loss": 25.5278,
-      "step": 1370
     },
     {
-      "epoch": 29.923076923076923,
-      "grad_norm": 118.92642211914062,
-      "learning_rate": 1.0305e-05,
-      "loss": 24.4363,
-      "step": 1375
     },
     {
-      "epoch": 30.021978021978022,
-      "grad_norm": 75.5335693359375,
-      "learning_rate": 1.03425e-05,
-      "loss": 32.3354,
-      "step": 1380
     },
     {
-      "epoch": 30.13186813186813,
-      "grad_norm": 76.67096710205078,
-      "learning_rate": 1.0379999999999999e-05,
-      "loss": 24.7026,
-      "step": 1385
     },
     {
-      "epoch": 30.24175824175824,
-      "grad_norm": 76.3076400756836,
-      "learning_rate": 1.04175e-05,
-      "loss": 23.3343,
-      "step": 1390
     },
     {
-      "epoch": 30.35164835164835,
-      "grad_norm": 76.03828430175781,
-      "learning_rate": 1.0454999999999999e-05,
-      "loss": 23.62,
-      "step": 1395
     },
     {
-      "epoch": 30.46153846153846,
-      "grad_norm": 86.96392059326172,
-      "learning_rate": 1.04925e-05,
-      "loss": 23.4644,
-      "step": 1400
     },
     {
-      "epoch": 30.571428571428573,
-      "grad_norm": 76.41240692138672,
-      "learning_rate": 1.0529999999999999e-05,
-      "loss": 22.895,
-      "step": 1405
     },
     {
-      "epoch": 30.681318681318682,
-      "grad_norm": 67.73085021972656,
-      "learning_rate": 1.05675e-05,
-      "loss": 23.6923,
-      "step": 1410
     },
     {
-      "epoch": 30.791208791208792,
-      "grad_norm": 141.68394470214844,
-      "learning_rate": 1.0605e-05,
-      "loss": 22.9296,
-      "step": 1415
     },
     {
-      "epoch": 30.9010989010989,
-      "grad_norm": 102.01689910888672,
-      "learning_rate": 1.06425e-05,
-      "loss": 22.6854,
-      "step": 1420
     },
     {
-      "epoch": 31.0,
-      "grad_norm": 252.77476501464844,
-      "learning_rate": 1.068e-05,
-      "loss": 24.2562,
-      "step": 1425
     },
     {
-      "epoch": 31.10989010989011,
-      "grad_norm": 79.69669342041016,
-      "learning_rate": 1.07175e-05,
-      "loss": 22.6885,
-      "step": 1430
     },
     {
-      "epoch": 31.21978021978022,
-      "grad_norm": 73.90184783935547,
-      "learning_rate": 1.0755e-05,
-      "loss": 21.5442,
-      "step": 1435
     },
     {
-      "epoch": 31.32967032967033,
-      "grad_norm": 100.4890365600586,
-      "learning_rate": 1.07925e-05,
-      "loss": 22.8439,
-      "step": 1440
     },
     {
-      "epoch": 31.439560439560438,
-      "grad_norm": 73.010009765625,
-      "learning_rate": 1.083e-05,
-      "loss": 22.7848,
-      "step": 1445
     },
     {
-      "epoch": 31.54945054945055,
-      "grad_norm": 107.31619262695312,
       "learning_rate": 1.08675e-05,
-      "loss": 22.1164,
       "step": 1450
     },
     {
-      "epoch": 31.65934065934066,
-      "grad_norm": 72.39157104492188,
-      "learning_rate": 1.0905e-05,
-      "loss": 21.4437,
-      "step": 1455
-    },
-    {
-      "epoch": 31.76923076923077,
-      "grad_norm": 79.01976013183594,
-      "learning_rate": 1.09425e-05,
-      "loss": 22.4324,
-      "step": 1460
-    },
-    {
-      "epoch": 31.87912087912088,
-      "grad_norm": 94.18357849121094,
-      "learning_rate": 1.098e-05,
-      "loss": 23.0076,
-      "step": 1465
-    },
-    {
-      "epoch": 31.98901098901099,
-      "grad_norm": 76.7142333984375,
-      "learning_rate": 1.10175e-05,
-      "loss": 21.8061,
-      "step": 1470
-    },
-    {
-      "epoch": 32.08791208791209,
-      "grad_norm": 65.66316986083984,
-      "learning_rate": 1.1055e-05,
-      "loss": 22.7557,
-      "step": 1475
-    },
-    {
-      "epoch": 32.1978021978022,
-      "grad_norm": 85.76339721679688,
-      "learning_rate": 1.1092500000000001e-05,
-      "loss": 20.9964,
-      "step": 1480
-    },
-    {
-      "epoch": 32.30769230769231,
-      "grad_norm": 84.71622467041016,
-      "learning_rate": 1.113e-05,
-      "loss": 21.0421,
-      "step": 1485
-    },
-    {
-      "epoch": 32.417582417582416,
-      "grad_norm": 94.56781768798828,
-      "learning_rate": 1.1167500000000001e-05,
-      "loss": 21.6047,
-      "step": 1490
-    },
-    {
-      "epoch": 32.527472527472526,
-      "grad_norm": 71.27996063232422,
-      "learning_rate": 1.1205e-05,
-      "loss": 21.281,
-      "step": 1495
-    },
-    {
-      "epoch": 32.637362637362635,
-      "grad_norm": 58.75380325317383,
       "learning_rate": 1.1242500000000001e-05,
-      "loss": 20.1862,
       "step": 1500
     },
     {
-      "epoch": 32.747252747252745,
-      "grad_norm": 73.71588134765625,
-      "learning_rate": 1.128e-05,
-      "loss": 20.9145,
-      "step": 1505
-    },
-    {
-      "epoch": 32.857142857142854,
-      "grad_norm": 109.56021881103516,
-      "learning_rate": 1.13175e-05,
-      "loss": 21.6375,
-      "step": 1510
-    },
-    {
-      "epoch": 32.967032967032964,
-      "grad_norm": 63.0958251953125,
-      "learning_rate": 1.1355e-05,
-      "loss": 21.3934,
-      "step": 1515
-    },
-    {
-      "epoch": 33.065934065934066,
-      "grad_norm": 72.25486755371094,
-      "learning_rate": 1.13925e-05,
-      "loss": 20.5177,
-      "step": 1520
-    },
-    {
-      "epoch": 33.175824175824175,
-      "grad_norm": 79.46709442138672,
-      "learning_rate": 1.143e-05,
-      "loss": 20.1796,
-      "step": 1525
-    },
-    {
-      "epoch": 33.285714285714285,
-      "grad_norm": 80.37206268310547,
-      "learning_rate": 1.14675e-05,
-      "loss": 20.519,
-      "step": 1530
-    },
-    {
-      "epoch": 33.395604395604394,
-      "grad_norm": 88.30254364013672,
-      "learning_rate": 1.1505e-05,
-      "loss": 20.5289,
-      "step": 1535
-    },
-    {
-      "epoch": 33.505494505494504,
-      "grad_norm": 59.99192428588867,
-      "learning_rate": 1.15425e-05,
-      "loss": 20.5064,
-      "step": 1540
-    },
-    {
-      "epoch": 33.61538461538461,
-      "grad_norm": 71.29468536376953,
-      "learning_rate": 1.1580000000000001e-05,
-      "loss": 19.2822,
-      "step": 1545
-    },
-    {
-      "epoch": 33.72527472527472,
-      "grad_norm": 67.42713928222656,
       "learning_rate": 1.16175e-05,
-      "loss": 20.9368,
       "step": 1550
     },
     {
-      "epoch": 33.83516483516483,
-      "grad_norm": 78.21910858154297,
-      "learning_rate": 1.1655000000000001e-05,
-      "loss": 19.9386,
-      "step": 1555
-    },
-    {
-      "epoch": 33.94505494505494,
-      "grad_norm": 133.632080078125,
-      "learning_rate": 1.16925e-05,
-      "loss": 20.3614,
-      "step": 1560
-    },
-    {
-      "epoch": 34.043956043956044,
-      "grad_norm": 56.151954650878906,
-      "learning_rate": 1.1730000000000001e-05,
-      "loss": 18.905,
-      "step": 1565
-    },
-    {
-      "epoch": 34.15384615384615,
-      "grad_norm": 72.71345520019531,
-      "learning_rate": 1.17675e-05,
-      "loss": 20.4241,
-      "step": 1570
-    },
-    {
-      "epoch": 34.26373626373626,
-      "grad_norm": 74.98194122314453,
-      "learning_rate": 1.1805000000000001e-05,
-      "loss": 19.3519,
-      "step": 1575
-    },
-    {
-      "epoch": 34.37362637362637,
-      "grad_norm": 67.34236145019531,
-      "learning_rate": 1.18425e-05,
-      "loss": 18.7936,
-      "step": 1580
-    },
-    {
-      "epoch": 34.48351648351648,
-      "grad_norm": 72.32279205322266,
-      "learning_rate": 1.1880000000000001e-05,
-      "loss": 20.0671,
-      "step": 1585
-    },
-    {
-      "epoch": 34.59340659340659,
-      "grad_norm": 54.22994613647461,
-      "learning_rate": 1.19175e-05,
-      "loss": 19.3878,
-      "step": 1590
-    },
-    {
-      "epoch": 34.7032967032967,
-      "grad_norm": 72.89130401611328,
-      "learning_rate": 1.1955000000000002e-05,
-      "loss": 19.4714,
-      "step": 1595
-    },
-    {
-      "epoch": 34.81318681318681,
-      "grad_norm": 65.4543685913086,
       "learning_rate": 1.19925e-05,
-      "loss": 20.4795,
       "step": 1600
     },
     {
-      "epoch": 34.92307692307692,
-      "grad_norm": 65.6441650390625,
-      "learning_rate": 1.2030000000000002e-05,
-      "loss": 18.6467,
-      "step": 1605
-    },
-    {
-      "epoch": 35.02197802197802,
-      "grad_norm": 82.81990051269531,
-      "learning_rate": 1.2067500000000001e-05,
-      "loss": 26.0395,
-      "step": 1610
-    },
-    {
-      "epoch": 35.13186813186813,
-      "grad_norm": 62.68242645263672,
-      "learning_rate": 1.2105000000000002e-05,
-      "loss": 19.2074,
-      "step": 1615
-    },
-    {
-      "epoch": 35.24175824175824,
-      "grad_norm": 79.71977996826172,
-      "learning_rate": 1.2142500000000001e-05,
-      "loss": 20.164,
-      "step": 1620
-    },
-    {
-      "epoch": 35.35164835164835,
-      "grad_norm": 82.37974548339844,
-      "learning_rate": 1.2180000000000002e-05,
-      "loss": 18.7425,
-      "step": 1625
-    },
-    {
-      "epoch": 35.46153846153846,
-      "grad_norm": 68.9244155883789,
-      "learning_rate": 1.22175e-05,
-      "loss": 18.8626,
-      "step": 1630
-    },
-    {
-      "epoch": 35.57142857142857,
-      "grad_norm": 72.45255279541016,
-      "learning_rate": 1.2254999999999999e-05,
-      "loss": 18.4994,
-      "step": 1635
-    },
-    {
-      "epoch": 35.68131868131868,
-      "grad_norm": 76.19093322753906,
-      "learning_rate": 1.22925e-05,
-      "loss": 18.911,
-      "step": 1640
-    },
-    {
-      "epoch": 35.79120879120879,
-      "grad_norm": 50.99027633666992,
-      "learning_rate": 1.2329999999999999e-05,
-      "loss": 18.2682,
-      "step": 1645
-    },
-    {
-      "epoch": 35.9010989010989,
-      "grad_norm": 79.6258316040039,
       "learning_rate": 1.23675e-05,
-      "loss": 19.1667,
       "step": 1650
     },
     {
-      "epoch": 36.0,
-      "grad_norm": 198.5669403076172,
-      "learning_rate": 1.2404999999999999e-05,
-      "loss": 18.7113,
-      "step": 1655
-    },
-    {
-      "epoch": 36.10989010989011,
-      "grad_norm": 52.13261032104492,
-      "learning_rate": 1.24425e-05,
-      "loss": 17.3056,
-      "step": 1660
-    },
-    {
-      "epoch": 36.21978021978022,
-      "grad_norm": 64.57988739013672,
-      "learning_rate": 1.2479999999999999e-05,
-      "loss": 17.8365,
-      "step": 1665
-    },
-    {
-      "epoch": 36.32967032967033,
-      "grad_norm": 111.00614929199219,
-      "learning_rate": 1.25175e-05,
-      "loss": 18.5584,
-      "step": 1670
-    },
-    {
-      "epoch": 36.43956043956044,
-      "grad_norm": 77.30622863769531,
-      "learning_rate": 1.2555e-05,
-      "loss": 18.0333,
-      "step": 1675
-    },
-    {
-      "epoch": 36.54945054945055,
-      "grad_norm": 60.33699417114258,
-      "learning_rate": 1.25925e-05,
-      "loss": 18.5811,
-      "step": 1680
-    },
-    {
-      "epoch": 36.65934065934066,
-      "grad_norm": 84.54650115966797,
-      "learning_rate": 1.263e-05,
-      "loss": 18.7701,
-      "step": 1685
-    },
-    {
-      "epoch": 36.76923076923077,
-      "grad_norm": 73.11846923828125,
-      "learning_rate": 1.26675e-05,
-      "loss": 18.816,
-      "step": 1690
-    },
-    {
-      "epoch": 36.879120879120876,
-      "grad_norm": 61.516761779785156,
-      "learning_rate": 1.2705e-05,
-      "loss": 17.5013,
-      "step": 1695
-    },
-    {
-      "epoch": 36.98901098901099,
-      "grad_norm": 74.83867645263672,
       "learning_rate": 1.27425e-05,
-      "loss": 17.6083,
       "step": 1700
     },
     {
-      "epoch": 37.08791208791209,
-      "grad_norm": 69.51549530029297,
-      "learning_rate": 1.278e-05,
-      "loss": 20.9143,
-      "step": 1705
-    },
-    {
-      "epoch": 37.1978021978022,
-      "grad_norm": 59.48684310913086,
-      "learning_rate": 1.28175e-05,
-      "loss": 17.725,
-      "step": 1710
-    },
-    {
-      "epoch": 37.30769230769231,
-      "grad_norm": 71.78469848632812,
-      "learning_rate": 1.2855e-05,
-      "loss": 17.7316,
-      "step": 1715
-    },
-    {
-      "epoch": 37.417582417582416,
-      "grad_norm": 82.73149871826172,
-      "learning_rate": 1.28925e-05,
-      "loss": 17.8607,
-      "step": 1720
-    },
-    {
-      "epoch": 37.527472527472526,
-      "grad_norm": 104.99292755126953,
-      "learning_rate": 1.293e-05,
-      "loss": 17.7329,
-      "step": 1725
-    },
-    {
-      "epoch": 37.637362637362635,
-      "grad_norm": 56.97990417480469,
-      "learning_rate": 1.29675e-05,
-      "loss": 17.7514,
-      "step": 1730
-    },
-    {
-      "epoch": 37.747252747252745,
-      "grad_norm": 76.46744537353516,
-      "learning_rate": 1.3005e-05,
-      "loss": 17.467,
-      "step": 1735
-    },
-    {
-      "epoch": 37.857142857142854,
-      "grad_norm": 67.00051879882812,
-      "learning_rate": 1.3042500000000001e-05,
-      "loss": 17.9124,
-      "step": 1740
-    },
-    {
-      "epoch": 37.967032967032964,
-      "grad_norm": 73.53226470947266,
-      "learning_rate": 1.308e-05,
-      "loss": 17.2669,
-      "step": 1745
-    },
-    {
-      "epoch": 38.065934065934066,
-      "grad_norm": 58.128150939941406,
       "learning_rate": 1.3117500000000001e-05,
-      "loss": 18.5807,
       "step": 1750
     },
     {
-      "epoch": 38.175824175824175,
-      "grad_norm": 66.3156509399414,
-      "learning_rate": 1.3155e-05,
-      "loss": 17.0461,
-      "step": 1755
-    },
-    {
-      "epoch": 38.285714285714285,
-      "grad_norm": 64.85994720458984,
-      "learning_rate": 1.31925e-05,
-      "loss": 16.7421,
-      "step": 1760
-    },
-    {
-      "epoch": 38.395604395604394,
-      "grad_norm": 65.9932861328125,
-      "learning_rate": 1.323e-05,
-      "loss": 16.7541,
-      "step": 1765
-    },
-    {
-      "epoch": 38.505494505494504,
-      "grad_norm": 60.61685562133789,
-      "learning_rate": 1.32675e-05,
-      "loss": 17.6106,
-      "step": 1770
-    },
-    {
-      "epoch": 38.61538461538461,
-      "grad_norm": 67.54483032226562,
-      "learning_rate": 1.3305e-05,
-      "loss": 16.9768,
-      "step": 1775
-    },
-    {
-      "epoch": 38.72527472527472,
-      "grad_norm": 74.80374908447266,
-      "learning_rate": 1.33425e-05,
-      "loss": 16.6762,
-      "step": 1780
-    },
-    {
-      "epoch": 38.83516483516483,
-      "grad_norm": 50.045692443847656,
-      "learning_rate": 1.338e-05,
-      "loss": 16.3018,
-      "step": 1785
-    },
-    {
-      "epoch": 38.94505494505494,
-      "grad_norm": 75.77816009521484,
-      "learning_rate": 1.34175e-05,
-      "loss": 17.1515,
-      "step": 1790
-    },
-    {
-      "epoch": 39.043956043956044,
-      "grad_norm": 48.32164764404297,
-      "learning_rate": 1.3455e-05,
-      "loss": 15.5191,
-      "step": 1795
-    },
-    {
-      "epoch": 39.15384615384615,
-      "grad_norm": 69.80590057373047,
       "learning_rate": 1.34925e-05,
-      "loss": 16.7698,
       "step": 1800
     },
     {
-      "epoch": 39.26373626373626,
-      "grad_norm": 58.667015075683594,
-      "learning_rate": 1.3530000000000001e-05,
-      "loss": 16.3254,
-      "step": 1805
-    },
-    {
-      "epoch": 39.37362637362637,
-      "grad_norm": 52.71421432495117,
-      "learning_rate": 1.35675e-05,
-      "loss": 16.1323,
-      "step": 1810
-    },
-    {
-      "epoch": 39.48351648351648,
-      "grad_norm": 68.22982025146484,
-      "learning_rate": 1.3605000000000001e-05,
-      "loss": 16.3548,
-      "step": 1815
-    },
-    {
-      "epoch": 39.59340659340659,
-      "grad_norm": 82.97122192382812,
-      "learning_rate": 1.36425e-05,
-      "loss": 16.154,
-      "step": 1820
-    },
-    {
-      "epoch": 39.7032967032967,
-      "grad_norm": 78.51126861572266,
-      "learning_rate": 1.3680000000000001e-05,
-      "loss": 16.1856,
-      "step": 1825
-    },
-    {
-      "epoch": 39.81318681318681,
-      "grad_norm": 84.22386932373047,
-      "learning_rate": 1.37175e-05,
-      "loss": 16.1977,
-      "step": 1830
-    },
-    {
-      "epoch": 39.92307692307692,
-      "grad_norm": 72.61677551269531,
-      "learning_rate": 1.3755000000000001e-05,
-      "loss": 15.5414,
-      "step": 1835
-    },
-    {
-      "epoch": 40.02197802197802,
-      "grad_norm": 60.4188117980957,
-      "learning_rate": 1.37925e-05,
-      "loss": 17.9055,
-      "step": 1840
-    },
-    {
-      "epoch": 40.13186813186813,
-      "grad_norm": 55.08159637451172,
-      "learning_rate": 1.3830000000000001e-05,
-      "loss": 15.7148,
-      "step": 1845
-    },
-    {
-      "epoch": 40.24175824175824,
-      "grad_norm": 58.98347091674805,
       "learning_rate": 1.38675e-05,
-      "loss": 16.586,
       "step": 1850
     },
     {
-      "epoch": 40.35164835164835,
-      "grad_norm": 59.37347412109375,
-      "learning_rate": 1.3905000000000002e-05,
-      "loss": 15.9047,
-      "step": 1855
-    },
-    {
-      "epoch": 40.46153846153846,
-      "grad_norm": 64.99933624267578,
-      "learning_rate": 1.39425e-05,
-      "loss": 15.9342,
-      "step": 1860
-    },
-    {
-      "epoch": 40.57142857142857,
-      "grad_norm": 63.66476058959961,
-      "learning_rate": 1.3980000000000002e-05,
-      "loss": 15.6676,
-      "step": 1865
-    },
-    {
-      "epoch": 40.68131868131868,
-      "grad_norm": 62.42832565307617,
-      "learning_rate": 1.4017500000000001e-05,
-      "loss": 16.0192,
-      "step": 1870
-    },
-    {
-      "epoch": 40.79120879120879,
-      "grad_norm": 64.26334381103516,
-      "learning_rate": 1.4055000000000002e-05,
-      "loss": 15.4637,
-      "step": 1875
-    },
-    {
-      "epoch": 40.9010989010989,
-      "grad_norm": 53.68131637573242,
-      "learning_rate": 1.4092500000000001e-05,
-      "loss": 15.4847,
-      "step": 1880
-    },
-    {
-      "epoch": 41.0,
-      "grad_norm": 133.40896606445312,
-      "learning_rate": 1.413e-05,
-      "loss": 15.1,
-      "step": 1885
-    },
-    {
-      "epoch": 41.10989010989011,
-      "grad_norm": 64.47783660888672,
-      "learning_rate": 1.4167500000000001e-05,
-      "loss": 15.8261,
-      "step": 1890
-    },
-    {
-      "epoch": 41.21978021978022,
-      "grad_norm": 53.95432662963867,
-      "learning_rate": 1.4205e-05,
-      "loss": 15.3971,
-      "step": 1895
-    },
-    {
-      "epoch": 41.32967032967033,
-      "grad_norm": 60.42184829711914,
       "learning_rate": 1.4242500000000001e-05,
-      "loss": 15.6888,
       "step": 1900
     },
     {
-      "epoch": 41.43956043956044,
-      "grad_norm": 65.85467529296875,
-      "learning_rate": 1.428e-05,
-      "loss": 16.5622,
-      "step": 1905
-    },
-    {
-      "epoch": 41.54945054945055,
-      "grad_norm": 71.76943969726562,
-      "learning_rate": 1.4317500000000001e-05,
-      "loss": 15.9214,
-      "step": 1910
-    },
-    {
-      "epoch": 41.65934065934066,
-      "grad_norm": 56.51178741455078,
-      "learning_rate": 1.4355e-05,
-      "loss": 15.283,
-      "step": 1915
-    },
-    {
-      "epoch": 41.76923076923077,
-      "grad_norm": 77.75347900390625,
-      "learning_rate": 1.43925e-05,
-      "loss": 15.6863,
-      "step": 1920
-    },
-    {
-      "epoch": 41.879120879120876,
-      "grad_norm": 69.0566635131836,
-      "learning_rate": 1.4429999999999999e-05,
-      "loss": 14.7517,
-      "step": 1925
-    },
-    {
-      "epoch": 41.98901098901099,
-      "grad_norm": 52.313724517822266,
-      "learning_rate": 1.44675e-05,
-      "loss": 14.5892,
-      "step": 1930
-    },
-    {
-      "epoch": 42.08791208791209,
-      "grad_norm": 46.05543518066406,
-      "learning_rate": 1.4505e-05,
-      "loss": 17.2186,
-      "step": 1935
-    },
-    {
-      "epoch": 42.1978021978022,
-      "grad_norm": 60.843135833740234,
-      "learning_rate": 1.45425e-05,
-      "loss": 15.052,
-      "step": 1940
-    },
-    {
-      "epoch": 42.30769230769231,
-      "grad_norm": 69.2349853515625,
-      "learning_rate": 1.458e-05,
-      "loss": 14.8454,
-      "step": 1945
-    },
-    {
-      "epoch": 42.417582417582416,
-      "grad_norm": 63.5760383605957,
       "learning_rate": 1.46175e-05,
-      "loss": 15.1218,
       "step": 1950
     },
     {
-      "epoch": 42.527472527472526,
-      "grad_norm": 61.12398147583008,
-      "learning_rate": 1.4655e-05,
-      "loss": 15.7076,
-      "step": 1955
-    },
-    {
-      "epoch": 42.637362637362635,
-      "grad_norm": 67.10089111328125,
-      "learning_rate": 1.46925e-05,
-      "loss": 14.6074,
-      "step": 1960
-    },
-    {
-      "epoch": 42.747252747252745,
-      "grad_norm": 56.15253448486328,
-      "learning_rate": 1.473e-05,
-      "loss": 14.7393,
-      "step": 1965
-    },
-    {
-      "epoch": 42.857142857142854,
-      "grad_norm": 62.48304748535156,
-      "learning_rate": 1.47675e-05,
-      "loss": 15.0895,
-      "step": 1970
-    },
-    {
-      "epoch": 42.967032967032964,
-      "grad_norm": 64.34083557128906,
-      "learning_rate": 1.4805e-05,
-      "loss": 14.4191,
-      "step": 1975
-    },
-    {
-      "epoch": 43.065934065934066,
-      "grad_norm": 72.964111328125,
-      "learning_rate": 1.48425e-05,
-      "loss": 15.1835,
-      "step": 1980
-    },
-    {
-      "epoch": 43.175824175824175,
-      "grad_norm": 64.65727233886719,
-      "learning_rate": 1.488e-05,
-      "loss": 14.8041,
-      "step": 1985
-    },
-    {
-      "epoch": 43.285714285714285,
-      "grad_norm": 58.626461029052734,
-      "learning_rate": 1.49175e-05,
-      "loss": 15.048,
-      "step": 1990
-    },
-    {
-      "epoch": 43.395604395604394,
-      "grad_norm": 66.43981170654297,
-      "learning_rate": 1.4955e-05,
-      "loss": 14.752,
-      "step": 1995
-    },
-    {
-      "epoch": 43.505494505494504,
-      "grad_norm": 62.531463623046875,
       "learning_rate": 1.4992500000000001e-05,
-      "loss": 14.5582,
       "step": 2000
     }
   ],
-  "logging_steps": 5,
-  "max_steps": 2300,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 50,
-  "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -2827,7 +307,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 5.300275357458432e+19,
   "train_batch_size": 24,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 44.44444444444444,
   "eval_steps": 500,
   "global_step": 2000,
   "is_hyper_param_search": false,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 3663.33642578125,
       "learning_rate": 3.6750000000000003e-07,
+      "loss": 1356.1239,
       "step": 50
     },
     {
+      "epoch": 2.2222222222222223,
+      "grad_norm": 3973.032470703125,
+      "learning_rate": 7.425000000000001e-07,
+      "loss": 1275.5178,
+      "step": 100
     },
     {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 2667.0068359375,
+      "learning_rate": 1.1174999999999999e-06,
+      "loss": 1123.6059,
+      "step": 150
     },
     {
+      "epoch": 4.444444444444445,
+      "grad_norm": 2320.48486328125,
+      "learning_rate": 1.4925000000000001e-06,
+      "loss": 922.8917,
+      "step": 200
     },
     {
+      "epoch": 5.555555555555555,
+      "grad_norm": 1611.3345947265625,
+      "learning_rate": 1.8675000000000001e-06,
+      "loss": 714.5638,
+      "step": 250
     },
     {
+      "epoch": 6.666666666666667,
+      "grad_norm": 1395.7064208984375,
+      "learning_rate": 2.2425e-06,
+      "loss": 542.1251,
+      "step": 300
     },
     {
+      "epoch": 7.777777777777778,
+      "grad_norm": 1019.4860229492188,
+      "learning_rate": 2.6175e-06,
+      "loss": 411.1387,
+      "step": 350
     },
     {
+      "epoch": 8.88888888888889,
+      "grad_norm": 888.315185546875,
+      "learning_rate": 2.9925e-06,
+      "loss": 318.4567,
+      "step": 400
     },
     {
+      "epoch": 10.0,
+      "grad_norm": 2590.113525390625,
+      "learning_rate": 3.3675000000000004e-06,
+      "loss": 261.6995,
+      "step": 450
     },
     {
+      "epoch": 11.11111111111111,
+      "grad_norm": 711.677734375,
+      "learning_rate": 3.7425e-06,
+      "loss": 220.2936,
+      "step": 500
     },
     {
+      "epoch": 12.222222222222221,
+      "grad_norm": 548.9371948242188,
+      "learning_rate": 4.117500000000001e-06,
+      "loss": 187.9833,
+      "step": 550
     },
     {
+      "epoch": 13.333333333333334,
+      "grad_norm": 1127.1611328125,
+      "learning_rate": 4.4925e-06,
+      "loss": 159.1351,
+      "step": 600
     },
     {
+      "epoch": 14.444444444444445,
+      "grad_norm": 426.074951171875,
+      "learning_rate": 4.8675e-06,
+      "loss": 137.1092,
+      "step": 650
     },
     {
+      "epoch": 15.555555555555555,
+      "grad_norm": 348.842529296875,
+      "learning_rate": 5.2425e-06,
+      "loss": 119.822,
+      "step": 700
     },
     {
+      "epoch": 16.666666666666668,
+      "grad_norm": 373.2169189453125,
+      "learning_rate": 5.6175e-06,
+      "loss": 104.3366,
+      "step": 750
     },
     {
+      "epoch": 17.77777777777778,
+      "grad_norm": 324.51702880859375,
+      "learning_rate": 5.992500000000001e-06,
+      "loss": 90.8788,
+      "step": 800
     },
     {
+      "epoch": 18.88888888888889,
+      "grad_norm": 269.91827392578125,
+      "learning_rate": 6.3675e-06,
+      "loss": 78.4644,
+      "step": 850
     },
     {
+      "epoch": 20.0,
+      "grad_norm": 1744.54052734375,
+      "learning_rate": 6.7425e-06,
+      "loss": 70.3526,
+      "step": 900
     },
     {
+      "epoch": 21.11111111111111,
+      "grad_norm": 369.39837646484375,
+      "learning_rate": 7.1175e-06,
+      "loss": 63.9417,
+      "step": 950
     },
     {
+      "epoch": 22.22222222222222,
+      "grad_norm": 303.95977783203125,
+      "learning_rate": 7.4925e-06,
+      "loss": 63.4575,
+      "step": 1000
     },
     {
+      "epoch": 23.333333333333332,
+      "grad_norm": 187.462890625,
+      "learning_rate": 7.8675e-06,
+      "loss": 54.7417,
+      "step": 1050
     },
     {
+      "epoch": 24.444444444444443,
+      "grad_norm": 165.56666564941406,
+      "learning_rate": 8.2425e-06,
+      "loss": 49.6842,
+      "step": 1100
     },
     {
+      "epoch": 25.555555555555557,
+      "grad_norm": 147.3148193359375,
+      "learning_rate": 8.6175e-06,
+      "loss": 43.027,
+      "step": 1150
     },
     {
+      "epoch": 26.666666666666668,
+      "grad_norm": 125.53775024414062,
+      "learning_rate": 8.9925e-06,
+      "loss": 38.2579,
+      "step": 1200
     },
     {
+      "epoch": 27.77777777777778,
+      "grad_norm": 109.85810089111328,
+      "learning_rate": 9.367500000000001e-06,
+      "loss": 34.3957,
+      "step": 1250
     },
     {
+      "epoch": 28.88888888888889,
+      "grad_norm": 98.328369140625,
+      "learning_rate": 9.7425e-06,
+      "loss": 31.4378,
+      "step": 1300
     },
     {
+      "epoch": 30.0,
+      "grad_norm": 109.77750396728516,
+      "learning_rate": 1.01175e-05,
+      "loss": 28.5084,
+      "step": 1350
     },
     {
+      "epoch": 31.11111111111111,
+      "grad_norm": 112.47483825683594,
+      "learning_rate": 1.04925e-05,
+      "loss": 26.1671,
+      "step": 1400
     },
     {
+      "epoch": 32.22222222222222,
+      "grad_norm": 85.60242462158203,
       "learning_rate": 1.08675e-05,
+      "loss": 24.2309,
       "step": 1450
     },
     {
+      "epoch": 33.333333333333336,
+      "grad_norm": 73.19799041748047,
       "learning_rate": 1.1242500000000001e-05,
+      "loss": 22.6248,
       "step": 1500
     },
     {
+      "epoch": 34.44444444444444,
+      "grad_norm": 80.70884704589844,
       "learning_rate": 1.16175e-05,
+      "loss": 21.1187,
       "step": 1550
     },
     {
+      "epoch": 35.55555555555556,
+      "grad_norm": 110.98326110839844,
       "learning_rate": 1.19925e-05,
+      "loss": 20.4828,
       "step": 1600
     },
     {
+      "epoch": 36.666666666666664,
+      "grad_norm": 113.65286254882812,
       "learning_rate": 1.23675e-05,
+      "loss": 19.7366,
       "step": 1650
     },
     {
+      "epoch": 37.77777777777778,
+      "grad_norm": 77.65855407714844,
       "learning_rate": 1.27425e-05,
+      "loss": 18.6632,
       "step": 1700
     },
     {
+      "epoch": 38.888888888888886,
+      "grad_norm": 88.96723175048828,
       "learning_rate": 1.3117500000000001e-05,
+      "loss": 18.0793,
       "step": 1750
     },
     {
+      "epoch": 40.0,
+      "grad_norm": 79.1690902709961,
       "learning_rate": 1.34925e-05,
+      "loss": 17.0667,
       "step": 1800
     },
     {
+      "epoch": 41.111111111111114,
+      "grad_norm": 93.60108184814453,
       "learning_rate": 1.38675e-05,
+      "loss": 16.6106,
       "step": 1850
     },
     {
+      "epoch": 42.22222222222222,
+      "grad_norm": 66.16129302978516,
       "learning_rate": 1.4242500000000001e-05,
+      "loss": 16.4905,
       "step": 1900
     },
     {
+      "epoch": 43.333333333333336,
+      "grad_norm": 62.41362380981445,
       "learning_rate": 1.46175e-05,
+      "loss": 15.6427,
       "step": 1950
     },
     {
+      "epoch": 44.44444444444444,
+      "grad_norm": 107.30168151855469,
       "learning_rate": 1.4992500000000001e-05,
+      "loss": 15.0731,
       "step": 2000
     }
   ],
+  "logging_steps": 50,
+  "max_steps": 2250,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 50,
+  "save_steps": 200,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
       "attributes": {}
     }
   },
+  "total_flos": 5.387467985017897e+19,
   "train_batch_size": 24,
   "trial_name": null,
   "trial_params": null

checkpoint-2000/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e1ce90134704b5a6d94e1dd2b2ac60499368f272f8eda658e4e1ca0663c44cd
 size 5368

 version https://git-lfs.github.com/spec/v1
+oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
 size 5368

checkpoint-2200/config.json ADDED Viewed

	@@ -0,0 +1,129 @@

+{
+  "activation_dropout": 0.0,
+  "activation_function": "silu",
+  "anchor_image_size": null,
+  "architectures": [
+    "RTDetrV2ForObjectDetection"
+  ],
+  "attention_dropout": 0.0,
+  "auxiliary_loss": true,
+  "backbone": null,
+  "backbone_config": {
+    "depths": [
+      3,
+      4,
+      23,
+      3
+    ],
+    "downsample_in_bottleneck": false,
+    "downsample_in_first_stage": false,
+    "embedding_size": 64,
+    "hidden_act": "relu",
+    "hidden_sizes": [
+      256,
+      512,
+      1024,
+      2048
+    ],
+    "layer_type": "bottleneck",
+    "model_type": "rt_detr_resnet",
+    "num_channels": 3,
+    "out_features": [
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "out_indices": [
+      2,
+      3,
+      4
+    ],
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "torch_dtype": "float32"
+  },
+  "backbone_kwargs": null,
+  "batch_norm_eps": 1e-05,
+  "box_noise_scale": 1.0,
+  "d_model": 256,
+  "decoder_activation_function": "relu",
+  "decoder_attention_heads": 8,
+  "decoder_ffn_dim": 1024,
+  "decoder_in_channels": [
+    384,
+    384,
+    384
+  ],
+  "decoder_layers": 6,
+  "decoder_method": "default",
+  "decoder_n_levels": 3,
+  "decoder_n_points": 4,
+  "decoder_offset_scale": 0.5,
+  "disable_custom_kernels": true,
+  "dropout": 0.0,
+  "encode_proj_layers": [
+    2
+  ],
+  "encoder_activation_function": "gelu",
+  "encoder_attention_heads": 8,
+  "encoder_ffn_dim": 2048,
+  "encoder_hidden_dim": 384,
+  "encoder_in_channels": [
+    512,
+    1024,
+    2048
+  ],
+  "encoder_layers": 1,
+  "eos_coefficient": 0.0001,
+  "eval_size": null,
+  "feat_strides": [
+    8,
+    16,
+    32
+  ],
+  "focal_loss_alpha": 0.75,
+  "focal_loss_gamma": 2.0,
+  "freeze_backbone_batch_norms": true,
+  "hidden_expansion": 1.0,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "initializer_bias_prior_prob": null,
+  "initializer_range": 0.01,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "label_noise_ratio": 0.5,
+  "layer_norm_eps": 1e-05,
+  "learn_initial_query": false,
+  "matcher_alpha": 0.25,
+  "matcher_bbox_cost": 5.0,
+  "matcher_class_cost": 2.0,
+  "matcher_gamma": 2.0,
+  "matcher_giou_cost": 2.0,
+  "model_type": "rt_detr_v2",
+  "normalize_before": false,
+  "num_denoising": 100,
+  "num_feature_levels": 3,
+  "num_queries": 300,
+  "positional_encoding_temperature": 10000,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.0",
+  "use_focal_loss": true,
+  "use_pretrained_backbone": false,
+  "use_timm_backbone": false,
+  "weight_loss_bbox": 5.0,
+  "weight_loss_giou": 2.0,
+  "weight_loss_vfl": 1.0,
+  "with_box_refine": true
+}

checkpoint-2200/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2cb217c7b7445eb9473cddef9f4ea779004ca224f4615b6f69aa70c7cc0b781f
+size 306699044

checkpoint-2200/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ca9f9b9c554485c8d65a25449ddeb6582be923a8692ab873d9c1e6a21062298
+size 611580433

checkpoint-2200/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "do_convert_annotations": true,
+  "do_normalize": false,
+  "do_pad": false,
+  "do_rescale": true,
+  "do_resize": true,
+  "format": "coco_detection",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "RTDetrImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "pad_size": null,
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 640,
+    "width": 640
+  }
+}

checkpoint-2200/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:90a75f76146ccf9511e324998dc7ec6df7764e7adf2655174e5f732a90e23392
+size 14244

checkpoint-2200/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4be819b970a626d9fdb18122878b575ce46f2fa37fc360325a23e4cfdc87bcd1
+size 1064

checkpoint-2200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,342 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 48.888888888888886,
+  "eval_steps": 500,
+  "global_step": 2200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 3663.33642578125,
+      "learning_rate": 3.6750000000000003e-07,
+      "loss": 1356.1239,
+      "step": 50
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": 3973.032470703125,
+      "learning_rate": 7.425000000000001e-07,
+      "loss": 1275.5178,
+      "step": 100
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 2667.0068359375,
+      "learning_rate": 1.1174999999999999e-06,
+      "loss": 1123.6059,
+      "step": 150
+    },
+    {
+      "epoch": 4.444444444444445,
+      "grad_norm": 2320.48486328125,
+      "learning_rate": 1.4925000000000001e-06,
+      "loss": 922.8917,
+      "step": 200
+    },
+    {
+      "epoch": 5.555555555555555,
+      "grad_norm": 1611.3345947265625,
+      "learning_rate": 1.8675000000000001e-06,
+      "loss": 714.5638,
+      "step": 250
+    },
+    {
+      "epoch": 6.666666666666667,
+      "grad_norm": 1395.7064208984375,
+      "learning_rate": 2.2425e-06,
+      "loss": 542.1251,
+      "step": 300
+    },
+    {
+      "epoch": 7.777777777777778,
+      "grad_norm": 1019.4860229492188,
+      "learning_rate": 2.6175e-06,
+      "loss": 411.1387,
+      "step": 350
+    },
+    {
+      "epoch": 8.88888888888889,
+      "grad_norm": 888.315185546875,
+      "learning_rate": 2.9925e-06,
+      "loss": 318.4567,
+      "step": 400
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 2590.113525390625,
+      "learning_rate": 3.3675000000000004e-06,
+      "loss": 261.6995,
+      "step": 450
+    },
+    {
+      "epoch": 11.11111111111111,
+      "grad_norm": 711.677734375,
+      "learning_rate": 3.7425e-06,
+      "loss": 220.2936,
+      "step": 500
+    },
+    {
+      "epoch": 12.222222222222221,
+      "grad_norm": 548.9371948242188,
+      "learning_rate": 4.117500000000001e-06,
+      "loss": 187.9833,
+      "step": 550
+    },
+    {
+      "epoch": 13.333333333333334,
+      "grad_norm": 1127.1611328125,
+      "learning_rate": 4.4925e-06,
+      "loss": 159.1351,
+      "step": 600
+    },
+    {
+      "epoch": 14.444444444444445,
+      "grad_norm": 426.074951171875,
+      "learning_rate": 4.8675e-06,
+      "loss": 137.1092,
+      "step": 650
+    },
+    {
+      "epoch": 15.555555555555555,
+      "grad_norm": 348.842529296875,
+      "learning_rate": 5.2425e-06,
+      "loss": 119.822,
+      "step": 700
+    },
+    {
+      "epoch": 16.666666666666668,
+      "grad_norm": 373.2169189453125,
+      "learning_rate": 5.6175e-06,
+      "loss": 104.3366,
+      "step": 750
+    },
+    {
+      "epoch": 17.77777777777778,
+      "grad_norm": 324.51702880859375,
+      "learning_rate": 5.992500000000001e-06,
+      "loss": 90.8788,
+      "step": 800
+    },
+    {
+      "epoch": 18.88888888888889,
+      "grad_norm": 269.91827392578125,
+      "learning_rate": 6.3675e-06,
+      "loss": 78.4644,
+      "step": 850
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 1744.54052734375,
+      "learning_rate": 6.7425e-06,
+      "loss": 70.3526,
+      "step": 900
+    },
+    {
+      "epoch": 21.11111111111111,
+      "grad_norm": 369.39837646484375,
+      "learning_rate": 7.1175e-06,
+      "loss": 63.9417,
+      "step": 950
+    },
+    {
+      "epoch": 22.22222222222222,
+      "grad_norm": 303.95977783203125,
+      "learning_rate": 7.4925e-06,
+      "loss": 63.4575,
+      "step": 1000
+    },
+    {
+      "epoch": 23.333333333333332,
+      "grad_norm": 187.462890625,
+      "learning_rate": 7.8675e-06,
+      "loss": 54.7417,
+      "step": 1050
+    },
+    {
+      "epoch": 24.444444444444443,
+      "grad_norm": 165.56666564941406,
+      "learning_rate": 8.2425e-06,
+      "loss": 49.6842,
+      "step": 1100
+    },
+    {
+      "epoch": 25.555555555555557,
+      "grad_norm": 147.3148193359375,
+      "learning_rate": 8.6175e-06,
+      "loss": 43.027,
+      "step": 1150
+    },
+    {
+      "epoch": 26.666666666666668,
+      "grad_norm": 125.53775024414062,
+      "learning_rate": 8.9925e-06,
+      "loss": 38.2579,
+      "step": 1200
+    },
+    {
+      "epoch": 27.77777777777778,
+      "grad_norm": 109.85810089111328,
+      "learning_rate": 9.367500000000001e-06,
+      "loss": 34.3957,
+      "step": 1250
+    },
+    {
+      "epoch": 28.88888888888889,
+      "grad_norm": 98.328369140625,
+      "learning_rate": 9.7425e-06,
+      "loss": 31.4378,
+      "step": 1300
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 109.77750396728516,
+      "learning_rate": 1.01175e-05,
+      "loss": 28.5084,
+      "step": 1350
+    },
+    {
+      "epoch": 31.11111111111111,
+      "grad_norm": 112.47483825683594,
+      "learning_rate": 1.04925e-05,
+      "loss": 26.1671,
+      "step": 1400
+    },
+    {
+      "epoch": 32.22222222222222,
+      "grad_norm": 85.60242462158203,
+      "learning_rate": 1.08675e-05,
+      "loss": 24.2309,
+      "step": 1450
+    },
+    {
+      "epoch": 33.333333333333336,
+      "grad_norm": 73.19799041748047,
+      "learning_rate": 1.1242500000000001e-05,
+      "loss": 22.6248,
+      "step": 1500
+    },
+    {
+      "epoch": 34.44444444444444,
+      "grad_norm": 80.70884704589844,
+      "learning_rate": 1.16175e-05,
+      "loss": 21.1187,
+      "step": 1550
+    },
+    {
+      "epoch": 35.55555555555556,
+      "grad_norm": 110.98326110839844,
+      "learning_rate": 1.19925e-05,
+      "loss": 20.4828,
+      "step": 1600
+    },
+    {
+      "epoch": 36.666666666666664,
+      "grad_norm": 113.65286254882812,
+      "learning_rate": 1.23675e-05,
+      "loss": 19.7366,
+      "step": 1650
+    },
+    {
+      "epoch": 37.77777777777778,
+      "grad_norm": 77.65855407714844,
+      "learning_rate": 1.27425e-05,
+      "loss": 18.6632,
+      "step": 1700
+    },
+    {
+      "epoch": 38.888888888888886,
+      "grad_norm": 88.96723175048828,
+      "learning_rate": 1.3117500000000001e-05,
+      "loss": 18.0793,
+      "step": 1750
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 79.1690902709961,
+      "learning_rate": 1.34925e-05,
+      "loss": 17.0667,
+      "step": 1800
+    },
+    {
+      "epoch": 41.111111111111114,
+      "grad_norm": 93.60108184814453,
+      "learning_rate": 1.38675e-05,
+      "loss": 16.6106,
+      "step": 1850
+    },
+    {
+      "epoch": 42.22222222222222,
+      "grad_norm": 66.16129302978516,
+      "learning_rate": 1.4242500000000001e-05,
+      "loss": 16.4905,
+      "step": 1900
+    },
+    {
+      "epoch": 43.333333333333336,
+      "grad_norm": 62.41362380981445,
+      "learning_rate": 1.46175e-05,
+      "loss": 15.6427,
+      "step": 1950
+    },
+    {
+      "epoch": 44.44444444444444,
+      "grad_norm": 107.30168151855469,
+      "learning_rate": 1.4992500000000001e-05,
+      "loss": 15.0731,
+      "step": 2000
+    },
+    {
+      "epoch": 45.55555555555556,
+      "grad_norm": 100.18666076660156,
+      "learning_rate": 1.2060000000000001e-05,
+      "loss": 14.6973,
+      "step": 2050
+    },
+    {
+      "epoch": 46.666666666666664,
+      "grad_norm": 68.78209686279297,
+      "learning_rate": 9.06e-06,
+      "loss": 13.8928,
+      "step": 2100
+    },
+    {
+      "epoch": 47.77777777777778,
+      "grad_norm": 65.85958099365234,
+      "learning_rate": 6.0600000000000004e-06,
+      "loss": 13.4318,
+      "step": 2150
+    },
+    {
+      "epoch": 48.888888888888886,
+      "grad_norm": 72.31432342529297,
+      "learning_rate": 3.06e-06,
+      "loss": 12.6856,
+      "step": 2200
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 2250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.926395026677432e+19,
+  "train_batch_size": 24,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
+size 5368

checkpoint-2250/config.json ADDED Viewed

	@@ -0,0 +1,129 @@

+{
+  "activation_dropout": 0.0,
+  "activation_function": "silu",
+  "anchor_image_size": null,
+  "architectures": [
+    "RTDetrV2ForObjectDetection"
+  ],
+  "attention_dropout": 0.0,
+  "auxiliary_loss": true,
+  "backbone": null,
+  "backbone_config": {
+    "depths": [
+      3,
+      4,
+      23,
+      3
+    ],
+    "downsample_in_bottleneck": false,
+    "downsample_in_first_stage": false,
+    "embedding_size": 64,
+    "hidden_act": "relu",
+    "hidden_sizes": [
+      256,
+      512,
+      1024,
+      2048
+    ],
+    "layer_type": "bottleneck",
+    "model_type": "rt_detr_resnet",
+    "num_channels": 3,
+    "out_features": [
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "out_indices": [
+      2,
+      3,
+      4
+    ],
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4"
+    ],
+    "torch_dtype": "float32"
+  },
+  "backbone_kwargs": null,
+  "batch_norm_eps": 1e-05,
+  "box_noise_scale": 1.0,
+  "d_model": 256,
+  "decoder_activation_function": "relu",
+  "decoder_attention_heads": 8,
+  "decoder_ffn_dim": 1024,
+  "decoder_in_channels": [
+    384,
+    384,
+    384
+  ],
+  "decoder_layers": 6,
+  "decoder_method": "default",
+  "decoder_n_levels": 3,
+  "decoder_n_points": 4,
+  "decoder_offset_scale": 0.5,
+  "disable_custom_kernels": true,
+  "dropout": 0.0,
+  "encode_proj_layers": [
+    2
+  ],
+  "encoder_activation_function": "gelu",
+  "encoder_attention_heads": 8,
+  "encoder_ffn_dim": 2048,
+  "encoder_hidden_dim": 384,
+  "encoder_in_channels": [
+    512,
+    1024,
+    2048
+  ],
+  "encoder_layers": 1,
+  "eos_coefficient": 0.0001,
+  "eval_size": null,
+  "feat_strides": [
+    8,
+    16,
+    32
+  ],
+  "focal_loss_alpha": 0.75,
+  "focal_loss_gamma": 2.0,
+  "freeze_backbone_batch_norms": true,
+  "hidden_expansion": 1.0,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "initializer_bias_prior_prob": null,
+  "initializer_range": 0.01,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "label_noise_ratio": 0.5,
+  "layer_norm_eps": 1e-05,
+  "learn_initial_query": false,
+  "matcher_alpha": 0.25,
+  "matcher_bbox_cost": 5.0,
+  "matcher_class_cost": 2.0,
+  "matcher_gamma": 2.0,
+  "matcher_giou_cost": 2.0,
+  "model_type": "rt_detr_v2",
+  "normalize_before": false,
+  "num_denoising": 100,
+  "num_feature_levels": 3,
+  "num_queries": 300,
+  "positional_encoding_temperature": 10000,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.0",
+  "use_focal_loss": true,
+  "use_pretrained_backbone": false,
+  "use_timm_backbone": false,
+  "weight_loss_bbox": 5.0,
+  "weight_loss_giou": 2.0,
+  "weight_loss_vfl": 1.0,
+  "with_box_refine": true
+}

checkpoint-2250/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d3a52d4cd4386295eedbfb267bc679eca4b27864d745fff06694c0f9dbf823a6
+size 306699044

checkpoint-2250/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:05dd6b927d3bf0bbfe4b8e26ceb5b08121784a39348599920d07011f715be702
+size 611580433

checkpoint-2250/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "do_convert_annotations": true,
+  "do_normalize": false,
+  "do_pad": false,
+  "do_rescale": true,
+  "do_resize": true,
+  "format": "coco_detection",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "RTDetrImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "pad_size": null,
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 640,
+    "width": 640
+  }
+}

checkpoint-2250/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66787fd9597d0c4424fa82a41803d924585f86f960101c754c7f5cfada26d864
+size 14244

checkpoint-2250/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e97b09861aa673666d96648b6cbe41d7c63c06fd70e7ca49c43cf61264897a95
+size 1064

checkpoint-2250/trainer_state.json ADDED Viewed

	@@ -0,0 +1,349 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 50.0,
+  "eval_steps": 500,
+  "global_step": 2250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 3663.33642578125,
+      "learning_rate": 3.6750000000000003e-07,
+      "loss": 1356.1239,
+      "step": 50
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": 3973.032470703125,
+      "learning_rate": 7.425000000000001e-07,
+      "loss": 1275.5178,
+      "step": 100
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 2667.0068359375,
+      "learning_rate": 1.1174999999999999e-06,
+      "loss": 1123.6059,
+      "step": 150
+    },
+    {
+      "epoch": 4.444444444444445,
+      "grad_norm": 2320.48486328125,
+      "learning_rate": 1.4925000000000001e-06,
+      "loss": 922.8917,
+      "step": 200
+    },
+    {
+      "epoch": 5.555555555555555,
+      "grad_norm": 1611.3345947265625,
+      "learning_rate": 1.8675000000000001e-06,
+      "loss": 714.5638,
+      "step": 250
+    },
+    {
+      "epoch": 6.666666666666667,
+      "grad_norm": 1395.7064208984375,
+      "learning_rate": 2.2425e-06,
+      "loss": 542.1251,
+      "step": 300
+    },
+    {
+      "epoch": 7.777777777777778,
+      "grad_norm": 1019.4860229492188,
+      "learning_rate": 2.6175e-06,
+      "loss": 411.1387,
+      "step": 350
+    },
+    {
+      "epoch": 8.88888888888889,
+      "grad_norm": 888.315185546875,
+      "learning_rate": 2.9925e-06,
+      "loss": 318.4567,
+      "step": 400
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 2590.113525390625,
+      "learning_rate": 3.3675000000000004e-06,
+      "loss": 261.6995,
+      "step": 450
+    },
+    {
+      "epoch": 11.11111111111111,
+      "grad_norm": 711.677734375,
+      "learning_rate": 3.7425e-06,
+      "loss": 220.2936,
+      "step": 500
+    },
+    {
+      "epoch": 12.222222222222221,
+      "grad_norm": 548.9371948242188,
+      "learning_rate": 4.117500000000001e-06,
+      "loss": 187.9833,
+      "step": 550
+    },
+    {
+      "epoch": 13.333333333333334,
+      "grad_norm": 1127.1611328125,
+      "learning_rate": 4.4925e-06,
+      "loss": 159.1351,
+      "step": 600
+    },
+    {
+      "epoch": 14.444444444444445,
+      "grad_norm": 426.074951171875,
+      "learning_rate": 4.8675e-06,
+      "loss": 137.1092,
+      "step": 650
+    },
+    {
+      "epoch": 15.555555555555555,
+      "grad_norm": 348.842529296875,
+      "learning_rate": 5.2425e-06,
+      "loss": 119.822,
+      "step": 700
+    },
+    {
+      "epoch": 16.666666666666668,
+      "grad_norm": 373.2169189453125,
+      "learning_rate": 5.6175e-06,
+      "loss": 104.3366,
+      "step": 750
+    },
+    {
+      "epoch": 17.77777777777778,
+      "grad_norm": 324.51702880859375,
+      "learning_rate": 5.992500000000001e-06,
+      "loss": 90.8788,
+      "step": 800
+    },
+    {
+      "epoch": 18.88888888888889,
+      "grad_norm": 269.91827392578125,
+      "learning_rate": 6.3675e-06,
+      "loss": 78.4644,
+      "step": 850
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 1744.54052734375,
+      "learning_rate": 6.7425e-06,
+      "loss": 70.3526,
+      "step": 900
+    },
+    {
+      "epoch": 21.11111111111111,
+      "grad_norm": 369.39837646484375,
+      "learning_rate": 7.1175e-06,
+      "loss": 63.9417,
+      "step": 950
+    },
+    {
+      "epoch": 22.22222222222222,
+      "grad_norm": 303.95977783203125,
+      "learning_rate": 7.4925e-06,
+      "loss": 63.4575,
+      "step": 1000
+    },
+    {
+      "epoch": 23.333333333333332,
+      "grad_norm": 187.462890625,
+      "learning_rate": 7.8675e-06,
+      "loss": 54.7417,
+      "step": 1050
+    },
+    {
+      "epoch": 24.444444444444443,
+      "grad_norm": 165.56666564941406,
+      "learning_rate": 8.2425e-06,
+      "loss": 49.6842,
+      "step": 1100
+    },
+    {
+      "epoch": 25.555555555555557,
+      "grad_norm": 147.3148193359375,
+      "learning_rate": 8.6175e-06,
+      "loss": 43.027,
+      "step": 1150
+    },
+    {
+      "epoch": 26.666666666666668,
+      "grad_norm": 125.53775024414062,
+      "learning_rate": 8.9925e-06,
+      "loss": 38.2579,
+      "step": 1200
+    },
+    {
+      "epoch": 27.77777777777778,
+      "grad_norm": 109.85810089111328,
+      "learning_rate": 9.367500000000001e-06,
+      "loss": 34.3957,
+      "step": 1250
+    },
+    {
+      "epoch": 28.88888888888889,
+      "grad_norm": 98.328369140625,
+      "learning_rate": 9.7425e-06,
+      "loss": 31.4378,
+      "step": 1300
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 109.77750396728516,
+      "learning_rate": 1.01175e-05,
+      "loss": 28.5084,
+      "step": 1350
+    },
+    {
+      "epoch": 31.11111111111111,
+      "grad_norm": 112.47483825683594,
+      "learning_rate": 1.04925e-05,
+      "loss": 26.1671,
+      "step": 1400
+    },
+    {
+      "epoch": 32.22222222222222,
+      "grad_norm": 85.60242462158203,
+      "learning_rate": 1.08675e-05,
+      "loss": 24.2309,
+      "step": 1450
+    },
+    {
+      "epoch": 33.333333333333336,
+      "grad_norm": 73.19799041748047,
+      "learning_rate": 1.1242500000000001e-05,
+      "loss": 22.6248,
+      "step": 1500
+    },
+    {
+      "epoch": 34.44444444444444,
+      "grad_norm": 80.70884704589844,
+      "learning_rate": 1.16175e-05,
+      "loss": 21.1187,
+      "step": 1550
+    },
+    {
+      "epoch": 35.55555555555556,
+      "grad_norm": 110.98326110839844,
+      "learning_rate": 1.19925e-05,
+      "loss": 20.4828,
+      "step": 1600
+    },
+    {
+      "epoch": 36.666666666666664,
+      "grad_norm": 113.65286254882812,
+      "learning_rate": 1.23675e-05,
+      "loss": 19.7366,
+      "step": 1650
+    },
+    {
+      "epoch": 37.77777777777778,
+      "grad_norm": 77.65855407714844,
+      "learning_rate": 1.27425e-05,
+      "loss": 18.6632,
+      "step": 1700
+    },
+    {
+      "epoch": 38.888888888888886,
+      "grad_norm": 88.96723175048828,
+      "learning_rate": 1.3117500000000001e-05,
+      "loss": 18.0793,
+      "step": 1750
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 79.1690902709961,
+      "learning_rate": 1.34925e-05,
+      "loss": 17.0667,
+      "step": 1800
+    },
+    {
+      "epoch": 41.111111111111114,
+      "grad_norm": 93.60108184814453,
+      "learning_rate": 1.38675e-05,
+      "loss": 16.6106,
+      "step": 1850
+    },
+    {
+      "epoch": 42.22222222222222,
+      "grad_norm": 66.16129302978516,
+      "learning_rate": 1.4242500000000001e-05,
+      "loss": 16.4905,
+      "step": 1900
+    },
+    {
+      "epoch": 43.333333333333336,
+      "grad_norm": 62.41362380981445,
+      "learning_rate": 1.46175e-05,
+      "loss": 15.6427,
+      "step": 1950
+    },
+    {
+      "epoch": 44.44444444444444,
+      "grad_norm": 107.30168151855469,
+      "learning_rate": 1.4992500000000001e-05,
+      "loss": 15.0731,
+      "step": 2000
+    },
+    {
+      "epoch": 45.55555555555556,
+      "grad_norm": 100.18666076660156,
+      "learning_rate": 1.2060000000000001e-05,
+      "loss": 14.6973,
+      "step": 2050
+    },
+    {
+      "epoch": 46.666666666666664,
+      "grad_norm": 68.78209686279297,
+      "learning_rate": 9.06e-06,
+      "loss": 13.8928,
+      "step": 2100
+    },
+    {
+      "epoch": 47.77777777777778,
+      "grad_norm": 65.85958099365234,
+      "learning_rate": 6.0600000000000004e-06,
+      "loss": 13.4318,
+      "step": 2150
+    },
+    {
+      "epoch": 48.888888888888886,
+      "grad_norm": 72.31432342529297,
+      "learning_rate": 3.06e-06,
+      "loss": 12.6856,
+      "step": 2200
+    },
+    {
+      "epoch": 50.0,
+      "grad_norm": 50.271549224853516,
+      "learning_rate": 6.000000000000001e-08,
+      "loss": 12.0923,
+      "step": 2250
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 2250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.060676179197952e+19,
+  "train_batch_size": 24,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2250/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
+size 5368

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c86f420c1f46d6f0b69c1628aa2bc3d17e96bd9bbaa480bfb7f3ac4ba2c3c4e
 size 306699044

 version https://git-lfs.github.com/spec/v1
+oid sha256:d3a52d4cd4386295eedbfb267bc679eca4b27864d745fff06694c0f9dbf823a6
 size 306699044

runs/Aug14_17-42-57_2676026c4495/events.out.tfevents.1755193378.2676026c4495.6591.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6333d2a8e567a7e0ba6820d48e67bd616d919e804c37bbbe27e4f49f18a3f884
+size 16890

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e1ce90134704b5a6d94e1dd2b2ac60499368f272f8eda658e4e1ca0663c44cd
 size 5368

 version https://git-lfs.github.com/spec/v1
+oid sha256:a7b84367094b7487f77de50fba614a6c6667e9cf018b77ee5bfc158268fc5eaf
 size 5368