checkpoint-1715

Browse files

Files changed (8) hide show

config.json +147 -0
model.safetensors +3 -0
optimizer.pt +3 -0
preprocessor_config.json +23 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +1270 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,147 @@

+{
+  "_name_or_path": "microsoft/swinv2-base-patch4-window16-256",
+  "architectures": [
+    "Swinv2ForImageClassification"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "depths": [
+    2,
+    2,
+    18,
+    2
+  ],
+  "drop_path_rate": 0.1,
+  "embed_dim": 128,
+  "encoder_stride": 32,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3",
+    "4": "LABEL_4",
+    "5": "LABEL_5",
+    "6": "LABEL_6",
+    "7": "LABEL_7",
+    "8": "LABEL_8",
+    "9": "LABEL_9",
+    "10": "LABEL_10",
+    "11": "LABEL_11",
+    "12": "LABEL_12",
+    "13": "LABEL_13",
+    "14": "LABEL_14",
+    "15": "LABEL_15",
+    "16": "LABEL_16",
+    "17": "LABEL_17",
+    "18": "LABEL_18",
+    "19": "LABEL_19",
+    "20": "LABEL_20",
+    "21": "LABEL_21",
+    "22": "LABEL_22",
+    "23": "LABEL_23",
+    "24": "LABEL_24",
+    "25": "LABEL_25",
+    "26": "LABEL_26",
+    "27": "LABEL_27",
+    "28": "LABEL_28",
+    "29": "LABEL_29",
+    "30": "LABEL_30",
+    "31": "LABEL_31",
+    "32": "LABEL_32",
+    "33": "LABEL_33",
+    "34": "LABEL_34",
+    "35": "LABEL_35",
+    "36": "LABEL_36",
+    "37": "LABEL_37",
+    "38": "LABEL_38",
+    "39": "LABEL_39",
+    "40": "LABEL_40",
+    "41": "LABEL_41"
+  },
+  "image_size": 256,
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_10": 10,
+    "LABEL_11": 11,
+    "LABEL_12": 12,
+    "LABEL_13": 13,
+    "LABEL_14": 14,
+    "LABEL_15": 15,
+    "LABEL_16": 16,
+    "LABEL_17": 17,
+    "LABEL_18": 18,
+    "LABEL_19": 19,
+    "LABEL_2": 2,
+    "LABEL_20": 20,
+    "LABEL_21": 21,
+    "LABEL_22": 22,
+    "LABEL_23": 23,
+    "LABEL_24": 24,
+    "LABEL_25": 25,
+    "LABEL_26": 26,
+    "LABEL_27": 27,
+    "LABEL_28": 28,
+    "LABEL_29": 29,
+    "LABEL_3": 3,
+    "LABEL_30": 30,
+    "LABEL_31": 31,
+    "LABEL_32": 32,
+    "LABEL_33": 33,
+    "LABEL_34": 34,
+    "LABEL_35": 35,
+    "LABEL_36": 36,
+    "LABEL_37": 37,
+    "LABEL_38": 38,
+    "LABEL_39": 39,
+    "LABEL_4": 4,
+    "LABEL_40": 40,
+    "LABEL_41": 41,
+    "LABEL_5": 5,
+    "LABEL_6": 6,
+    "LABEL_7": 7,
+    "LABEL_8": 8,
+    "LABEL_9": 9
+  },
+  "layer_norm_eps": 1e-05,
+  "mlp_ratio": 4.0,
+  "model_type": "swinv2",
+  "num_channels": 3,
+  "num_heads": [
+    4,
+    8,
+    16,
+    32
+  ],
+  "num_layers": 4,
+  "out_features": [
+    "stage4"
+  ],
+  "out_indices": [
+    4
+  ],
+  "patch_size": 4,
+  "path_norm": true,
+  "pretrained_window_sizes": [
+    0,
+    0,
+    0,
+    0
+  ],
+  "problem_type": "single_label_classification",
+  "qkv_bias": true,
+  "stage_names": [
+    "stem",
+    "stage1",
+    "stage2",
+    "stage3",
+    "stage4"
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.0",
+  "use_absolute_embeddings": false,
+  "window_size": 16
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3ebf02c412843b8598e0c0cc38f1e9573d5664dcf2e89a33fff0bbf0de739a7
+size 347809528

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4672721aeaed7851b718719ba89d07b8faade79816ada69b742235b791c02554
+size 695890973

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "ViTImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 256,
+    "width": 256
+  }
+}

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:92cc70c170169af5d978185b92767499e956a5f46adbffc2bd988ec8371ec7de
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:949b2b0a0f1ad791a5c9823d31ee47a7380d3468ca4b2ac70a9908fee83c9a58
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1270 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 500,
+  "global_step": 1715,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.029154518950437316,
+      "grad_norm": 6.396158218383789,
+      "learning_rate": 4.970845481049563e-05,
+      "loss": 3.6944,
+      "step": 10
+    },
+    {
+      "epoch": 0.05830903790087463,
+      "grad_norm": 5.8870744705200195,
+      "learning_rate": 4.941690962099126e-05,
+      "loss": 3.4646,
+      "step": 20
+    },
+    {
+      "epoch": 0.08746355685131195,
+      "grad_norm": 8.024749755859375,
+      "learning_rate": 4.912536443148688e-05,
+      "loss": 3.2603,
+      "step": 30
+    },
+    {
+      "epoch": 0.11661807580174927,
+      "grad_norm": 12.580345153808594,
+      "learning_rate": 4.883381924198251e-05,
+      "loss": 2.9375,
+      "step": 40
+    },
+    {
+      "epoch": 0.1457725947521866,
+      "grad_norm": 13.44204330444336,
+      "learning_rate": 4.8542274052478136e-05,
+      "loss": 2.663,
+      "step": 50
+    },
+    {
+      "epoch": 0.1749271137026239,
+      "grad_norm": 13.08288288116455,
+      "learning_rate": 4.825072886297377e-05,
+      "loss": 2.4086,
+      "step": 60
+    },
+    {
+      "epoch": 0.20408163265306123,
+      "grad_norm": 14.201903343200684,
+      "learning_rate": 4.795918367346939e-05,
+      "loss": 2.0242,
+      "step": 70
+    },
+    {
+      "epoch": 0.23323615160349853,
+      "grad_norm": 14.91315746307373,
+      "learning_rate": 4.7667638483965015e-05,
+      "loss": 1.8438,
+      "step": 80
+    },
+    {
+      "epoch": 0.26239067055393583,
+      "grad_norm": 14.440930366516113,
+      "learning_rate": 4.7376093294460646e-05,
+      "loss": 1.7379,
+      "step": 90
+    },
+    {
+      "epoch": 0.2915451895043732,
+      "grad_norm": 14.924482345581055,
+      "learning_rate": 4.708454810495627e-05,
+      "loss": 1.3515,
+      "step": 100
+    },
+    {
+      "epoch": 0.3206997084548105,
+      "grad_norm": 15.159282684326172,
+      "learning_rate": 4.6793002915451894e-05,
+      "loss": 1.4493,
+      "step": 110
+    },
+    {
+      "epoch": 0.3498542274052478,
+      "grad_norm": 15.176348686218262,
+      "learning_rate": 4.6501457725947525e-05,
+      "loss": 1.3174,
+      "step": 120
+    },
+    {
+      "epoch": 0.37900874635568516,
+      "grad_norm": 15.329106330871582,
+      "learning_rate": 4.620991253644315e-05,
+      "loss": 1.3243,
+      "step": 130
+    },
+    {
+      "epoch": 0.40816326530612246,
+      "grad_norm": 13.873916625976562,
+      "learning_rate": 4.591836734693878e-05,
+      "loss": 1.0203,
+      "step": 140
+    },
+    {
+      "epoch": 0.43731778425655976,
+      "grad_norm": 11.680057525634766,
+      "learning_rate": 4.5626822157434404e-05,
+      "loss": 1.2277,
+      "step": 150
+    },
+    {
+      "epoch": 0.46647230320699706,
+      "grad_norm": 17.877845764160156,
+      "learning_rate": 4.533527696793003e-05,
+      "loss": 0.9823,
+      "step": 160
+    },
+    {
+      "epoch": 0.4956268221574344,
+      "grad_norm": 12.79694938659668,
+      "learning_rate": 4.504373177842566e-05,
+      "loss": 1.1087,
+      "step": 170
+    },
+    {
+      "epoch": 0.5247813411078717,
+      "grad_norm": 15.178705215454102,
+      "learning_rate": 4.475218658892128e-05,
+      "loss": 0.9267,
+      "step": 180
+    },
+    {
+      "epoch": 0.5539358600583091,
+      "grad_norm": 19.371179580688477,
+      "learning_rate": 4.4460641399416914e-05,
+      "loss": 1.1024,
+      "step": 190
+    },
+    {
+      "epoch": 0.5830903790087464,
+      "grad_norm": 12.616296768188477,
+      "learning_rate": 4.416909620991254e-05,
+      "loss": 0.9355,
+      "step": 200
+    },
+    {
+      "epoch": 0.6122448979591837,
+      "grad_norm": 15.549710273742676,
+      "learning_rate": 4.387755102040816e-05,
+      "loss": 0.8048,
+      "step": 210
+    },
+    {
+      "epoch": 0.641399416909621,
+      "grad_norm": 15.638792037963867,
+      "learning_rate": 4.358600583090379e-05,
+      "loss": 0.8175,
+      "step": 220
+    },
+    {
+      "epoch": 0.6705539358600583,
+      "grad_norm": 12.656390190124512,
+      "learning_rate": 4.3294460641399424e-05,
+      "loss": 1.0265,
+      "step": 230
+    },
+    {
+      "epoch": 0.6997084548104956,
+      "grad_norm": 12.056506156921387,
+      "learning_rate": 4.300291545189505e-05,
+      "loss": 0.8383,
+      "step": 240
+    },
+    {
+      "epoch": 0.7288629737609329,
+      "grad_norm": 13.179421424865723,
+      "learning_rate": 4.271137026239067e-05,
+      "loss": 0.882,
+      "step": 250
+    },
+    {
+      "epoch": 0.7580174927113703,
+      "grad_norm": 11.642322540283203,
+      "learning_rate": 4.2419825072886296e-05,
+      "loss": 0.9375,
+      "step": 260
+    },
+    {
+      "epoch": 0.7871720116618076,
+      "grad_norm": 14.77869987487793,
+      "learning_rate": 4.212827988338193e-05,
+      "loss": 0.721,
+      "step": 270
+    },
+    {
+      "epoch": 0.8163265306122449,
+      "grad_norm": 12.768646240234375,
+      "learning_rate": 4.183673469387756e-05,
+      "loss": 0.7863,
+      "step": 280
+    },
+    {
+      "epoch": 0.8454810495626822,
+      "grad_norm": 17.440149307250977,
+      "learning_rate": 4.1545189504373175e-05,
+      "loss": 0.7854,
+      "step": 290
+    },
+    {
+      "epoch": 0.8746355685131195,
+      "grad_norm": 14.728607177734375,
+      "learning_rate": 4.1253644314868806e-05,
+      "loss": 0.6795,
+      "step": 300
+    },
+    {
+      "epoch": 0.9037900874635568,
+      "grad_norm": 13.96823787689209,
+      "learning_rate": 4.0962099125364436e-05,
+      "loss": 0.8244,
+      "step": 310
+    },
+    {
+      "epoch": 0.9329446064139941,
+      "grad_norm": 14.88004207611084,
+      "learning_rate": 4.067055393586006e-05,
+      "loss": 0.8082,
+      "step": 320
+    },
+    {
+      "epoch": 0.9620991253644315,
+      "grad_norm": 12.001025199890137,
+      "learning_rate": 4.0379008746355685e-05,
+      "loss": 0.7094,
+      "step": 330
+    },
+    {
+      "epoch": 0.9912536443148688,
+      "grad_norm": 20.970056533813477,
+      "learning_rate": 4.0087463556851315e-05,
+      "loss": 0.8157,
+      "step": 340
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.7410290837287903,
+      "eval_runtime": 346.7173,
+      "eval_samples_per_second": 7.888,
+      "eval_steps_per_second": 0.248,
+      "step": 343
+    },
+    {
+      "epoch": 1.0204081632653061,
+      "grad_norm": 8.203807830810547,
+      "learning_rate": 3.979591836734694e-05,
+      "loss": 0.4442,
+      "step": 350
+    },
+    {
+      "epoch": 1.0495626822157433,
+      "grad_norm": 9.512131690979004,
+      "learning_rate": 3.950437317784257e-05,
+      "loss": 0.4501,
+      "step": 360
+    },
+    {
+      "epoch": 1.0787172011661808,
+      "grad_norm": 11.348304748535156,
+      "learning_rate": 3.9212827988338194e-05,
+      "loss": 0.5578,
+      "step": 370
+    },
+    {
+      "epoch": 1.1078717201166182,
+      "grad_norm": 16.04487419128418,
+      "learning_rate": 3.892128279883382e-05,
+      "loss": 0.5277,
+      "step": 380
+    },
+    {
+      "epoch": 1.1370262390670554,
+      "grad_norm": 11.712241172790527,
+      "learning_rate": 3.862973760932945e-05,
+      "loss": 0.4313,
+      "step": 390
+    },
+    {
+      "epoch": 1.1661807580174928,
+      "grad_norm": 10.177406311035156,
+      "learning_rate": 3.833819241982507e-05,
+      "loss": 0.4427,
+      "step": 400
+    },
+    {
+      "epoch": 1.19533527696793,
+      "grad_norm": 10.864590644836426,
+      "learning_rate": 3.8046647230320704e-05,
+      "loss": 0.4545,
+      "step": 410
+    },
+    {
+      "epoch": 1.2244897959183674,
+      "grad_norm": 27.668434143066406,
+      "learning_rate": 3.775510204081633e-05,
+      "loss": 0.3945,
+      "step": 420
+    },
+    {
+      "epoch": 1.2536443148688048,
+      "grad_norm": 12.273193359375,
+      "learning_rate": 3.746355685131195e-05,
+      "loss": 0.4442,
+      "step": 430
+    },
+    {
+      "epoch": 1.282798833819242,
+      "grad_norm": 10.925050735473633,
+      "learning_rate": 3.717201166180758e-05,
+      "loss": 0.3749,
+      "step": 440
+    },
+    {
+      "epoch": 1.3119533527696792,
+      "grad_norm": 15.893428802490234,
+      "learning_rate": 3.688046647230321e-05,
+      "loss": 0.3903,
+      "step": 450
+    },
+    {
+      "epoch": 1.3411078717201166,
+      "grad_norm": 17.293563842773438,
+      "learning_rate": 3.658892128279884e-05,
+      "loss": 0.5249,
+      "step": 460
+    },
+    {
+      "epoch": 1.370262390670554,
+      "grad_norm": 10.63139820098877,
+      "learning_rate": 3.629737609329446e-05,
+      "loss": 0.4097,
+      "step": 470
+    },
+    {
+      "epoch": 1.3994169096209912,
+      "grad_norm": 10.485889434814453,
+      "learning_rate": 3.6005830903790086e-05,
+      "loss": 0.4057,
+      "step": 480
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 15.918136596679688,
+      "learning_rate": 3.571428571428572e-05,
+      "loss": 0.333,
+      "step": 490
+    },
+    {
+      "epoch": 1.4577259475218658,
+      "grad_norm": 5.628406047821045,
+      "learning_rate": 3.542274052478135e-05,
+      "loss": 0.4202,
+      "step": 500
+    },
+    {
+      "epoch": 1.4868804664723032,
+      "grad_norm": 8.147640228271484,
+      "learning_rate": 3.5131195335276965e-05,
+      "loss": 0.3755,
+      "step": 510
+    },
+    {
+      "epoch": 1.5160349854227406,
+      "grad_norm": 11.122663497924805,
+      "learning_rate": 3.4839650145772596e-05,
+      "loss": 0.3533,
+      "step": 520
+    },
+    {
+      "epoch": 1.5451895043731778,
+      "grad_norm": 8.978357315063477,
+      "learning_rate": 3.454810495626823e-05,
+      "loss": 0.3938,
+      "step": 530
+    },
+    {
+      "epoch": 1.574344023323615,
+      "grad_norm": 13.574569702148438,
+      "learning_rate": 3.425655976676385e-05,
+      "loss": 0.4004,
+      "step": 540
+    },
+    {
+      "epoch": 1.6034985422740524,
+      "grad_norm": 12.93692398071289,
+      "learning_rate": 3.3965014577259475e-05,
+      "loss": 0.3081,
+      "step": 550
+    },
+    {
+      "epoch": 1.6326530612244898,
+      "grad_norm": 9.078178405761719,
+      "learning_rate": 3.36734693877551e-05,
+      "loss": 0.3557,
+      "step": 560
+    },
+    {
+      "epoch": 1.6618075801749272,
+      "grad_norm": 7.011285305023193,
+      "learning_rate": 3.338192419825073e-05,
+      "loss": 0.3968,
+      "step": 570
+    },
+    {
+      "epoch": 1.6909620991253644,
+      "grad_norm": 12.261427879333496,
+      "learning_rate": 3.309037900874636e-05,
+      "loss": 0.349,
+      "step": 580
+    },
+    {
+      "epoch": 1.7201166180758016,
+      "grad_norm": 7.620883464813232,
+      "learning_rate": 3.2798833819241985e-05,
+      "loss": 0.3861,
+      "step": 590
+    },
+    {
+      "epoch": 1.749271137026239,
+      "grad_norm": 7.384204864501953,
+      "learning_rate": 3.250728862973761e-05,
+      "loss": 0.3951,
+      "step": 600
+    },
+    {
+      "epoch": 1.7784256559766765,
+      "grad_norm": 11.785451889038086,
+      "learning_rate": 3.221574344023324e-05,
+      "loss": 0.4344,
+      "step": 610
+    },
+    {
+      "epoch": 1.8075801749271136,
+      "grad_norm": 9.102931022644043,
+      "learning_rate": 3.1924198250728864e-05,
+      "loss": 0.3569,
+      "step": 620
+    },
+    {
+      "epoch": 1.836734693877551,
+      "grad_norm": 12.737972259521484,
+      "learning_rate": 3.1632653061224494e-05,
+      "loss": 0.4627,
+      "step": 630
+    },
+    {
+      "epoch": 1.8658892128279883,
+      "grad_norm": 6.976524829864502,
+      "learning_rate": 3.134110787172012e-05,
+      "loss": 0.4338,
+      "step": 640
+    },
+    {
+      "epoch": 1.8950437317784257,
+      "grad_norm": 10.488119125366211,
+      "learning_rate": 3.104956268221574e-05,
+      "loss": 0.4369,
+      "step": 650
+    },
+    {
+      "epoch": 1.924198250728863,
+      "grad_norm": 13.129137992858887,
+      "learning_rate": 3.0758017492711373e-05,
+      "loss": 0.4023,
+      "step": 660
+    },
+    {
+      "epoch": 1.9533527696793003,
+      "grad_norm": 11.848146438598633,
+      "learning_rate": 3.0466472303207e-05,
+      "loss": 0.3211,
+      "step": 670
+    },
+    {
+      "epoch": 1.9825072886297375,
+      "grad_norm": 13.786744117736816,
+      "learning_rate": 3.017492711370263e-05,
+      "loss": 0.4545,
+      "step": 680
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.6418492794036865,
+      "eval_runtime": 344.1102,
+      "eval_samples_per_second": 7.948,
+      "eval_steps_per_second": 0.25,
+      "step": 686
+    },
+    {
+      "epoch": 2.011661807580175,
+      "grad_norm": 7.113315582275391,
+      "learning_rate": 2.988338192419825e-05,
+      "loss": 0.3137,
+      "step": 690
+    },
+    {
+      "epoch": 2.0408163265306123,
+      "grad_norm": 9.555095672607422,
+      "learning_rate": 2.959183673469388e-05,
+      "loss": 0.1409,
+      "step": 700
+    },
+    {
+      "epoch": 2.0699708454810497,
+      "grad_norm": 14.719175338745117,
+      "learning_rate": 2.9300291545189507e-05,
+      "loss": 0.1807,
+      "step": 710
+    },
+    {
+      "epoch": 2.0991253644314867,
+      "grad_norm": 6.63716459274292,
+      "learning_rate": 2.9008746355685135e-05,
+      "loss": 0.1116,
+      "step": 720
+    },
+    {
+      "epoch": 2.128279883381924,
+      "grad_norm": 4.129220962524414,
+      "learning_rate": 2.871720116618076e-05,
+      "loss": 0.1921,
+      "step": 730
+    },
+    {
+      "epoch": 2.1574344023323615,
+      "grad_norm": 7.979690074920654,
+      "learning_rate": 2.8425655976676386e-05,
+      "loss": 0.1378,
+      "step": 740
+    },
+    {
+      "epoch": 2.186588921282799,
+      "grad_norm": 11.688068389892578,
+      "learning_rate": 2.8134110787172014e-05,
+      "loss": 0.1304,
+      "step": 750
+    },
+    {
+      "epoch": 2.2157434402332363,
+      "grad_norm": 9.168377876281738,
+      "learning_rate": 2.784256559766764e-05,
+      "loss": 0.1774,
+      "step": 760
+    },
+    {
+      "epoch": 2.2448979591836733,
+      "grad_norm": 14.069756507873535,
+      "learning_rate": 2.7551020408163265e-05,
+      "loss": 0.1482,
+      "step": 770
+    },
+    {
+      "epoch": 2.2740524781341107,
+      "grad_norm": 2.5552010536193848,
+      "learning_rate": 2.7259475218658893e-05,
+      "loss": 0.1194,
+      "step": 780
+    },
+    {
+      "epoch": 2.303206997084548,
+      "grad_norm": 7.534204959869385,
+      "learning_rate": 2.696793002915452e-05,
+      "loss": 0.1321,
+      "step": 790
+    },
+    {
+      "epoch": 2.3323615160349855,
+      "grad_norm": 6.635725021362305,
+      "learning_rate": 2.6676384839650148e-05,
+      "loss": 0.0902,
+      "step": 800
+    },
+    {
+      "epoch": 2.3615160349854225,
+      "grad_norm": 4.213601589202881,
+      "learning_rate": 2.6384839650145775e-05,
+      "loss": 0.182,
+      "step": 810
+    },
+    {
+      "epoch": 2.39067055393586,
+      "grad_norm": 11.590983390808105,
+      "learning_rate": 2.60932944606414e-05,
+      "loss": 0.1745,
+      "step": 820
+    },
+    {
+      "epoch": 2.4198250728862973,
+      "grad_norm": 9.884819984436035,
+      "learning_rate": 2.5801749271137027e-05,
+      "loss": 0.1733,
+      "step": 830
+    },
+    {
+      "epoch": 2.4489795918367347,
+      "grad_norm": 8.458849906921387,
+      "learning_rate": 2.5510204081632654e-05,
+      "loss": 0.1307,
+      "step": 840
+    },
+    {
+      "epoch": 2.478134110787172,
+      "grad_norm": 7.574510097503662,
+      "learning_rate": 2.521865889212828e-05,
+      "loss": 0.1253,
+      "step": 850
+    },
+    {
+      "epoch": 2.5072886297376096,
+      "grad_norm": 9.187530517578125,
+      "learning_rate": 2.492711370262391e-05,
+      "loss": 0.1538,
+      "step": 860
+    },
+    {
+      "epoch": 2.5364431486880465,
+      "grad_norm": 5.005228519439697,
+      "learning_rate": 2.4635568513119533e-05,
+      "loss": 0.117,
+      "step": 870
+    },
+    {
+      "epoch": 2.565597667638484,
+      "grad_norm": 3.9266200065612793,
+      "learning_rate": 2.434402332361516e-05,
+      "loss": 0.1449,
+      "step": 880
+    },
+    {
+      "epoch": 2.5947521865889214,
+      "grad_norm": 8.087149620056152,
+      "learning_rate": 2.405247813411079e-05,
+      "loss": 0.1652,
+      "step": 890
+    },
+    {
+      "epoch": 2.6239067055393583,
+      "grad_norm": 13.278199195861816,
+      "learning_rate": 2.3760932944606415e-05,
+      "loss": 0.1834,
+      "step": 900
+    },
+    {
+      "epoch": 2.6530612244897958,
+      "grad_norm": 9.696410179138184,
+      "learning_rate": 2.3469387755102043e-05,
+      "loss": 0.1033,
+      "step": 910
+    },
+    {
+      "epoch": 2.682215743440233,
+      "grad_norm": 14.884827613830566,
+      "learning_rate": 2.3177842565597667e-05,
+      "loss": 0.2509,
+      "step": 920
+    },
+    {
+      "epoch": 2.7113702623906706,
+      "grad_norm": 7.274765968322754,
+      "learning_rate": 2.2886297376093298e-05,
+      "loss": 0.1497,
+      "step": 930
+    },
+    {
+      "epoch": 2.740524781341108,
+      "grad_norm": 11.48387622833252,
+      "learning_rate": 2.2594752186588922e-05,
+      "loss": 0.1106,
+      "step": 940
+    },
+    {
+      "epoch": 2.7696793002915454,
+      "grad_norm": 7.414773941040039,
+      "learning_rate": 2.230320699708455e-05,
+      "loss": 0.167,
+      "step": 950
+    },
+    {
+      "epoch": 2.7988338192419824,
+      "grad_norm": 11.676288604736328,
+      "learning_rate": 2.2011661807580177e-05,
+      "loss": 0.1707,
+      "step": 960
+    },
+    {
+      "epoch": 2.82798833819242,
+      "grad_norm": 7.887070178985596,
+      "learning_rate": 2.1720116618075804e-05,
+      "loss": 0.1284,
+      "step": 970
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 16.793603897094727,
+      "learning_rate": 2.1428571428571428e-05,
+      "loss": 0.1464,
+      "step": 980
+    },
+    {
+      "epoch": 2.8862973760932946,
+      "grad_norm": 11.829538345336914,
+      "learning_rate": 2.1137026239067056e-05,
+      "loss": 0.109,
+      "step": 990
+    },
+    {
+      "epoch": 2.9154518950437316,
+      "grad_norm": 6.482255458831787,
+      "learning_rate": 2.0845481049562683e-05,
+      "loss": 0.1162,
+      "step": 1000
+    },
+    {
+      "epoch": 2.944606413994169,
+      "grad_norm": 10.546632766723633,
+      "learning_rate": 2.055393586005831e-05,
+      "loss": 0.1181,
+      "step": 1010
+    },
+    {
+      "epoch": 2.9737609329446064,
+      "grad_norm": 3.5399889945983887,
+      "learning_rate": 2.0262390670553938e-05,
+      "loss": 0.1005,
+      "step": 1020
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.680114209651947,
+      "eval_runtime": 335.8694,
+      "eval_samples_per_second": 8.143,
+      "eval_steps_per_second": 0.256,
+      "step": 1029
+    },
+    {
+      "epoch": 3.002915451895044,
+      "grad_norm": 3.2238965034484863,
+      "learning_rate": 1.9970845481049562e-05,
+      "loss": 0.166,
+      "step": 1030
+    },
+    {
+      "epoch": 3.0320699708454812,
+      "grad_norm": 2.8589000701904297,
+      "learning_rate": 1.9679300291545193e-05,
+      "loss": 0.0457,
+      "step": 1040
+    },
+    {
+      "epoch": 3.061224489795918,
+      "grad_norm": 2.0926849842071533,
+      "learning_rate": 1.9387755102040817e-05,
+      "loss": 0.0497,
+      "step": 1050
+    },
+    {
+      "epoch": 3.0903790087463556,
+      "grad_norm": 10.41883659362793,
+      "learning_rate": 1.9096209912536444e-05,
+      "loss": 0.0426,
+      "step": 1060
+    },
+    {
+      "epoch": 3.119533527696793,
+      "grad_norm": 4.982727527618408,
+      "learning_rate": 1.880466472303207e-05,
+      "loss": 0.0521,
+      "step": 1070
+    },
+    {
+      "epoch": 3.1486880466472305,
+      "grad_norm": 2.4037177562713623,
+      "learning_rate": 1.85131195335277e-05,
+      "loss": 0.0271,
+      "step": 1080
+    },
+    {
+      "epoch": 3.1778425655976674,
+      "grad_norm": 16.846464157104492,
+      "learning_rate": 1.8221574344023323e-05,
+      "loss": 0.0753,
+      "step": 1090
+    },
+    {
+      "epoch": 3.206997084548105,
+      "grad_norm": 0.7762933373451233,
+      "learning_rate": 1.793002915451895e-05,
+      "loss": 0.0203,
+      "step": 1100
+    },
+    {
+      "epoch": 3.2361516034985423,
+      "grad_norm": 0.7675560712814331,
+      "learning_rate": 1.7638483965014578e-05,
+      "loss": 0.037,
+      "step": 1110
+    },
+    {
+      "epoch": 3.2653061224489797,
+      "grad_norm": 6.022889137268066,
+      "learning_rate": 1.7346938775510206e-05,
+      "loss": 0.0512,
+      "step": 1120
+    },
+    {
+      "epoch": 3.294460641399417,
+      "grad_norm": 2.28266978263855,
+      "learning_rate": 1.7055393586005833e-05,
+      "loss": 0.0594,
+      "step": 1130
+    },
+    {
+      "epoch": 3.323615160349854,
+      "grad_norm": 2.531623363494873,
+      "learning_rate": 1.6763848396501457e-05,
+      "loss": 0.0747,
+      "step": 1140
+    },
+    {
+      "epoch": 3.3527696793002915,
+      "grad_norm": 3.3899452686309814,
+      "learning_rate": 1.6472303206997085e-05,
+      "loss": 0.0716,
+      "step": 1150
+    },
+    {
+      "epoch": 3.381924198250729,
+      "grad_norm": 1.235809326171875,
+      "learning_rate": 1.6180758017492712e-05,
+      "loss": 0.0407,
+      "step": 1160
+    },
+    {
+      "epoch": 3.4110787172011663,
+      "grad_norm": 2.9576103687286377,
+      "learning_rate": 1.588921282798834e-05,
+      "loss": 0.0434,
+      "step": 1170
+    },
+    {
+      "epoch": 3.4402332361516033,
+      "grad_norm": 4.579357624053955,
+      "learning_rate": 1.5597667638483964e-05,
+      "loss": 0.0424,
+      "step": 1180
+    },
+    {
+      "epoch": 3.4693877551020407,
+      "grad_norm": 3.1186563968658447,
+      "learning_rate": 1.5306122448979594e-05,
+      "loss": 0.0269,
+      "step": 1190
+    },
+    {
+      "epoch": 3.498542274052478,
+      "grad_norm": 6.906335830688477,
+      "learning_rate": 1.5014577259475218e-05,
+      "loss": 0.031,
+      "step": 1200
+    },
+    {
+      "epoch": 3.5276967930029155,
+      "grad_norm": 1.3179532289505005,
+      "learning_rate": 1.4723032069970846e-05,
+      "loss": 0.0356,
+      "step": 1210
+    },
+    {
+      "epoch": 3.556851311953353,
+      "grad_norm": 2.701486825942993,
+      "learning_rate": 1.4431486880466475e-05,
+      "loss": 0.0472,
+      "step": 1220
+    },
+    {
+      "epoch": 3.5860058309037903,
+      "grad_norm": 0.8598970174789429,
+      "learning_rate": 1.41399416909621e-05,
+      "loss": 0.039,
+      "step": 1230
+    },
+    {
+      "epoch": 3.6151603498542273,
+      "grad_norm": 3.872262954711914,
+      "learning_rate": 1.3848396501457728e-05,
+      "loss": 0.0296,
+      "step": 1240
+    },
+    {
+      "epoch": 3.6443148688046647,
+      "grad_norm": 1.517674446105957,
+      "learning_rate": 1.3556851311953352e-05,
+      "loss": 0.0306,
+      "step": 1250
+    },
+    {
+      "epoch": 3.673469387755102,
+      "grad_norm": 2.6350300312042236,
+      "learning_rate": 1.3265306122448982e-05,
+      "loss": 0.0475,
+      "step": 1260
+    },
+    {
+      "epoch": 3.702623906705539,
+      "grad_norm": 9.471416473388672,
+      "learning_rate": 1.2973760932944606e-05,
+      "loss": 0.0366,
+      "step": 1270
+    },
+    {
+      "epoch": 3.7317784256559765,
+      "grad_norm": 8.837738037109375,
+      "learning_rate": 1.2682215743440235e-05,
+      "loss": 0.0493,
+      "step": 1280
+    },
+    {
+      "epoch": 3.760932944606414,
+      "grad_norm": 3.3727469444274902,
+      "learning_rate": 1.239067055393586e-05,
+      "loss": 0.0271,
+      "step": 1290
+    },
+    {
+      "epoch": 3.7900874635568513,
+      "grad_norm": 2.860428810119629,
+      "learning_rate": 1.2099125364431488e-05,
+      "loss": 0.0574,
+      "step": 1300
+    },
+    {
+      "epoch": 3.8192419825072887,
+      "grad_norm": 12.31369400024414,
+      "learning_rate": 1.1807580174927114e-05,
+      "loss": 0.0567,
+      "step": 1310
+    },
+    {
+      "epoch": 3.848396501457726,
+      "grad_norm": 0.33458849787712097,
+      "learning_rate": 1.1516034985422741e-05,
+      "loss": 0.0418,
+      "step": 1320
+    },
+    {
+      "epoch": 3.877551020408163,
+      "grad_norm": 3.6039276123046875,
+      "learning_rate": 1.1224489795918369e-05,
+      "loss": 0.0258,
+      "step": 1330
+    },
+    {
+      "epoch": 3.9067055393586005,
+      "grad_norm": 2.461503505706787,
+      "learning_rate": 1.0932944606413994e-05,
+      "loss": 0.024,
+      "step": 1340
+    },
+    {
+      "epoch": 3.935860058309038,
+      "grad_norm": 1.2435688972473145,
+      "learning_rate": 1.0641399416909622e-05,
+      "loss": 0.0503,
+      "step": 1350
+    },
+    {
+      "epoch": 3.9650145772594754,
+      "grad_norm": 6.292943477630615,
+      "learning_rate": 1.0349854227405248e-05,
+      "loss": 0.037,
+      "step": 1360
+    },
+    {
+      "epoch": 3.9941690962099123,
+      "grad_norm": 5.214807033538818,
+      "learning_rate": 1.0058309037900875e-05,
+      "loss": 0.03,
+      "step": 1370
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.7046686410903931,
+      "eval_runtime": 338.8017,
+      "eval_samples_per_second": 8.073,
+      "eval_steps_per_second": 0.254,
+      "step": 1372
+    },
+    {
+      "epoch": 4.02332361516035,
+      "grad_norm": 0.912588894367218,
+      "learning_rate": 9.7667638483965e-06,
+      "loss": 0.0185,
+      "step": 1380
+    },
+    {
+      "epoch": 4.052478134110787,
+      "grad_norm": 0.4329352378845215,
+      "learning_rate": 9.47521865889213e-06,
+      "loss": 0.0049,
+      "step": 1390
+    },
+    {
+      "epoch": 4.081632653061225,
+      "grad_norm": 0.5743452906608582,
+      "learning_rate": 9.183673469387756e-06,
+      "loss": 0.0129,
+      "step": 1400
+    },
+    {
+      "epoch": 4.110787172011662,
+      "grad_norm": 0.7096942663192749,
+      "learning_rate": 8.892128279883383e-06,
+      "loss": 0.0108,
+      "step": 1410
+    },
+    {
+      "epoch": 4.139941690962099,
+      "grad_norm": 2.971156120300293,
+      "learning_rate": 8.600583090379009e-06,
+      "loss": 0.0171,
+      "step": 1420
+    },
+    {
+      "epoch": 4.169096209912537,
+      "grad_norm": 1.4994933605194092,
+      "learning_rate": 8.309037900874636e-06,
+      "loss": 0.0203,
+      "step": 1430
+    },
+    {
+      "epoch": 4.198250728862973,
+      "grad_norm": 2.779597759246826,
+      "learning_rate": 8.017492711370262e-06,
+      "loss": 0.0115,
+      "step": 1440
+    },
+    {
+      "epoch": 4.227405247813411,
+      "grad_norm": 1.1518133878707886,
+      "learning_rate": 7.72594752186589e-06,
+      "loss": 0.0099,
+      "step": 1450
+    },
+    {
+      "epoch": 4.256559766763848,
+      "grad_norm": 0.18404270708560944,
+      "learning_rate": 7.434402332361516e-06,
+      "loss": 0.0123,
+      "step": 1460
+    },
+    {
+      "epoch": 4.285714285714286,
+      "grad_norm": 0.3961102366447449,
+      "learning_rate": 7.142857142857143e-06,
+      "loss": 0.0082,
+      "step": 1470
+    },
+    {
+      "epoch": 4.314868804664723,
+      "grad_norm": 0.31849414110183716,
+      "learning_rate": 6.851311953352769e-06,
+      "loss": 0.0062,
+      "step": 1480
+    },
+    {
+      "epoch": 4.34402332361516,
+      "grad_norm": 0.35347217321395874,
+      "learning_rate": 6.559766763848396e-06,
+      "loss": 0.0091,
+      "step": 1490
+    },
+    {
+      "epoch": 4.373177842565598,
+      "grad_norm": 1.1199325323104858,
+      "learning_rate": 6.268221574344024e-06,
+      "loss": 0.0088,
+      "step": 1500
+    },
+    {
+      "epoch": 4.402332361516035,
+      "grad_norm": 0.714579701423645,
+      "learning_rate": 5.97667638483965e-06,
+      "loss": 0.0106,
+      "step": 1510
+    },
+    {
+      "epoch": 4.431486880466473,
+      "grad_norm": 0.4836759865283966,
+      "learning_rate": 5.685131195335277e-06,
+      "loss": 0.0062,
+      "step": 1520
+    },
+    {
+      "epoch": 4.460641399416909,
+      "grad_norm": 1.2272969484329224,
+      "learning_rate": 5.393586005830904e-06,
+      "loss": 0.0134,
+      "step": 1530
+    },
+    {
+      "epoch": 4.489795918367347,
+      "grad_norm": 1.6942965984344482,
+      "learning_rate": 5.102040816326531e-06,
+      "loss": 0.0158,
+      "step": 1540
+    },
+    {
+      "epoch": 4.518950437317784,
+      "grad_norm": 7.356329917907715,
+      "learning_rate": 4.810495626822157e-06,
+      "loss": 0.014,
+      "step": 1550
+    },
+    {
+      "epoch": 4.548104956268221,
+      "grad_norm": 0.13148389756679535,
+      "learning_rate": 4.518950437317785e-06,
+      "loss": 0.0078,
+      "step": 1560
+    },
+    {
+      "epoch": 4.577259475218659,
+      "grad_norm": 0.39737656712532043,
+      "learning_rate": 4.227405247813411e-06,
+      "loss": 0.0073,
+      "step": 1570
+    },
+    {
+      "epoch": 4.606413994169096,
+      "grad_norm": 4.271286964416504,
+      "learning_rate": 3.935860058309039e-06,
+      "loss": 0.0076,
+      "step": 1580
+    },
+    {
+      "epoch": 4.635568513119534,
+      "grad_norm": 0.39406758546829224,
+      "learning_rate": 3.644314868804665e-06,
+      "loss": 0.0108,
+      "step": 1590
+    },
+    {
+      "epoch": 4.664723032069971,
+      "grad_norm": 0.44297096133232117,
+      "learning_rate": 3.352769679300292e-06,
+      "loss": 0.0064,
+      "step": 1600
+    },
+    {
+      "epoch": 4.6938775510204085,
+      "grad_norm": 3.877448558807373,
+      "learning_rate": 3.0612244897959185e-06,
+      "loss": 0.0094,
+      "step": 1610
+    },
+    {
+      "epoch": 4.723032069970845,
+      "grad_norm": 0.836543619632721,
+      "learning_rate": 2.7696793002915456e-06,
+      "loss": 0.0093,
+      "step": 1620
+    },
+    {
+      "epoch": 4.752186588921282,
+      "grad_norm": 3.5220887660980225,
+      "learning_rate": 2.478134110787172e-06,
+      "loss": 0.0069,
+      "step": 1630
+    },
+    {
+      "epoch": 4.78134110787172,
+      "grad_norm": 0.4554370641708374,
+      "learning_rate": 2.1865889212827988e-06,
+      "loss": 0.0108,
+      "step": 1640
+    },
+    {
+      "epoch": 4.810495626822157,
+      "grad_norm": 1.146256923675537,
+      "learning_rate": 1.8950437317784258e-06,
+      "loss": 0.0063,
+      "step": 1650
+    },
+    {
+      "epoch": 4.839650145772595,
+      "grad_norm": 0.614227294921875,
+      "learning_rate": 1.6034985422740526e-06,
+      "loss": 0.0117,
+      "step": 1660
+    },
+    {
+      "epoch": 4.868804664723032,
+      "grad_norm": 1.8233929872512817,
+      "learning_rate": 1.3119533527696794e-06,
+      "loss": 0.0094,
+      "step": 1670
+    },
+    {
+      "epoch": 4.8979591836734695,
+      "grad_norm": 1.0853904485702515,
+      "learning_rate": 1.020408163265306e-06,
+      "loss": 0.0105,
+      "step": 1680
+    },
+    {
+      "epoch": 4.927113702623907,
+      "grad_norm": 0.5117106437683105,
+      "learning_rate": 7.28862973760933e-07,
+      "loss": 0.0064,
+      "step": 1690
+    },
+    {
+      "epoch": 4.956268221574344,
+      "grad_norm": 1.4317443370819092,
+      "learning_rate": 4.373177842565598e-07,
+      "loss": 0.0051,
+      "step": 1700
+    },
+    {
+      "epoch": 4.985422740524781,
+      "grad_norm": 1.4140545129776,
+      "learning_rate": 1.457725947521866e-07,
+      "loss": 0.009,
+      "step": 1710
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.6613177061080933,
+      "eval_runtime": 338.7385,
+      "eval_samples_per_second": 8.074,
+      "eval_steps_per_second": 0.254,
+      "step": 1715
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1715,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.624623496158249e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e405555b8a5888fab9a92bf624aac0b7ed3325bfaab4271a6d8bdbfb34fc7012
+size 5304