Upload 10 files
Browse files- config.json +2 -2
- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- tokenizer_config.json +1 -1
- trainer_state.json +2577 -3
- training_args.bin +2 -2
config.json
CHANGED
|
@@ -14,6 +14,7 @@
|
|
| 14 |
"cls_token_id": 50281,
|
| 15 |
"decoder_bias": true,
|
| 16 |
"deterministic_flash_attn": false,
|
|
|
|
| 17 |
"embedding_dropout": 0.0,
|
| 18 |
"eos_token_id": 50282,
|
| 19 |
"global_attn_every_n_layers": 3,
|
|
@@ -42,7 +43,6 @@
|
|
| 42 |
"sep_token_id": 50282,
|
| 43 |
"sparse_pred_ignore_index": -100,
|
| 44 |
"sparse_prediction": false,
|
| 45 |
-
"
|
| 46 |
-
"transformers_version": "4.51.3",
|
| 47 |
"vocab_size": 50368
|
| 48 |
}
|
|
|
|
| 14 |
"cls_token_id": 50281,
|
| 15 |
"decoder_bias": true,
|
| 16 |
"deterministic_flash_attn": false,
|
| 17 |
+
"dtype": "float32",
|
| 18 |
"embedding_dropout": 0.0,
|
| 19 |
"eos_token_id": 50282,
|
| 20 |
"global_attn_every_n_layers": 3,
|
|
|
|
| 43 |
"sep_token_id": 50282,
|
| 44 |
"sparse_pred_ignore_index": -100,
|
| 45 |
"sparse_prediction": false,
|
| 46 |
+
"transformers_version": "4.56.1",
|
|
|
|
| 47 |
"vocab_size": 50368
|
| 48 |
}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 598635032
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4baee20d911bded3ac972714a9c339be4051aac75f3be17c5dd47c3bb0a04e63
|
| 3 |
size 598635032
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1197359627
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dffb20f09d581f1a8db94110ac7014fac958626dee3c29e960da4cb1c9f38e85
|
| 3 |
size 1197359627
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db4c787397c7bd17a5fb6bef85caf0ed539cdf41b0fe201e17b766ab049c2a38
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:528826677c74cc85ac3103f0a7ddc5d791ae235096533cc53310113c112a4947
|
| 3 |
size 1465
|
tokenizer_config.json
CHANGED
|
@@ -940,6 +940,6 @@
|
|
| 940 |
"model_max_length": 512,
|
| 941 |
"pad_token": "[PAD]",
|
| 942 |
"sep_token": "[SEP]",
|
| 943 |
-
"tokenizer_class": "
|
| 944 |
"unk_token": "[UNK]"
|
| 945 |
}
|
|
|
|
| 940 |
"model_max_length": 512,
|
| 941 |
"pad_token": "[PAD]",
|
| 942 |
"sep_token": "[SEP]",
|
| 943 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 944 |
"unk_token": "[UNK]"
|
| 945 |
}
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 1000,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10467,6 +10467,2580 @@
|
|
| 10467 |
"eval_samples_per_second": 196.558,
|
| 10468 |
"eval_steps_per_second": 1.543,
|
| 10469 |
"step": 134000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10470 |
}
|
| 10471 |
],
|
| 10472 |
"logging_steps": 100,
|
|
@@ -10486,7 +13060,7 @@
|
|
| 10486 |
"attributes": {}
|
| 10487 |
}
|
| 10488 |
},
|
| 10489 |
-
"total_flos": 1.
|
| 10490 |
"train_batch_size": 128,
|
| 10491 |
"trial_name": null,
|
| 10492 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.0195357196680044,
|
| 6 |
"eval_steps": 1000,
|
| 7 |
+
"global_step": 167000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10467 |
"eval_samples_per_second": 196.558,
|
| 10468 |
"eval_steps_per_second": 1.543,
|
| 10469 |
"step": 134000
|
| 10470 |
+
},
|
| 10471 |
+
{
|
| 10472 |
+
"epoch": 0.00306989880497212,
|
| 10473 |
+
"grad_norm": 2.9615020751953125,
|
| 10474 |
+
"learning_rate": 3.528037405343427e-05,
|
| 10475 |
+
"loss": 3.6576,
|
| 10476 |
+
"step": 134100
|
| 10477 |
+
},
|
| 10478 |
+
{
|
| 10479 |
+
"epoch": 0.0033489805145150396,
|
| 10480 |
+
"grad_norm": 3.0420079231262207,
|
| 10481 |
+
"learning_rate": 3.52601881252502e-05,
|
| 10482 |
+
"loss": 3.6607,
|
| 10483 |
+
"step": 134200
|
| 10484 |
+
},
|
| 10485 |
+
{
|
| 10486 |
+
"epoch": 0.0036280622240579597,
|
| 10487 |
+
"grad_norm": 2.960287094116211,
|
| 10488 |
+
"learning_rate": 3.5239994149780645e-05,
|
| 10489 |
+
"loss": 3.6668,
|
| 10490 |
+
"step": 134300
|
| 10491 |
+
},
|
| 10492 |
+
{
|
| 10493 |
+
"epoch": 0.00390714393360088,
|
| 10494 |
+
"grad_norm": 3.181061267852783,
|
| 10495 |
+
"learning_rate": 3.521979214286417e-05,
|
| 10496 |
+
"loss": 3.6564,
|
| 10497 |
+
"step": 134400
|
| 10498 |
+
},
|
| 10499 |
+
{
|
| 10500 |
+
"epoch": 0.0041862256431437995,
|
| 10501 |
+
"grad_norm": 2.98903489112854,
|
| 10502 |
+
"learning_rate": 3.519958212034564e-05,
|
| 10503 |
+
"loss": 3.6662,
|
| 10504 |
+
"step": 134500
|
| 10505 |
+
},
|
| 10506 |
+
{
|
| 10507 |
+
"epoch": 0.004465307352686719,
|
| 10508 |
+
"grad_norm": 3.0481224060058594,
|
| 10509 |
+
"learning_rate": 3.5179364098076216e-05,
|
| 10510 |
+
"loss": 3.5675,
|
| 10511 |
+
"step": 134600
|
| 10512 |
+
},
|
| 10513 |
+
{
|
| 10514 |
+
"epoch": 0.00474438906222964,
|
| 10515 |
+
"grad_norm": 3.1568892002105713,
|
| 10516 |
+
"learning_rate": 3.5159138091913325e-05,
|
| 10517 |
+
"loss": 3.6681,
|
| 10518 |
+
"step": 134700
|
| 10519 |
+
},
|
| 10520 |
+
{
|
| 10521 |
+
"epoch": 0.005023470771772559,
|
| 10522 |
+
"grad_norm": 3.0683400630950928,
|
| 10523 |
+
"learning_rate": 3.5138904117720653e-05,
|
| 10524 |
+
"loss": 3.6584,
|
| 10525 |
+
"step": 134800
|
| 10526 |
+
},
|
| 10527 |
+
{
|
| 10528 |
+
"epoch": 0.00530255248131548,
|
| 10529 |
+
"grad_norm": 3.077857494354248,
|
| 10530 |
+
"learning_rate": 3.511866219136814e-05,
|
| 10531 |
+
"loss": 3.6734,
|
| 10532 |
+
"step": 134900
|
| 10533 |
+
},
|
| 10534 |
+
{
|
| 10535 |
+
"epoch": 0.0055816341908584,
|
| 10536 |
+
"grad_norm": 3.012407064437866,
|
| 10537 |
+
"learning_rate": 3.509841232873195e-05,
|
| 10538 |
+
"loss": 3.6649,
|
| 10539 |
+
"step": 135000
|
| 10540 |
+
},
|
| 10541 |
+
{
|
| 10542 |
+
"epoch": 0.0055816341908584,
|
| 10543 |
+
"eval_loss": 2.210810661315918,
|
| 10544 |
+
"eval_runtime": 51.4646,
|
| 10545 |
+
"eval_samples_per_second": 198.078,
|
| 10546 |
+
"eval_steps_per_second": 1.554,
|
| 10547 |
+
"step": 135000
|
| 10548 |
+
},
|
| 10549 |
+
{
|
| 10550 |
+
"epoch": 0.005860715900401319,
|
| 10551 |
+
"grad_norm": 3.164175033569336,
|
| 10552 |
+
"learning_rate": 3.507815454569451e-05,
|
| 10553 |
+
"loss": 3.6716,
|
| 10554 |
+
"step": 135100
|
| 10555 |
+
},
|
| 10556 |
+
{
|
| 10557 |
+
"epoch": 0.00613979760994424,
|
| 10558 |
+
"grad_norm": 3.1793689727783203,
|
| 10559 |
+
"learning_rate": 3.5057888858144416e-05,
|
| 10560 |
+
"loss": 3.643,
|
| 10561 |
+
"step": 135200
|
| 10562 |
+
},
|
| 10563 |
+
{
|
| 10564 |
+
"epoch": 0.0064188793194871595,
|
| 10565 |
+
"grad_norm": 3.0864334106445312,
|
| 10566 |
+
"learning_rate": 3.5037615281976495e-05,
|
| 10567 |
+
"loss": 3.6401,
|
| 10568 |
+
"step": 135300
|
| 10569 |
+
},
|
| 10570 |
+
{
|
| 10571 |
+
"epoch": 0.006697961029030079,
|
| 10572 |
+
"grad_norm": 3.1052911281585693,
|
| 10573 |
+
"learning_rate": 3.501733383309174e-05,
|
| 10574 |
+
"loss": 3.6583,
|
| 10575 |
+
"step": 135400
|
| 10576 |
+
},
|
| 10577 |
+
{
|
| 10578 |
+
"epoch": 0.006977042738573,
|
| 10579 |
+
"grad_norm": 3.1001381874084473,
|
| 10580 |
+
"learning_rate": 3.499704452739732e-05,
|
| 10581 |
+
"loss": 3.6582,
|
| 10582 |
+
"step": 135500
|
| 10583 |
+
},
|
| 10584 |
+
{
|
| 10585 |
+
"epoch": 0.0072561244481159195,
|
| 10586 |
+
"grad_norm": 3.0288331508636475,
|
| 10587 |
+
"learning_rate": 3.4976747380806574e-05,
|
| 10588 |
+
"loss": 3.6652,
|
| 10589 |
+
"step": 135600
|
| 10590 |
+
},
|
| 10591 |
+
{
|
| 10592 |
+
"epoch": 0.007535206157658839,
|
| 10593 |
+
"grad_norm": 2.998553991317749,
|
| 10594 |
+
"learning_rate": 3.4956442409238986e-05,
|
| 10595 |
+
"loss": 3.6602,
|
| 10596 |
+
"step": 135700
|
| 10597 |
+
},
|
| 10598 |
+
{
|
| 10599 |
+
"epoch": 0.00781428786720176,
|
| 10600 |
+
"grad_norm": 3.052278757095337,
|
| 10601 |
+
"learning_rate": 3.49361296286202e-05,
|
| 10602 |
+
"loss": 3.6572,
|
| 10603 |
+
"step": 135800
|
| 10604 |
+
},
|
| 10605 |
+
{
|
| 10606 |
+
"epoch": 0.00809336957674468,
|
| 10607 |
+
"grad_norm": 2.945789098739624,
|
| 10608 |
+
"learning_rate": 3.491580905488195e-05,
|
| 10609 |
+
"loss": 3.643,
|
| 10610 |
+
"step": 135900
|
| 10611 |
+
},
|
| 10612 |
+
{
|
| 10613 |
+
"epoch": 0.008372451286287599,
|
| 10614 |
+
"grad_norm": 2.888101577758789,
|
| 10615 |
+
"learning_rate": 3.48954807039621e-05,
|
| 10616 |
+
"loss": 3.6616,
|
| 10617 |
+
"step": 136000
|
| 10618 |
+
},
|
| 10619 |
+
{
|
| 10620 |
+
"epoch": 0.008372451286287599,
|
| 10621 |
+
"eval_loss": 2.203134298324585,
|
| 10622 |
+
"eval_runtime": 51.3238,
|
| 10623 |
+
"eval_samples_per_second": 198.621,
|
| 10624 |
+
"eval_steps_per_second": 1.559,
|
| 10625 |
+
"step": 136000
|
| 10626 |
+
},
|
| 10627 |
+
{
|
| 10628 |
+
"epoch": 0.008651532995830519,
|
| 10629 |
+
"grad_norm": 3.194450855255127,
|
| 10630 |
+
"learning_rate": 3.487514459180461e-05,
|
| 10631 |
+
"loss": 3.6524,
|
| 10632 |
+
"step": 136100
|
| 10633 |
+
},
|
| 10634 |
+
{
|
| 10635 |
+
"epoch": 0.008930614705373438,
|
| 10636 |
+
"grad_norm": 2.904616355895996,
|
| 10637 |
+
"learning_rate": 3.485480073435953e-05,
|
| 10638 |
+
"loss": 3.6361,
|
| 10639 |
+
"step": 136200
|
| 10640 |
+
},
|
| 10641 |
+
{
|
| 10642 |
+
"epoch": 0.00920969641491636,
|
| 10643 |
+
"grad_norm": 3.003732919692993,
|
| 10644 |
+
"learning_rate": 3.483444914758298e-05,
|
| 10645 |
+
"loss": 3.6364,
|
| 10646 |
+
"step": 136300
|
| 10647 |
+
},
|
| 10648 |
+
{
|
| 10649 |
+
"epoch": 0.00948877812445928,
|
| 10650 |
+
"grad_norm": 3.1838271617889404,
|
| 10651 |
+
"learning_rate": 3.481408984743716e-05,
|
| 10652 |
+
"loss": 3.6386,
|
| 10653 |
+
"step": 136400
|
| 10654 |
+
},
|
| 10655 |
+
{
|
| 10656 |
+
"epoch": 0.0097678598340022,
|
| 10657 |
+
"grad_norm": 3.0291504859924316,
|
| 10658 |
+
"learning_rate": 3.479372284989028e-05,
|
| 10659 |
+
"loss": 3.6795,
|
| 10660 |
+
"step": 136500
|
| 10661 |
+
},
|
| 10662 |
+
{
|
| 10663 |
+
"epoch": 0.010046941543545119,
|
| 10664 |
+
"grad_norm": 3.087804079055786,
|
| 10665 |
+
"learning_rate": 3.477334817091664e-05,
|
| 10666 |
+
"loss": 3.6999,
|
| 10667 |
+
"step": 136600
|
| 10668 |
+
},
|
| 10669 |
+
{
|
| 10670 |
+
"epoch": 0.010326023253088039,
|
| 10671 |
+
"grad_norm": 3.039384365081787,
|
| 10672 |
+
"learning_rate": 3.475296582649652e-05,
|
| 10673 |
+
"loss": 3.7043,
|
| 10674 |
+
"step": 136700
|
| 10675 |
+
},
|
| 10676 |
+
{
|
| 10677 |
+
"epoch": 0.01060510496263096,
|
| 10678 |
+
"grad_norm": 3.3074495792388916,
|
| 10679 |
+
"learning_rate": 3.4732575832616235e-05,
|
| 10680 |
+
"loss": 3.6944,
|
| 10681 |
+
"step": 136800
|
| 10682 |
+
},
|
| 10683 |
+
{
|
| 10684 |
+
"epoch": 0.01088418667217388,
|
| 10685 |
+
"grad_norm": 2.9702234268188477,
|
| 10686 |
+
"learning_rate": 3.471217820526808e-05,
|
| 10687 |
+
"loss": 3.7179,
|
| 10688 |
+
"step": 136900
|
| 10689 |
+
},
|
| 10690 |
+
{
|
| 10691 |
+
"epoch": 0.0111632683817168,
|
| 10692 |
+
"grad_norm": 3.0739312171936035,
|
| 10693 |
+
"learning_rate": 3.469177296045039e-05,
|
| 10694 |
+
"loss": 3.706,
|
| 10695 |
+
"step": 137000
|
| 10696 |
+
},
|
| 10697 |
+
{
|
| 10698 |
+
"epoch": 0.0111632683817168,
|
| 10699 |
+
"eval_loss": 2.2038073539733887,
|
| 10700 |
+
"eval_runtime": 51.4208,
|
| 10701 |
+
"eval_samples_per_second": 198.247,
|
| 10702 |
+
"eval_steps_per_second": 1.556,
|
| 10703 |
+
"step": 137000
|
| 10704 |
+
},
|
| 10705 |
+
{
|
| 10706 |
+
"epoch": 0.011442350091259719,
|
| 10707 |
+
"grad_norm": 3.1482625007629395,
|
| 10708 |
+
"learning_rate": 3.4671360114167395e-05,
|
| 10709 |
+
"loss": 3.6934,
|
| 10710 |
+
"step": 137100
|
| 10711 |
+
},
|
| 10712 |
+
{
|
| 10713 |
+
"epoch": 0.011721431800802639,
|
| 10714 |
+
"grad_norm": 3.0761804580688477,
|
| 10715 |
+
"learning_rate": 3.465093968242935e-05,
|
| 10716 |
+
"loss": 3.7073,
|
| 10717 |
+
"step": 137200
|
| 10718 |
+
},
|
| 10719 |
+
{
|
| 10720 |
+
"epoch": 0.012000513510345558,
|
| 10721 |
+
"grad_norm": 3.0399630069732666,
|
| 10722 |
+
"learning_rate": 3.463051168125243e-05,
|
| 10723 |
+
"loss": 3.6919,
|
| 10724 |
+
"step": 137300
|
| 10725 |
+
},
|
| 10726 |
+
{
|
| 10727 |
+
"epoch": 0.01227959521988848,
|
| 10728 |
+
"grad_norm": 2.898616075515747,
|
| 10729 |
+
"learning_rate": 3.4610076126658765e-05,
|
| 10730 |
+
"loss": 3.4094,
|
| 10731 |
+
"step": 137400
|
| 10732 |
+
},
|
| 10733 |
+
{
|
| 10734 |
+
"epoch": 0.0125586769294314,
|
| 10735 |
+
"grad_norm": 3.054553985595703,
|
| 10736 |
+
"learning_rate": 3.458963303467638e-05,
|
| 10737 |
+
"loss": 3.709,
|
| 10738 |
+
"step": 137500
|
| 10739 |
+
},
|
| 10740 |
+
{
|
| 10741 |
+
"epoch": 0.012837758638974319,
|
| 10742 |
+
"grad_norm": 3.0647337436676025,
|
| 10743 |
+
"learning_rate": 3.456918242133924e-05,
|
| 10744 |
+
"loss": 3.6935,
|
| 10745 |
+
"step": 137600
|
| 10746 |
+
},
|
| 10747 |
+
{
|
| 10748 |
+
"epoch": 0.013116840348517239,
|
| 10749 |
+
"grad_norm": 2.9094271659851074,
|
| 10750 |
+
"learning_rate": 3.45487243026872e-05,
|
| 10751 |
+
"loss": 3.709,
|
| 10752 |
+
"step": 137700
|
| 10753 |
+
},
|
| 10754 |
+
{
|
| 10755 |
+
"epoch": 0.013395922058060158,
|
| 10756 |
+
"grad_norm": 3.0624611377716064,
|
| 10757 |
+
"learning_rate": 3.4528258694766e-05,
|
| 10758 |
+
"loss": 3.7097,
|
| 10759 |
+
"step": 137800
|
| 10760 |
+
},
|
| 10761 |
+
{
|
| 10762 |
+
"epoch": 0.013675003767603078,
|
| 10763 |
+
"grad_norm": 3.080920696258545,
|
| 10764 |
+
"learning_rate": 3.4507785613627246e-05,
|
| 10765 |
+
"loss": 3.7166,
|
| 10766 |
+
"step": 137900
|
| 10767 |
+
},
|
| 10768 |
+
{
|
| 10769 |
+
"epoch": 0.013954085477146,
|
| 10770 |
+
"grad_norm": 3.08603835105896,
|
| 10771 |
+
"learning_rate": 3.4487305075328434e-05,
|
| 10772 |
+
"loss": 3.6971,
|
| 10773 |
+
"step": 138000
|
| 10774 |
+
},
|
| 10775 |
+
{
|
| 10776 |
+
"epoch": 0.013954085477146,
|
| 10777 |
+
"eval_loss": 2.199204683303833,
|
| 10778 |
+
"eval_runtime": 51.4121,
|
| 10779 |
+
"eval_samples_per_second": 198.28,
|
| 10780 |
+
"eval_steps_per_second": 1.556,
|
| 10781 |
+
"step": 138000
|
| 10782 |
+
},
|
| 10783 |
+
{
|
| 10784 |
+
"epoch": 0.01423316718668892,
|
| 10785 |
+
"grad_norm": 2.937602996826172,
|
| 10786 |
+
"learning_rate": 3.446681709593288e-05,
|
| 10787 |
+
"loss": 3.6892,
|
| 10788 |
+
"step": 138100
|
| 10789 |
+
},
|
| 10790 |
+
{
|
| 10791 |
+
"epoch": 0.014512248896231839,
|
| 10792 |
+
"grad_norm": 3.0739290714263916,
|
| 10793 |
+
"learning_rate": 3.444632169150974e-05,
|
| 10794 |
+
"loss": 3.6923,
|
| 10795 |
+
"step": 138200
|
| 10796 |
+
},
|
| 10797 |
+
{
|
| 10798 |
+
"epoch": 0.014791330605774759,
|
| 10799 |
+
"grad_norm": 3.076711654663086,
|
| 10800 |
+
"learning_rate": 3.4425818878134006e-05,
|
| 10801 |
+
"loss": 3.6838,
|
| 10802 |
+
"step": 138300
|
| 10803 |
+
},
|
| 10804 |
+
{
|
| 10805 |
+
"epoch": 0.015070412315317678,
|
| 10806 |
+
"grad_norm": 3.1818246841430664,
|
| 10807 |
+
"learning_rate": 3.4405308671886465e-05,
|
| 10808 |
+
"loss": 3.7162,
|
| 10809 |
+
"step": 138400
|
| 10810 |
+
},
|
| 10811 |
+
{
|
| 10812 |
+
"epoch": 0.015349494024860598,
|
| 10813 |
+
"grad_norm": 3.0970702171325684,
|
| 10814 |
+
"learning_rate": 3.438479108885372e-05,
|
| 10815 |
+
"loss": 3.6906,
|
| 10816 |
+
"step": 138500
|
| 10817 |
+
},
|
| 10818 |
+
{
|
| 10819 |
+
"epoch": 0.01562857573440352,
|
| 10820 |
+
"grad_norm": 2.924048662185669,
|
| 10821 |
+
"learning_rate": 3.436426614512815e-05,
|
| 10822 |
+
"loss": 3.688,
|
| 10823 |
+
"step": 138600
|
| 10824 |
+
},
|
| 10825 |
+
{
|
| 10826 |
+
"epoch": 0.015907657443946437,
|
| 10827 |
+
"grad_norm": 3.151340961456299,
|
| 10828 |
+
"learning_rate": 3.434373385680791e-05,
|
| 10829 |
+
"loss": 3.6952,
|
| 10830 |
+
"step": 138700
|
| 10831 |
+
},
|
| 10832 |
+
{
|
| 10833 |
+
"epoch": 0.01618673915348936,
|
| 10834 |
+
"grad_norm": 3.1132941246032715,
|
| 10835 |
+
"learning_rate": 3.4323194239996906e-05,
|
| 10836 |
+
"loss": 3.6774,
|
| 10837 |
+
"step": 138800
|
| 10838 |
+
},
|
| 10839 |
+
{
|
| 10840 |
+
"epoch": 0.01646582086303228,
|
| 10841 |
+
"grad_norm": 3.077354907989502,
|
| 10842 |
+
"learning_rate": 3.43026473108048e-05,
|
| 10843 |
+
"loss": 3.7007,
|
| 10844 |
+
"step": 138900
|
| 10845 |
+
},
|
| 10846 |
+
{
|
| 10847 |
+
"epoch": 0.016744902572575198,
|
| 10848 |
+
"grad_norm": 3.079162120819092,
|
| 10849 |
+
"learning_rate": 3.4282093085347e-05,
|
| 10850 |
+
"loss": 3.6982,
|
| 10851 |
+
"step": 139000
|
| 10852 |
+
},
|
| 10853 |
+
{
|
| 10854 |
+
"epoch": 0.016744902572575198,
|
| 10855 |
+
"eval_loss": 2.199798107147217,
|
| 10856 |
+
"eval_runtime": 51.6229,
|
| 10857 |
+
"eval_samples_per_second": 197.47,
|
| 10858 |
+
"eval_steps_per_second": 1.55,
|
| 10859 |
+
"step": 139000
|
| 10860 |
+
},
|
| 10861 |
+
{
|
| 10862 |
+
"epoch": 0.01702398428211812,
|
| 10863 |
+
"grad_norm": 3.0697269439697266,
|
| 10864 |
+
"learning_rate": 3.426153157974462e-05,
|
| 10865 |
+
"loss": 3.6903,
|
| 10866 |
+
"step": 139100
|
| 10867 |
+
},
|
| 10868 |
+
{
|
| 10869 |
+
"epoch": 0.017303065991661037,
|
| 10870 |
+
"grad_norm": 2.9596915245056152,
|
| 10871 |
+
"learning_rate": 3.4240962810124485e-05,
|
| 10872 |
+
"loss": 3.6961,
|
| 10873 |
+
"step": 139200
|
| 10874 |
+
},
|
| 10875 |
+
{
|
| 10876 |
+
"epoch": 0.01758214770120396,
|
| 10877 |
+
"grad_norm": 3.1975579261779785,
|
| 10878 |
+
"learning_rate": 3.4220386792619134e-05,
|
| 10879 |
+
"loss": 3.6893,
|
| 10880 |
+
"step": 139300
|
| 10881 |
+
},
|
| 10882 |
+
{
|
| 10883 |
+
"epoch": 0.017861229410746877,
|
| 10884 |
+
"grad_norm": 3.0534658432006836,
|
| 10885 |
+
"learning_rate": 3.419980354336677e-05,
|
| 10886 |
+
"loss": 3.6867,
|
| 10887 |
+
"step": 139400
|
| 10888 |
+
},
|
| 10889 |
+
{
|
| 10890 |
+
"epoch": 0.018140311120289798,
|
| 10891 |
+
"grad_norm": 3.014533519744873,
|
| 10892 |
+
"learning_rate": 3.4179213078511276e-05,
|
| 10893 |
+
"loss": 3.6807,
|
| 10894 |
+
"step": 139500
|
| 10895 |
+
},
|
| 10896 |
+
{
|
| 10897 |
+
"epoch": 0.01841939282983272,
|
| 10898 |
+
"grad_norm": 3.1853954792022705,
|
| 10899 |
+
"learning_rate": 3.415861541420219e-05,
|
| 10900 |
+
"loss": 3.6836,
|
| 10901 |
+
"step": 139600
|
| 10902 |
+
},
|
| 10903 |
+
{
|
| 10904 |
+
"epoch": 0.018698474539375638,
|
| 10905 |
+
"grad_norm": 2.994035482406616,
|
| 10906 |
+
"learning_rate": 3.413801056659471e-05,
|
| 10907 |
+
"loss": 3.6843,
|
| 10908 |
+
"step": 139700
|
| 10909 |
+
},
|
| 10910 |
+
{
|
| 10911 |
+
"epoch": 0.01897755624891856,
|
| 10912 |
+
"grad_norm": 3.01277232170105,
|
| 10913 |
+
"learning_rate": 3.411739855184966e-05,
|
| 10914 |
+
"loss": 3.6875,
|
| 10915 |
+
"step": 139800
|
| 10916 |
+
},
|
| 10917 |
+
{
|
| 10918 |
+
"epoch": 0.019256637958461477,
|
| 10919 |
+
"grad_norm": 2.956777334213257,
|
| 10920 |
+
"learning_rate": 3.409677938613348e-05,
|
| 10921 |
+
"loss": 3.6708,
|
| 10922 |
+
"step": 139900
|
| 10923 |
+
},
|
| 10924 |
+
{
|
| 10925 |
+
"epoch": 0.0195357196680044,
|
| 10926 |
+
"grad_norm": 3.136683225631714,
|
| 10927 |
+
"learning_rate": 3.407615308561822e-05,
|
| 10928 |
+
"loss": 3.6853,
|
| 10929 |
+
"step": 140000
|
| 10930 |
+
},
|
| 10931 |
+
{
|
| 10932 |
+
"epoch": 0.0195357196680044,
|
| 10933 |
+
"eval_loss": 2.1988320350646973,
|
| 10934 |
+
"eval_runtime": 51.6562,
|
| 10935 |
+
"eval_samples_per_second": 197.343,
|
| 10936 |
+
"eval_steps_per_second": 1.549,
|
| 10937 |
+
"step": 140000
|
| 10938 |
+
},
|
| 10939 |
+
{
|
| 10940 |
+
"epoch": 0.01981480137754732,
|
| 10941 |
+
"grad_norm": 3.045262575149536,
|
| 10942 |
+
"learning_rate": 3.405551966648155e-05,
|
| 10943 |
+
"loss": 3.6856,
|
| 10944 |
+
"step": 140100
|
| 10945 |
+
},
|
| 10946 |
+
{
|
| 10947 |
+
"epoch": 0.020093883087090238,
|
| 10948 |
+
"grad_norm": 3.14791202545166,
|
| 10949 |
+
"learning_rate": 3.4034879144906674e-05,
|
| 10950 |
+
"loss": 3.6802,
|
| 10951 |
+
"step": 140200
|
| 10952 |
+
},
|
| 10953 |
+
{
|
| 10954 |
+
"epoch": 0.02037296479663316,
|
| 10955 |
+
"grad_norm": 2.9139297008514404,
|
| 10956 |
+
"learning_rate": 3.401423153708242e-05,
|
| 10957 |
+
"loss": 3.6717,
|
| 10958 |
+
"step": 140300
|
| 10959 |
+
},
|
| 10960 |
+
{
|
| 10961 |
+
"epoch": 0.020652046506176077,
|
| 10962 |
+
"grad_norm": 3.0042569637298584,
|
| 10963 |
+
"learning_rate": 3.399357685920314e-05,
|
| 10964 |
+
"loss": 3.6752,
|
| 10965 |
+
"step": 140400
|
| 10966 |
+
},
|
| 10967 |
+
{
|
| 10968 |
+
"epoch": 0.020931128215719,
|
| 10969 |
+
"grad_norm": 3.045513153076172,
|
| 10970 |
+
"learning_rate": 3.397291512746873e-05,
|
| 10971 |
+
"loss": 3.6921,
|
| 10972 |
+
"step": 140500
|
| 10973 |
+
},
|
| 10974 |
+
{
|
| 10975 |
+
"epoch": 0.02121020992526192,
|
| 10976 |
+
"grad_norm": 3.0931684970855713,
|
| 10977 |
+
"learning_rate": 3.3952246358084645e-05,
|
| 10978 |
+
"loss": 3.6733,
|
| 10979 |
+
"step": 140600
|
| 10980 |
+
},
|
| 10981 |
+
{
|
| 10982 |
+
"epoch": 0.021489291634804838,
|
| 10983 |
+
"grad_norm": 3.1561226844787598,
|
| 10984 |
+
"learning_rate": 3.393157056726184e-05,
|
| 10985 |
+
"loss": 3.6702,
|
| 10986 |
+
"step": 140700
|
| 10987 |
+
},
|
| 10988 |
+
{
|
| 10989 |
+
"epoch": 0.02176837334434776,
|
| 10990 |
+
"grad_norm": 3.267413854598999,
|
| 10991 |
+
"learning_rate": 3.391088777121678e-05,
|
| 10992 |
+
"loss": 3.6848,
|
| 10993 |
+
"step": 140800
|
| 10994 |
+
},
|
| 10995 |
+
{
|
| 10996 |
+
"epoch": 0.022047455053890677,
|
| 10997 |
+
"grad_norm": 2.9542646408081055,
|
| 10998 |
+
"learning_rate": 3.3890197986171426e-05,
|
| 10999 |
+
"loss": 3.668,
|
| 11000 |
+
"step": 140900
|
| 11001 |
+
},
|
| 11002 |
+
{
|
| 11003 |
+
"epoch": 0.0223265367634336,
|
| 11004 |
+
"grad_norm": 3.1285250186920166,
|
| 11005 |
+
"learning_rate": 3.386950122835321e-05,
|
| 11006 |
+
"loss": 3.6633,
|
| 11007 |
+
"step": 141000
|
| 11008 |
+
},
|
| 11009 |
+
{
|
| 11010 |
+
"epoch": 0.0223265367634336,
|
| 11011 |
+
"eval_loss": 2.194960355758667,
|
| 11012 |
+
"eval_runtime": 51.7975,
|
| 11013 |
+
"eval_samples_per_second": 196.805,
|
| 11014 |
+
"eval_steps_per_second": 1.544,
|
| 11015 |
+
"step": 141000
|
| 11016 |
+
},
|
| 11017 |
+
{
|
| 11018 |
+
"epoch": 0.022605618472976517,
|
| 11019 |
+
"grad_norm": 3.117668628692627,
|
| 11020 |
+
"learning_rate": 3.3848797513995054e-05,
|
| 11021 |
+
"loss": 3.6846,
|
| 11022 |
+
"step": 141100
|
| 11023 |
+
},
|
| 11024 |
+
{
|
| 11025 |
+
"epoch": 0.022884700182519438,
|
| 11026 |
+
"grad_norm": 3.2943692207336426,
|
| 11027 |
+
"learning_rate": 3.3828086859335326e-05,
|
| 11028 |
+
"loss": 3.6798,
|
| 11029 |
+
"step": 141200
|
| 11030 |
+
},
|
| 11031 |
+
{
|
| 11032 |
+
"epoch": 0.02316378189206236,
|
| 11033 |
+
"grad_norm": 3.078214645385742,
|
| 11034 |
+
"learning_rate": 3.3807369280617834e-05,
|
| 11035 |
+
"loss": 3.6393,
|
| 11036 |
+
"step": 141300
|
| 11037 |
+
},
|
| 11038 |
+
{
|
| 11039 |
+
"epoch": 0.023442863601605277,
|
| 11040 |
+
"grad_norm": 2.948484182357788,
|
| 11041 |
+
"learning_rate": 3.3786644794091816e-05,
|
| 11042 |
+
"loss": 3.6748,
|
| 11043 |
+
"step": 141400
|
| 11044 |
+
},
|
| 11045 |
+
{
|
| 11046 |
+
"epoch": 0.0237219453111482,
|
| 11047 |
+
"grad_norm": 3.035909652709961,
|
| 11048 |
+
"learning_rate": 3.3765913416011935e-05,
|
| 11049 |
+
"loss": 3.6745,
|
| 11050 |
+
"step": 141500
|
| 11051 |
+
},
|
| 11052 |
+
{
|
| 11053 |
+
"epoch": 0.024001027020691117,
|
| 11054 |
+
"grad_norm": 2.9795923233032227,
|
| 11055 |
+
"learning_rate": 3.374517516263824e-05,
|
| 11056 |
+
"loss": 3.6788,
|
| 11057 |
+
"step": 141600
|
| 11058 |
+
},
|
| 11059 |
+
{
|
| 11060 |
+
"epoch": 0.024280108730234038,
|
| 11061 |
+
"grad_norm": 3.133145809173584,
|
| 11062 |
+
"learning_rate": 3.372443005023622e-05,
|
| 11063 |
+
"loss": 3.6672,
|
| 11064 |
+
"step": 141700
|
| 11065 |
+
},
|
| 11066 |
+
{
|
| 11067 |
+
"epoch": 0.02455919043977696,
|
| 11068 |
+
"grad_norm": 3.067864179611206,
|
| 11069 |
+
"learning_rate": 3.370367809507668e-05,
|
| 11070 |
+
"loss": 3.6433,
|
| 11071 |
+
"step": 141800
|
| 11072 |
+
},
|
| 11073 |
+
{
|
| 11074 |
+
"epoch": 0.024838272149319877,
|
| 11075 |
+
"grad_norm": 2.9287595748901367,
|
| 11076 |
+
"learning_rate": 3.3682919313435836e-05,
|
| 11077 |
+
"loss": 3.6574,
|
| 11078 |
+
"step": 141900
|
| 11079 |
+
},
|
| 11080 |
+
{
|
| 11081 |
+
"epoch": 0.0251173538588628,
|
| 11082 |
+
"grad_norm": 2.9526288509368896,
|
| 11083 |
+
"learning_rate": 3.3662153721595244e-05,
|
| 11084 |
+
"loss": 3.658,
|
| 11085 |
+
"step": 142000
|
| 11086 |
+
},
|
| 11087 |
+
{
|
| 11088 |
+
"epoch": 0.0251173538588628,
|
| 11089 |
+
"eval_loss": 2.188488483428955,
|
| 11090 |
+
"eval_runtime": 51.6355,
|
| 11091 |
+
"eval_samples_per_second": 197.422,
|
| 11092 |
+
"eval_steps_per_second": 1.549,
|
| 11093 |
+
"step": 142000
|
| 11094 |
+
},
|
| 11095 |
+
{
|
| 11096 |
+
"epoch": 0.025396435568405717,
|
| 11097 |
+
"grad_norm": 3.129251718521118,
|
| 11098 |
+
"learning_rate": 3.36413813358418e-05,
|
| 11099 |
+
"loss": 3.6714,
|
| 11100 |
+
"step": 142100
|
| 11101 |
+
},
|
| 11102 |
+
{
|
| 11103 |
+
"epoch": 0.025675517277948638,
|
| 11104 |
+
"grad_norm": 3.0163700580596924,
|
| 11105 |
+
"learning_rate": 3.362060217246775e-05,
|
| 11106 |
+
"loss": 3.662,
|
| 11107 |
+
"step": 142200
|
| 11108 |
+
},
|
| 11109 |
+
{
|
| 11110 |
+
"epoch": 0.025954598987491556,
|
| 11111 |
+
"grad_norm": 3.160065174102783,
|
| 11112 |
+
"learning_rate": 3.359981624777061e-05,
|
| 11113 |
+
"loss": 3.6398,
|
| 11114 |
+
"step": 142300
|
| 11115 |
+
},
|
| 11116 |
+
{
|
| 11117 |
+
"epoch": 0.026233680697034478,
|
| 11118 |
+
"grad_norm": 3.0241780281066895,
|
| 11119 |
+
"learning_rate": 3.3579023578053245e-05,
|
| 11120 |
+
"loss": 3.6516,
|
| 11121 |
+
"step": 142400
|
| 11122 |
+
},
|
| 11123 |
+
{
|
| 11124 |
+
"epoch": 0.0265127624065774,
|
| 11125 |
+
"grad_norm": 2.963665008544922,
|
| 11126 |
+
"learning_rate": 3.355822417962378e-05,
|
| 11127 |
+
"loss": 3.6691,
|
| 11128 |
+
"step": 142500
|
| 11129 |
+
},
|
| 11130 |
+
{
|
| 11131 |
+
"epoch": 0.026791844116120317,
|
| 11132 |
+
"grad_norm": 3.1197776794433594,
|
| 11133 |
+
"learning_rate": 3.3537418068795634e-05,
|
| 11134 |
+
"loss": 3.6647,
|
| 11135 |
+
"step": 142600
|
| 11136 |
+
},
|
| 11137 |
+
{
|
| 11138 |
+
"epoch": 0.02707092582566324,
|
| 11139 |
+
"grad_norm": 3.207745313644409,
|
| 11140 |
+
"learning_rate": 3.3516605261887494e-05,
|
| 11141 |
+
"loss": 3.6587,
|
| 11142 |
+
"step": 142700
|
| 11143 |
+
},
|
| 11144 |
+
{
|
| 11145 |
+
"epoch": 0.027350007535206156,
|
| 11146 |
+
"grad_norm": 3.1258158683776855,
|
| 11147 |
+
"learning_rate": 3.3495785775223274e-05,
|
| 11148 |
+
"loss": 3.6582,
|
| 11149 |
+
"step": 142800
|
| 11150 |
+
},
|
| 11151 |
+
{
|
| 11152 |
+
"epoch": 0.027629089244749078,
|
| 11153 |
+
"grad_norm": 3.1733665466308594,
|
| 11154 |
+
"learning_rate": 3.347495962513215e-05,
|
| 11155 |
+
"loss": 3.6611,
|
| 11156 |
+
"step": 142900
|
| 11157 |
+
},
|
| 11158 |
+
{
|
| 11159 |
+
"epoch": 0.027908170954292,
|
| 11160 |
+
"grad_norm": 3.0605976581573486,
|
| 11161 |
+
"learning_rate": 3.345412682794853e-05,
|
| 11162 |
+
"loss": 3.6533,
|
| 11163 |
+
"step": 143000
|
| 11164 |
+
},
|
| 11165 |
+
{
|
| 11166 |
+
"epoch": 0.027908170954292,
|
| 11167 |
+
"eval_loss": 2.192350149154663,
|
| 11168 |
+
"eval_runtime": 51.7936,
|
| 11169 |
+
"eval_samples_per_second": 196.82,
|
| 11170 |
+
"eval_steps_per_second": 1.545,
|
| 11171 |
+
"step": 143000
|
| 11172 |
+
},
|
| 11173 |
+
{
|
| 11174 |
+
"epoch": 0.028187252663834917,
|
| 11175 |
+
"grad_norm": 3.114699363708496,
|
| 11176 |
+
"learning_rate": 3.3433287400012e-05,
|
| 11177 |
+
"loss": 3.637,
|
| 11178 |
+
"step": 143100
|
| 11179 |
+
},
|
| 11180 |
+
{
|
| 11181 |
+
"epoch": 0.02846633437337784,
|
| 11182 |
+
"grad_norm": 3.0836739540100098,
|
| 11183 |
+
"learning_rate": 3.34124413576674e-05,
|
| 11184 |
+
"loss": 3.6546,
|
| 11185 |
+
"step": 143200
|
| 11186 |
+
},
|
| 11187 |
+
{
|
| 11188 |
+
"epoch": 0.028745416082920756,
|
| 11189 |
+
"grad_norm": 3.009408950805664,
|
| 11190 |
+
"learning_rate": 3.33915887172647e-05,
|
| 11191 |
+
"loss": 3.6605,
|
| 11192 |
+
"step": 143300
|
| 11193 |
+
},
|
| 11194 |
+
{
|
| 11195 |
+
"epoch": 0.029024497792463678,
|
| 11196 |
+
"grad_norm": 3.173821210861206,
|
| 11197 |
+
"learning_rate": 3.337072949515909e-05,
|
| 11198 |
+
"loss": 3.6607,
|
| 11199 |
+
"step": 143400
|
| 11200 |
+
},
|
| 11201 |
+
{
|
| 11202 |
+
"epoch": 0.0293035795020066,
|
| 11203 |
+
"grad_norm": 3.0830142498016357,
|
| 11204 |
+
"learning_rate": 3.334986370771089e-05,
|
| 11205 |
+
"loss": 3.6414,
|
| 11206 |
+
"step": 143500
|
| 11207 |
+
},
|
| 11208 |
+
{
|
| 11209 |
+
"epoch": 0.029582661211549517,
|
| 11210 |
+
"grad_norm": 3.06382155418396,
|
| 11211 |
+
"learning_rate": 3.3328991371285604e-05,
|
| 11212 |
+
"loss": 3.6384,
|
| 11213 |
+
"step": 143600
|
| 11214 |
+
},
|
| 11215 |
+
{
|
| 11216 |
+
"epoch": 0.02986174292109244,
|
| 11217 |
+
"grad_norm": 3.039879083633423,
|
| 11218 |
+
"learning_rate": 3.3308112502253844e-05,
|
| 11219 |
+
"loss": 3.6414,
|
| 11220 |
+
"step": 143700
|
| 11221 |
+
},
|
| 11222 |
+
{
|
| 11223 |
+
"epoch": 0.030140824630635357,
|
| 11224 |
+
"grad_norm": 3.145960807800293,
|
| 11225 |
+
"learning_rate": 3.3287227116991346e-05,
|
| 11226 |
+
"loss": 3.6554,
|
| 11227 |
+
"step": 143800
|
| 11228 |
+
},
|
| 11229 |
+
{
|
| 11230 |
+
"epoch": 0.030419906340178278,
|
| 11231 |
+
"grad_norm": 2.9724478721618652,
|
| 11232 |
+
"learning_rate": 3.326633523187897e-05,
|
| 11233 |
+
"loss": 3.6537,
|
| 11234 |
+
"step": 143900
|
| 11235 |
+
},
|
| 11236 |
+
{
|
| 11237 |
+
"epoch": 0.030698988049721196,
|
| 11238 |
+
"grad_norm": 3.0227105617523193,
|
| 11239 |
+
"learning_rate": 3.324543686330268e-05,
|
| 11240 |
+
"loss": 3.6496,
|
| 11241 |
+
"step": 144000
|
| 11242 |
+
},
|
| 11243 |
+
{
|
| 11244 |
+
"epoch": 0.030698988049721196,
|
| 11245 |
+
"eval_loss": 2.199296236038208,
|
| 11246 |
+
"eval_runtime": 51.7247,
|
| 11247 |
+
"eval_samples_per_second": 197.082,
|
| 11248 |
+
"eval_steps_per_second": 1.547,
|
| 11249 |
+
"step": 144000
|
| 11250 |
+
},
|
| 11251 |
+
{
|
| 11252 |
+
"epoch": 0.030978069759264117,
|
| 11253 |
+
"grad_norm": 3.1452724933624268,
|
| 11254 |
+
"learning_rate": 3.3224532027653506e-05,
|
| 11255 |
+
"loss": 3.6534,
|
| 11256 |
+
"step": 144100
|
| 11257 |
+
},
|
| 11258 |
+
{
|
| 11259 |
+
"epoch": 0.03125715146880704,
|
| 11260 |
+
"grad_norm": 3.0349013805389404,
|
| 11261 |
+
"learning_rate": 3.3203620741327555e-05,
|
| 11262 |
+
"loss": 3.6355,
|
| 11263 |
+
"step": 144200
|
| 11264 |
+
},
|
| 11265 |
+
{
|
| 11266 |
+
"epoch": 0.03153623317834996,
|
| 11267 |
+
"grad_norm": 2.9786078929901123,
|
| 11268 |
+
"learning_rate": 3.3182703020726e-05,
|
| 11269 |
+
"loss": 3.6582,
|
| 11270 |
+
"step": 144300
|
| 11271 |
+
},
|
| 11272 |
+
{
|
| 11273 |
+
"epoch": 0.031815314887892875,
|
| 11274 |
+
"grad_norm": 3.17039155960083,
|
| 11275 |
+
"learning_rate": 3.316177888225506e-05,
|
| 11276 |
+
"loss": 3.6421,
|
| 11277 |
+
"step": 144400
|
| 11278 |
+
},
|
| 11279 |
+
{
|
| 11280 |
+
"epoch": 0.0320943965974358,
|
| 11281 |
+
"grad_norm": 2.8847384452819824,
|
| 11282 |
+
"learning_rate": 3.3140848342325985e-05,
|
| 11283 |
+
"loss": 3.6547,
|
| 11284 |
+
"step": 144500
|
| 11285 |
+
},
|
| 11286 |
+
{
|
| 11287 |
+
"epoch": 0.03237347830697872,
|
| 11288 |
+
"grad_norm": 2.9963126182556152,
|
| 11289 |
+
"learning_rate": 3.3119911417355045e-05,
|
| 11290 |
+
"loss": 3.6473,
|
| 11291 |
+
"step": 144600
|
| 11292 |
+
},
|
| 11293 |
+
{
|
| 11294 |
+
"epoch": 0.032652560016521635,
|
| 11295 |
+
"grad_norm": 3.042747974395752,
|
| 11296 |
+
"learning_rate": 3.309896812376353e-05,
|
| 11297 |
+
"loss": 3.6501,
|
| 11298 |
+
"step": 144700
|
| 11299 |
+
},
|
| 11300 |
+
{
|
| 11301 |
+
"epoch": 0.03293164172606456,
|
| 11302 |
+
"grad_norm": 3.0630815029144287,
|
| 11303 |
+
"learning_rate": 3.307801847797769e-05,
|
| 11304 |
+
"loss": 3.6571,
|
| 11305 |
+
"step": 144800
|
| 11306 |
+
},
|
| 11307 |
+
{
|
| 11308 |
+
"epoch": 0.03321072343560748,
|
| 11309 |
+
"grad_norm": 3.174445390701294,
|
| 11310 |
+
"learning_rate": 3.30570624964288e-05,
|
| 11311 |
+
"loss": 3.6574,
|
| 11312 |
+
"step": 144900
|
| 11313 |
+
},
|
| 11314 |
+
{
|
| 11315 |
+
"epoch": 0.033489805145150396,
|
| 11316 |
+
"grad_norm": 3.1276638507843018,
|
| 11317 |
+
"learning_rate": 3.3036100195553074e-05,
|
| 11318 |
+
"loss": 3.6543,
|
| 11319 |
+
"step": 145000
|
| 11320 |
+
},
|
| 11321 |
+
{
|
| 11322 |
+
"epoch": 0.033489805145150396,
|
| 11323 |
+
"eval_loss": 2.1886751651763916,
|
| 11324 |
+
"eval_runtime": 51.7134,
|
| 11325 |
+
"eval_samples_per_second": 197.125,
|
| 11326 |
+
"eval_steps_per_second": 1.547,
|
| 11327 |
+
"step": 145000
|
| 11328 |
+
},
|
| 11329 |
+
{
|
| 11330 |
+
"epoch": 0.033768886854693314,
|
| 11331 |
+
"grad_norm": 2.9898717403411865,
|
| 11332 |
+
"learning_rate": 3.3015131591791705e-05,
|
| 11333 |
+
"loss": 3.6664,
|
| 11334 |
+
"step": 145100
|
| 11335 |
+
},
|
| 11336 |
+
{
|
| 11337 |
+
"epoch": 0.03404796856423624,
|
| 11338 |
+
"grad_norm": 3.03507137298584,
|
| 11339 |
+
"learning_rate": 3.2994156701590813e-05,
|
| 11340 |
+
"loss": 3.6707,
|
| 11341 |
+
"step": 145200
|
| 11342 |
+
},
|
| 11343 |
+
{
|
| 11344 |
+
"epoch": 0.03432705027377916,
|
| 11345 |
+
"grad_norm": 2.995556354522705,
|
| 11346 |
+
"learning_rate": 3.297317554140146e-05,
|
| 11347 |
+
"loss": 3.6656,
|
| 11348 |
+
"step": 145300
|
| 11349 |
+
},
|
| 11350 |
+
{
|
| 11351 |
+
"epoch": 0.034606131983322075,
|
| 11352 |
+
"grad_norm": 3.1159939765930176,
|
| 11353 |
+
"learning_rate": 3.295218812767961e-05,
|
| 11354 |
+
"loss": 3.6558,
|
| 11355 |
+
"step": 145400
|
| 11356 |
+
},
|
| 11357 |
+
{
|
| 11358 |
+
"epoch": 0.034885213692865,
|
| 11359 |
+
"grad_norm": 2.9724996089935303,
|
| 11360 |
+
"learning_rate": 3.293119447688615e-05,
|
| 11361 |
+
"loss": 3.6455,
|
| 11362 |
+
"step": 145500
|
| 11363 |
+
},
|
| 11364 |
+
{
|
| 11365 |
+
"epoch": 0.03516429540240792,
|
| 11366 |
+
"grad_norm": 3.123499631881714,
|
| 11367 |
+
"learning_rate": 3.291019460548684e-05,
|
| 11368 |
+
"loss": 3.6437,
|
| 11369 |
+
"step": 145600
|
| 11370 |
+
},
|
| 11371 |
+
{
|
| 11372 |
+
"epoch": 0.035443377111950836,
|
| 11373 |
+
"grad_norm": 3.0609242916107178,
|
| 11374 |
+
"learning_rate": 3.2889188529952334e-05,
|
| 11375 |
+
"loss": 3.6567,
|
| 11376 |
+
"step": 145700
|
| 11377 |
+
},
|
| 11378 |
+
{
|
| 11379 |
+
"epoch": 0.035722458821493754,
|
| 11380 |
+
"grad_norm": 3.1065030097961426,
|
| 11381 |
+
"learning_rate": 3.286817626675815e-05,
|
| 11382 |
+
"loss": 3.6503,
|
| 11383 |
+
"step": 145800
|
| 11384 |
+
},
|
| 11385 |
+
{
|
| 11386 |
+
"epoch": 0.03600154053103668,
|
| 11387 |
+
"grad_norm": 3.0567867755889893,
|
| 11388 |
+
"learning_rate": 3.284715783238466e-05,
|
| 11389 |
+
"loss": 3.6493,
|
| 11390 |
+
"step": 145900
|
| 11391 |
+
},
|
| 11392 |
+
{
|
| 11393 |
+
"epoch": 0.036280622240579596,
|
| 11394 |
+
"grad_norm": 2.944715738296509,
|
| 11395 |
+
"learning_rate": 3.282613324331707e-05,
|
| 11396 |
+
"loss": 3.663,
|
| 11397 |
+
"step": 146000
|
| 11398 |
+
},
|
| 11399 |
+
{
|
| 11400 |
+
"epoch": 0.036280622240579596,
|
| 11401 |
+
"eval_loss": 2.181875467300415,
|
| 11402 |
+
"eval_runtime": 51.9227,
|
| 11403 |
+
"eval_samples_per_second": 196.33,
|
| 11404 |
+
"eval_steps_per_second": 1.541,
|
| 11405 |
+
"step": 146000
|
| 11406 |
+
},
|
| 11407 |
+
{
|
| 11408 |
+
"epoch": 0.00027908170954291995,
|
| 11409 |
+
"grad_norm": 3.1367740631103516,
|
| 11410 |
+
"learning_rate": 3.280510251604541e-05,
|
| 11411 |
+
"loss": 3.6419,
|
| 11412 |
+
"step": 146100
|
| 11413 |
+
},
|
| 11414 |
+
{
|
| 11415 |
+
"epoch": 0.0005581634190858399,
|
| 11416 |
+
"grad_norm": 3.077601671218872,
|
| 11417 |
+
"learning_rate": 3.2784065667064536e-05,
|
| 11418 |
+
"loss": 3.661,
|
| 11419 |
+
"step": 146200
|
| 11420 |
+
},
|
| 11421 |
+
{
|
| 11422 |
+
"epoch": 0.0008372451286287599,
|
| 11423 |
+
"grad_norm": 3.0808331966400146,
|
| 11424 |
+
"learning_rate": 3.2763022712874094e-05,
|
| 11425 |
+
"loss": 3.6409,
|
| 11426 |
+
"step": 146300
|
| 11427 |
+
},
|
| 11428 |
+
{
|
| 11429 |
+
"epoch": 0.0011163268381716798,
|
| 11430 |
+
"grad_norm": 2.791093111038208,
|
| 11431 |
+
"learning_rate": 3.274197366997852e-05,
|
| 11432 |
+
"loss": 3.6515,
|
| 11433 |
+
"step": 146400
|
| 11434 |
+
},
|
| 11435 |
+
{
|
| 11436 |
+
"epoch": 0.0013954085477146,
|
| 11437 |
+
"grad_norm": 3.005890369415283,
|
| 11438 |
+
"learning_rate": 3.272091855488705e-05,
|
| 11439 |
+
"loss": 3.6402,
|
| 11440 |
+
"step": 146500
|
| 11441 |
+
},
|
| 11442 |
+
{
|
| 11443 |
+
"epoch": 0.0016744902572575198,
|
| 11444 |
+
"grad_norm": 3.0411083698272705,
|
| 11445 |
+
"learning_rate": 3.2699857384113644e-05,
|
| 11446 |
+
"loss": 3.6484,
|
| 11447 |
+
"step": 146600
|
| 11448 |
+
},
|
| 11449 |
+
{
|
| 11450 |
+
"epoch": 0.00195357196680044,
|
| 11451 |
+
"grad_norm": 2.9706947803497314,
|
| 11452 |
+
"learning_rate": 3.267879017417705e-05,
|
| 11453 |
+
"loss": 3.6431,
|
| 11454 |
+
"step": 146700
|
| 11455 |
+
},
|
| 11456 |
+
{
|
| 11457 |
+
"epoch": 0.0022326536763433596,
|
| 11458 |
+
"grad_norm": 2.8929619789123535,
|
| 11459 |
+
"learning_rate": 3.2657716941600694e-05,
|
| 11460 |
+
"loss": 3.6325,
|
| 11461 |
+
"step": 146800
|
| 11462 |
+
},
|
| 11463 |
+
{
|
| 11464 |
+
"epoch": 0.0025117353858862797,
|
| 11465 |
+
"grad_norm": 3.0911691188812256,
|
| 11466 |
+
"learning_rate": 3.2636637702912805e-05,
|
| 11467 |
+
"loss": 3.6321,
|
| 11468 |
+
"step": 146900
|
| 11469 |
+
},
|
| 11470 |
+
{
|
| 11471 |
+
"epoch": 0.0027908170954292,
|
| 11472 |
+
"grad_norm": 3.0026369094848633,
|
| 11473 |
+
"learning_rate": 3.261555247464626e-05,
|
| 11474 |
+
"loss": 3.6279,
|
| 11475 |
+
"step": 147000
|
| 11476 |
+
},
|
| 11477 |
+
{
|
| 11478 |
+
"epoch": 0.0027908170954292,
|
| 11479 |
+
"eval_loss": 2.1829118728637695,
|
| 11480 |
+
"eval_runtime": 52.5212,
|
| 11481 |
+
"eval_samples_per_second": 194.093,
|
| 11482 |
+
"eval_steps_per_second": 1.523,
|
| 11483 |
+
"step": 147000
|
| 11484 |
+
},
|
| 11485 |
+
{
|
| 11486 |
+
"epoch": 0.00306989880497212,
|
| 11487 |
+
"grad_norm": 3.0439035892486572,
|
| 11488 |
+
"learning_rate": 3.259446127333865e-05,
|
| 11489 |
+
"loss": 3.6467,
|
| 11490 |
+
"step": 147100
|
| 11491 |
+
},
|
| 11492 |
+
{
|
| 11493 |
+
"epoch": 0.0033489805145150396,
|
| 11494 |
+
"grad_norm": 2.952643871307373,
|
| 11495 |
+
"learning_rate": 3.2573364115532276e-05,
|
| 11496 |
+
"loss": 3.6524,
|
| 11497 |
+
"step": 147200
|
| 11498 |
+
},
|
| 11499 |
+
{
|
| 11500 |
+
"epoch": 0.0036280622240579597,
|
| 11501 |
+
"grad_norm": 3.039597988128662,
|
| 11502 |
+
"learning_rate": 3.2552261017774075e-05,
|
| 11503 |
+
"loss": 3.6339,
|
| 11504 |
+
"step": 147300
|
| 11505 |
+
},
|
| 11506 |
+
{
|
| 11507 |
+
"epoch": 0.00390714393360088,
|
| 11508 |
+
"grad_norm": 3.1887271404266357,
|
| 11509 |
+
"learning_rate": 3.253115199661567e-05,
|
| 11510 |
+
"loss": 3.6367,
|
| 11511 |
+
"step": 147400
|
| 11512 |
+
},
|
| 11513 |
+
{
|
| 11514 |
+
"epoch": 0.0041862256431437995,
|
| 11515 |
+
"grad_norm": 3.123321056365967,
|
| 11516 |
+
"learning_rate": 3.2510037068613314e-05,
|
| 11517 |
+
"loss": 3.6283,
|
| 11518 |
+
"step": 147500
|
| 11519 |
+
},
|
| 11520 |
+
{
|
| 11521 |
+
"epoch": 0.004465307352686719,
|
| 11522 |
+
"grad_norm": 3.1954147815704346,
|
| 11523 |
+
"learning_rate": 3.248891625032789e-05,
|
| 11524 |
+
"loss": 3.6295,
|
| 11525 |
+
"step": 147600
|
| 11526 |
+
},
|
| 11527 |
+
{
|
| 11528 |
+
"epoch": 0.00474438906222964,
|
| 11529 |
+
"grad_norm": 3.2092411518096924,
|
| 11530 |
+
"learning_rate": 3.246778955832493e-05,
|
| 11531 |
+
"loss": 3.6417,
|
| 11532 |
+
"step": 147700
|
| 11533 |
+
},
|
| 11534 |
+
{
|
| 11535 |
+
"epoch": 0.005023470771772559,
|
| 11536 |
+
"grad_norm": 3.2568812370300293,
|
| 11537 |
+
"learning_rate": 3.2446657009174523e-05,
|
| 11538 |
+
"loss": 3.6327,
|
| 11539 |
+
"step": 147800
|
| 11540 |
+
},
|
| 11541 |
+
{
|
| 11542 |
+
"epoch": 0.00530255248131548,
|
| 11543 |
+
"grad_norm": 3.068138837814331,
|
| 11544 |
+
"learning_rate": 3.242551861945141e-05,
|
| 11545 |
+
"loss": 3.6543,
|
| 11546 |
+
"step": 147900
|
| 11547 |
+
},
|
| 11548 |
+
{
|
| 11549 |
+
"epoch": 0.0055816341908584,
|
| 11550 |
+
"grad_norm": 3.317512273788452,
|
| 11551 |
+
"learning_rate": 3.240437440573485e-05,
|
| 11552 |
+
"loss": 3.6408,
|
| 11553 |
+
"step": 148000
|
| 11554 |
+
},
|
| 11555 |
+
{
|
| 11556 |
+
"epoch": 0.0055816341908584,
|
| 11557 |
+
"eval_loss": 2.1870715618133545,
|
| 11558 |
+
"eval_runtime": 52.1184,
|
| 11559 |
+
"eval_samples_per_second": 195.593,
|
| 11560 |
+
"eval_steps_per_second": 1.535,
|
| 11561 |
+
"step": 148000
|
| 11562 |
+
},
|
| 11563 |
+
{
|
| 11564 |
+
"epoch": 0.005860715900401319,
|
| 11565 |
+
"grad_norm": 3.3598294258117676,
|
| 11566 |
+
"learning_rate": 3.238322438460874e-05,
|
| 11567 |
+
"loss": 3.6164,
|
| 11568 |
+
"step": 148100
|
| 11569 |
+
},
|
| 11570 |
+
{
|
| 11571 |
+
"epoch": 0.00613979760994424,
|
| 11572 |
+
"grad_norm": 3.139274835586548,
|
| 11573 |
+
"learning_rate": 3.2362068572661465e-05,
|
| 11574 |
+
"loss": 3.6436,
|
| 11575 |
+
"step": 148200
|
| 11576 |
+
},
|
| 11577 |
+
{
|
| 11578 |
+
"epoch": 0.0064188793194871595,
|
| 11579 |
+
"grad_norm": 3.0762457847595215,
|
| 11580 |
+
"learning_rate": 3.234090698648599e-05,
|
| 11581 |
+
"loss": 3.6247,
|
| 11582 |
+
"step": 148300
|
| 11583 |
+
},
|
| 11584 |
+
{
|
| 11585 |
+
"epoch": 0.006697961029030079,
|
| 11586 |
+
"grad_norm": 3.061337947845459,
|
| 11587 |
+
"learning_rate": 3.2319739642679806e-05,
|
| 11588 |
+
"loss": 3.623,
|
| 11589 |
+
"step": 148400
|
| 11590 |
+
},
|
| 11591 |
+
{
|
| 11592 |
+
"epoch": 0.006977042738573,
|
| 11593 |
+
"grad_norm": 2.983355760574341,
|
| 11594 |
+
"learning_rate": 3.229856655784491e-05,
|
| 11595 |
+
"loss": 3.6257,
|
| 11596 |
+
"step": 148500
|
| 11597 |
+
},
|
| 11598 |
+
{
|
| 11599 |
+
"epoch": 0.0072561244481159195,
|
| 11600 |
+
"grad_norm": 3.085252523422241,
|
| 11601 |
+
"learning_rate": 3.227738774858782e-05,
|
| 11602 |
+
"loss": 3.6421,
|
| 11603 |
+
"step": 148600
|
| 11604 |
+
},
|
| 11605 |
+
{
|
| 11606 |
+
"epoch": 0.007535206157658839,
|
| 11607 |
+
"grad_norm": 3.194308042526245,
|
| 11608 |
+
"learning_rate": 3.225620323151951e-05,
|
| 11609 |
+
"loss": 3.6212,
|
| 11610 |
+
"step": 148700
|
| 11611 |
+
},
|
| 11612 |
+
{
|
| 11613 |
+
"epoch": 0.00781428786720176,
|
| 11614 |
+
"grad_norm": 2.822134494781494,
|
| 11615 |
+
"learning_rate": 3.223501302325546e-05,
|
| 11616 |
+
"loss": 3.6332,
|
| 11617 |
+
"step": 148800
|
| 11618 |
+
},
|
| 11619 |
+
{
|
| 11620 |
+
"epoch": 0.00809336957674468,
|
| 11621 |
+
"grad_norm": 3.303119421005249,
|
| 11622 |
+
"learning_rate": 3.2213817140415606e-05,
|
| 11623 |
+
"loss": 3.6295,
|
| 11624 |
+
"step": 148900
|
| 11625 |
+
},
|
| 11626 |
+
{
|
| 11627 |
+
"epoch": 0.008372451286287599,
|
| 11628 |
+
"grad_norm": 3.2773683071136475,
|
| 11629 |
+
"learning_rate": 3.219261559962433e-05,
|
| 11630 |
+
"loss": 3.637,
|
| 11631 |
+
"step": 149000
|
| 11632 |
+
},
|
| 11633 |
+
{
|
| 11634 |
+
"epoch": 0.008372451286287599,
|
| 11635 |
+
"eval_loss": 2.179696798324585,
|
| 11636 |
+
"eval_runtime": 52.2402,
|
| 11637 |
+
"eval_samples_per_second": 195.137,
|
| 11638 |
+
"eval_steps_per_second": 1.531,
|
| 11639 |
+
"step": 149000
|
| 11640 |
+
},
|
| 11641 |
+
{
|
| 11642 |
+
"epoch": 0.008651532995830519,
|
| 11643 |
+
"grad_norm": 3.0133464336395264,
|
| 11644 |
+
"learning_rate": 3.217140841751045e-05,
|
| 11645 |
+
"loss": 3.6203,
|
| 11646 |
+
"step": 149100
|
| 11647 |
+
},
|
| 11648 |
+
{
|
| 11649 |
+
"epoch": 0.008930614705373438,
|
| 11650 |
+
"grad_norm": 2.9925966262817383,
|
| 11651 |
+
"learning_rate": 3.215019561070723e-05,
|
| 11652 |
+
"loss": 3.6204,
|
| 11653 |
+
"step": 149200
|
| 11654 |
+
},
|
| 11655 |
+
{
|
| 11656 |
+
"epoch": 0.00920969641491636,
|
| 11657 |
+
"grad_norm": 3.0842456817626953,
|
| 11658 |
+
"learning_rate": 3.2128977195852314e-05,
|
| 11659 |
+
"loss": 3.6303,
|
| 11660 |
+
"step": 149300
|
| 11661 |
+
},
|
| 11662 |
+
{
|
| 11663 |
+
"epoch": 0.00948877812445928,
|
| 11664 |
+
"grad_norm": 3.073462724685669,
|
| 11665 |
+
"learning_rate": 3.210775318958776e-05,
|
| 11666 |
+
"loss": 3.6235,
|
| 11667 |
+
"step": 149400
|
| 11668 |
+
},
|
| 11669 |
+
{
|
| 11670 |
+
"epoch": 0.0097678598340022,
|
| 11671 |
+
"grad_norm": 3.0209946632385254,
|
| 11672 |
+
"learning_rate": 3.208652360856002e-05,
|
| 11673 |
+
"loss": 3.6212,
|
| 11674 |
+
"step": 149500
|
| 11675 |
+
},
|
| 11676 |
+
{
|
| 11677 |
+
"epoch": 0.010046941543545119,
|
| 11678 |
+
"grad_norm": 3.250084161758423,
|
| 11679 |
+
"learning_rate": 3.2065288469419906e-05,
|
| 11680 |
+
"loss": 3.6139,
|
| 11681 |
+
"step": 149600
|
| 11682 |
+
},
|
| 11683 |
+
{
|
| 11684 |
+
"epoch": 0.010326023253088039,
|
| 11685 |
+
"grad_norm": 3.0430448055267334,
|
| 11686 |
+
"learning_rate": 3.204404778882258e-05,
|
| 11687 |
+
"loss": 3.6206,
|
| 11688 |
+
"step": 149700
|
| 11689 |
+
},
|
| 11690 |
+
{
|
| 11691 |
+
"epoch": 0.01060510496263096,
|
| 11692 |
+
"grad_norm": 3.081878662109375,
|
| 11693 |
+
"learning_rate": 3.20228015834276e-05,
|
| 11694 |
+
"loss": 3.6167,
|
| 11695 |
+
"step": 149800
|
| 11696 |
+
},
|
| 11697 |
+
{
|
| 11698 |
+
"epoch": 0.01088418667217388,
|
| 11699 |
+
"grad_norm": 3.110133171081543,
|
| 11700 |
+
"learning_rate": 3.2001549869898774e-05,
|
| 11701 |
+
"loss": 3.627,
|
| 11702 |
+
"step": 149900
|
| 11703 |
+
},
|
| 11704 |
+
{
|
| 11705 |
+
"epoch": 0.0111632683817168,
|
| 11706 |
+
"grad_norm": 3.1069679260253906,
|
| 11707 |
+
"learning_rate": 3.198029266490431e-05,
|
| 11708 |
+
"loss": 3.6122,
|
| 11709 |
+
"step": 150000
|
| 11710 |
+
},
|
| 11711 |
+
{
|
| 11712 |
+
"epoch": 0.0111632683817168,
|
| 11713 |
+
"eval_loss": 2.1840949058532715,
|
| 11714 |
+
"eval_runtime": 52.0857,
|
| 11715 |
+
"eval_samples_per_second": 195.716,
|
| 11716 |
+
"eval_steps_per_second": 1.536,
|
| 11717 |
+
"step": 150000
|
| 11718 |
+
},
|
| 11719 |
+
{
|
| 11720 |
+
"epoch": 0.011442350091259719,
|
| 11721 |
+
"grad_norm": 3.110675096511841,
|
| 11722 |
+
"learning_rate": 3.195902998511666e-05,
|
| 11723 |
+
"loss": 3.6101,
|
| 11724 |
+
"step": 150100
|
| 11725 |
+
},
|
| 11726 |
+
{
|
| 11727 |
+
"epoch": 0.011721431800802639,
|
| 11728 |
+
"grad_norm": 3.100144386291504,
|
| 11729 |
+
"learning_rate": 3.193776184721263e-05,
|
| 11730 |
+
"loss": 3.6098,
|
| 11731 |
+
"step": 150200
|
| 11732 |
+
},
|
| 11733 |
+
{
|
| 11734 |
+
"epoch": 0.012000513510345558,
|
| 11735 |
+
"grad_norm": 3.0613949298858643,
|
| 11736 |
+
"learning_rate": 3.191648826787326e-05,
|
| 11737 |
+
"loss": 3.5987,
|
| 11738 |
+
"step": 150300
|
| 11739 |
+
},
|
| 11740 |
+
{
|
| 11741 |
+
"epoch": 0.01227959521988848,
|
| 11742 |
+
"grad_norm": 2.962594747543335,
|
| 11743 |
+
"learning_rate": 3.189520926378388e-05,
|
| 11744 |
+
"loss": 3.6353,
|
| 11745 |
+
"step": 150400
|
| 11746 |
+
},
|
| 11747 |
+
{
|
| 11748 |
+
"epoch": 0.0125586769294314,
|
| 11749 |
+
"grad_norm": 3.2426040172576904,
|
| 11750 |
+
"learning_rate": 3.187392485163406e-05,
|
| 11751 |
+
"loss": 3.6268,
|
| 11752 |
+
"step": 150500
|
| 11753 |
+
},
|
| 11754 |
+
{
|
| 11755 |
+
"epoch": 0.012837758638974319,
|
| 11756 |
+
"grad_norm": 3.0770397186279297,
|
| 11757 |
+
"learning_rate": 3.1852635048117634e-05,
|
| 11758 |
+
"loss": 3.6132,
|
| 11759 |
+
"step": 150600
|
| 11760 |
+
},
|
| 11761 |
+
{
|
| 11762 |
+
"epoch": 0.013116840348517239,
|
| 11763 |
+
"grad_norm": 3.1057562828063965,
|
| 11764 |
+
"learning_rate": 3.183133986993265e-05,
|
| 11765 |
+
"loss": 3.6077,
|
| 11766 |
+
"step": 150700
|
| 11767 |
+
},
|
| 11768 |
+
{
|
| 11769 |
+
"epoch": 0.013395922058060158,
|
| 11770 |
+
"grad_norm": 3.1398537158966064,
|
| 11771 |
+
"learning_rate": 3.181003933378136e-05,
|
| 11772 |
+
"loss": 3.5958,
|
| 11773 |
+
"step": 150800
|
| 11774 |
+
},
|
| 11775 |
+
{
|
| 11776 |
+
"epoch": 0.013675003767603078,
|
| 11777 |
+
"grad_norm": 3.1277642250061035,
|
| 11778 |
+
"learning_rate": 3.178873345637023e-05,
|
| 11779 |
+
"loss": 3.6304,
|
| 11780 |
+
"step": 150900
|
| 11781 |
+
},
|
| 11782 |
+
{
|
| 11783 |
+
"epoch": 0.013954085477146,
|
| 11784 |
+
"grad_norm": 3.247443675994873,
|
| 11785 |
+
"learning_rate": 3.176742225440994e-05,
|
| 11786 |
+
"loss": 3.6196,
|
| 11787 |
+
"step": 151000
|
| 11788 |
+
},
|
| 11789 |
+
{
|
| 11790 |
+
"epoch": 0.013954085477146,
|
| 11791 |
+
"eval_loss": 2.1872923374176025,
|
| 11792 |
+
"eval_runtime": 52.3067,
|
| 11793 |
+
"eval_samples_per_second": 194.889,
|
| 11794 |
+
"eval_steps_per_second": 1.529,
|
| 11795 |
+
"step": 151000
|
| 11796 |
+
},
|
| 11797 |
+
{
|
| 11798 |
+
"epoch": 0.01423316718668892,
|
| 11799 |
+
"grad_norm": 3.074709177017212,
|
| 11800 |
+
"learning_rate": 3.17461057446153e-05,
|
| 11801 |
+
"loss": 3.6314,
|
| 11802 |
+
"step": 151100
|
| 11803 |
+
},
|
| 11804 |
+
{
|
| 11805 |
+
"epoch": 0.014512248896231839,
|
| 11806 |
+
"grad_norm": 3.163147211074829,
|
| 11807 |
+
"learning_rate": 3.1724783943705304e-05,
|
| 11808 |
+
"loss": 3.6013,
|
| 11809 |
+
"step": 151200
|
| 11810 |
+
},
|
| 11811 |
+
{
|
| 11812 |
+
"epoch": 0.014791330605774759,
|
| 11813 |
+
"grad_norm": 3.062178373336792,
|
| 11814 |
+
"learning_rate": 3.1703456868403126e-05,
|
| 11815 |
+
"loss": 3.6219,
|
| 11816 |
+
"step": 151300
|
| 11817 |
+
},
|
| 11818 |
+
{
|
| 11819 |
+
"epoch": 0.015070412315317678,
|
| 11820 |
+
"grad_norm": 2.8890295028686523,
|
| 11821 |
+
"learning_rate": 3.168212453543601e-05,
|
| 11822 |
+
"loss": 3.6319,
|
| 11823 |
+
"step": 151400
|
| 11824 |
+
},
|
| 11825 |
+
{
|
| 11826 |
+
"epoch": 0.015349494024860598,
|
| 11827 |
+
"grad_norm": 2.8499069213867188,
|
| 11828 |
+
"learning_rate": 3.166078696153539e-05,
|
| 11829 |
+
"loss": 3.615,
|
| 11830 |
+
"step": 151500
|
| 11831 |
+
},
|
| 11832 |
+
{
|
| 11833 |
+
"epoch": 0.01562857573440352,
|
| 11834 |
+
"grad_norm": 3.079871892929077,
|
| 11835 |
+
"learning_rate": 3.163944416343677e-05,
|
| 11836 |
+
"loss": 3.5953,
|
| 11837 |
+
"step": 151600
|
| 11838 |
+
},
|
| 11839 |
+
{
|
| 11840 |
+
"epoch": 0.015907657443946437,
|
| 11841 |
+
"grad_norm": 3.028519868850708,
|
| 11842 |
+
"learning_rate": 3.1618096157879776e-05,
|
| 11843 |
+
"loss": 3.6217,
|
| 11844 |
+
"step": 151700
|
| 11845 |
+
},
|
| 11846 |
+
{
|
| 11847 |
+
"epoch": 0.01618673915348936,
|
| 11848 |
+
"grad_norm": 3.1988399028778076,
|
| 11849 |
+
"learning_rate": 3.159674296160809e-05,
|
| 11850 |
+
"loss": 3.6,
|
| 11851 |
+
"step": 151800
|
| 11852 |
+
},
|
| 11853 |
+
{
|
| 11854 |
+
"epoch": 0.01646582086303228,
|
| 11855 |
+
"grad_norm": 3.1502091884613037,
|
| 11856 |
+
"learning_rate": 3.157538459136949e-05,
|
| 11857 |
+
"loss": 3.6181,
|
| 11858 |
+
"step": 151900
|
| 11859 |
+
},
|
| 11860 |
+
{
|
| 11861 |
+
"epoch": 0.016744902572575198,
|
| 11862 |
+
"grad_norm": 3.1333131790161133,
|
| 11863 |
+
"learning_rate": 3.1554021063915806e-05,
|
| 11864 |
+
"loss": 3.6065,
|
| 11865 |
+
"step": 152000
|
| 11866 |
+
},
|
| 11867 |
+
{
|
| 11868 |
+
"epoch": 0.016744902572575198,
|
| 11869 |
+
"eval_loss": 2.1706299781799316,
|
| 11870 |
+
"eval_runtime": 52.2255,
|
| 11871 |
+
"eval_samples_per_second": 195.192,
|
| 11872 |
+
"eval_steps_per_second": 1.532,
|
| 11873 |
+
"step": 152000
|
| 11874 |
+
},
|
| 11875 |
+
{
|
| 11876 |
+
"epoch": 0.01702398428211812,
|
| 11877 |
+
"grad_norm": 3.1799027919769287,
|
| 11878 |
+
"learning_rate": 3.153265239600291e-05,
|
| 11879 |
+
"loss": 3.6177,
|
| 11880 |
+
"step": 152100
|
| 11881 |
+
},
|
| 11882 |
+
{
|
| 11883 |
+
"epoch": 0.017303065991661037,
|
| 11884 |
+
"grad_norm": 3.09015154838562,
|
| 11885 |
+
"learning_rate": 3.1511278604390694e-05,
|
| 11886 |
+
"loss": 3.6111,
|
| 11887 |
+
"step": 152200
|
| 11888 |
+
},
|
| 11889 |
+
{
|
| 11890 |
+
"epoch": 0.01758214770120396,
|
| 11891 |
+
"grad_norm": 3.2853939533233643,
|
| 11892 |
+
"learning_rate": 3.1489899705843094e-05,
|
| 11893 |
+
"loss": 3.6164,
|
| 11894 |
+
"step": 152300
|
| 11895 |
+
},
|
| 11896 |
+
{
|
| 11897 |
+
"epoch": 0.017861229410746877,
|
| 11898 |
+
"grad_norm": 3.114593982696533,
|
| 11899 |
+
"learning_rate": 3.146851571712804e-05,
|
| 11900 |
+
"loss": 3.5874,
|
| 11901 |
+
"step": 152400
|
| 11902 |
+
},
|
| 11903 |
+
{
|
| 11904 |
+
"epoch": 0.018140311120289798,
|
| 11905 |
+
"grad_norm": 3.1264195442199707,
|
| 11906 |
+
"learning_rate": 3.1447126655017446e-05,
|
| 11907 |
+
"loss": 3.6051,
|
| 11908 |
+
"step": 152500
|
| 11909 |
+
},
|
| 11910 |
+
{
|
| 11911 |
+
"epoch": 0.01841939282983272,
|
| 11912 |
+
"grad_norm": 3.064248561859131,
|
| 11913 |
+
"learning_rate": 3.142573253628721e-05,
|
| 11914 |
+
"loss": 3.5926,
|
| 11915 |
+
"step": 152600
|
| 11916 |
+
},
|
| 11917 |
+
{
|
| 11918 |
+
"epoch": 0.018698474539375638,
|
| 11919 |
+
"grad_norm": 3.199820041656494,
|
| 11920 |
+
"learning_rate": 3.140433337771721e-05,
|
| 11921 |
+
"loss": 3.6214,
|
| 11922 |
+
"step": 152700
|
| 11923 |
+
},
|
| 11924 |
+
{
|
| 11925 |
+
"epoch": 0.01897755624891856,
|
| 11926 |
+
"grad_norm": 3.1903645992279053,
|
| 11927 |
+
"learning_rate": 3.138292919609125e-05,
|
| 11928 |
+
"loss": 3.602,
|
| 11929 |
+
"step": 152800
|
| 11930 |
+
},
|
| 11931 |
+
{
|
| 11932 |
+
"epoch": 0.019256637958461477,
|
| 11933 |
+
"grad_norm": 3.2537660598754883,
|
| 11934 |
+
"learning_rate": 3.13615200081971e-05,
|
| 11935 |
+
"loss": 3.618,
|
| 11936 |
+
"step": 152900
|
| 11937 |
+
},
|
| 11938 |
+
{
|
| 11939 |
+
"epoch": 0.0195357196680044,
|
| 11940 |
+
"grad_norm": 3.051445722579956,
|
| 11941 |
+
"learning_rate": 3.134010583082643e-05,
|
| 11942 |
+
"loss": 3.5982,
|
| 11943 |
+
"step": 153000
|
| 11944 |
+
},
|
| 11945 |
+
{
|
| 11946 |
+
"epoch": 0.0195357196680044,
|
| 11947 |
+
"eval_loss": 2.1735916137695312,
|
| 11948 |
+
"eval_runtime": 52.2439,
|
| 11949 |
+
"eval_samples_per_second": 195.123,
|
| 11950 |
+
"eval_steps_per_second": 1.531,
|
| 11951 |
+
"step": 153000
|
| 11952 |
+
},
|
| 11953 |
+
{
|
| 11954 |
+
"epoch": 0.01981480137754732,
|
| 11955 |
+
"grad_norm": 2.984973192214966,
|
| 11956 |
+
"learning_rate": 3.131868668077486e-05,
|
| 11957 |
+
"loss": 3.5892,
|
| 11958 |
+
"step": 153100
|
| 11959 |
+
},
|
| 11960 |
+
{
|
| 11961 |
+
"epoch": 0.020093883087090238,
|
| 11962 |
+
"grad_norm": 2.9883575439453125,
|
| 11963 |
+
"learning_rate": 3.129726257484187e-05,
|
| 11964 |
+
"loss": 3.6092,
|
| 11965 |
+
"step": 153200
|
| 11966 |
+
},
|
| 11967 |
+
{
|
| 11968 |
+
"epoch": 0.02037296479663316,
|
| 11969 |
+
"grad_norm": 3.299248695373535,
|
| 11970 |
+
"learning_rate": 3.127583352983086e-05,
|
| 11971 |
+
"loss": 3.5973,
|
| 11972 |
+
"step": 153300
|
| 11973 |
+
},
|
| 11974 |
+
{
|
| 11975 |
+
"epoch": 0.020652046506176077,
|
| 11976 |
+
"grad_norm": 3.1858959197998047,
|
| 11977 |
+
"learning_rate": 3.125439956254907e-05,
|
| 11978 |
+
"loss": 3.5986,
|
| 11979 |
+
"step": 153400
|
| 11980 |
+
},
|
| 11981 |
+
{
|
| 11982 |
+
"epoch": 0.020931128215719,
|
| 11983 |
+
"grad_norm": 3.2093448638916016,
|
| 11984 |
+
"learning_rate": 3.123296068980764e-05,
|
| 11985 |
+
"loss": 3.5987,
|
| 11986 |
+
"step": 153500
|
| 11987 |
+
},
|
| 11988 |
+
{
|
| 11989 |
+
"epoch": 0.02121020992526192,
|
| 11990 |
+
"grad_norm": 3.049703598022461,
|
| 11991 |
+
"learning_rate": 3.1211516928421526e-05,
|
| 11992 |
+
"loss": 3.5995,
|
| 11993 |
+
"step": 153600
|
| 11994 |
+
},
|
| 11995 |
+
{
|
| 11996 |
+
"epoch": 0.021489291634804838,
|
| 11997 |
+
"grad_norm": 3.1410481929779053,
|
| 11998 |
+
"learning_rate": 3.119006829520953e-05,
|
| 11999 |
+
"loss": 3.586,
|
| 12000 |
+
"step": 153700
|
| 12001 |
+
},
|
| 12002 |
+
{
|
| 12003 |
+
"epoch": 0.02176837334434776,
|
| 12004 |
+
"grad_norm": 2.9701108932495117,
|
| 12005 |
+
"learning_rate": 3.1168614806994286e-05,
|
| 12006 |
+
"loss": 3.5826,
|
| 12007 |
+
"step": 153800
|
| 12008 |
+
},
|
| 12009 |
+
{
|
| 12010 |
+
"epoch": 0.022047455053890677,
|
| 12011 |
+
"grad_norm": 3.016268253326416,
|
| 12012 |
+
"learning_rate": 3.114715648060221e-05,
|
| 12013 |
+
"loss": 3.5746,
|
| 12014 |
+
"step": 153900
|
| 12015 |
+
},
|
| 12016 |
+
{
|
| 12017 |
+
"epoch": 0.0223265367634336,
|
| 12018 |
+
"grad_norm": 3.112840175628662,
|
| 12019 |
+
"learning_rate": 3.1125693332863545e-05,
|
| 12020 |
+
"loss": 3.5908,
|
| 12021 |
+
"step": 154000
|
| 12022 |
+
},
|
| 12023 |
+
{
|
| 12024 |
+
"epoch": 0.0223265367634336,
|
| 12025 |
+
"eval_loss": 2.1794586181640625,
|
| 12026 |
+
"eval_runtime": 52.4216,
|
| 12027 |
+
"eval_samples_per_second": 194.462,
|
| 12028 |
+
"eval_steps_per_second": 1.526,
|
| 12029 |
+
"step": 154000
|
| 12030 |
+
},
|
| 12031 |
+
{
|
| 12032 |
+
"epoch": 0.022605618472976517,
|
| 12033 |
+
"grad_norm": 3.132110595703125,
|
| 12034 |
+
"learning_rate": 3.110422538061228e-05,
|
| 12035 |
+
"loss": 3.57,
|
| 12036 |
+
"step": 154100
|
| 12037 |
+
},
|
| 12038 |
+
{
|
| 12039 |
+
"epoch": 0.022884700182519438,
|
| 12040 |
+
"grad_norm": 3.2359519004821777,
|
| 12041 |
+
"learning_rate": 3.108275264068619e-05,
|
| 12042 |
+
"loss": 3.6035,
|
| 12043 |
+
"step": 154200
|
| 12044 |
+
},
|
| 12045 |
+
{
|
| 12046 |
+
"epoch": 0.02316378189206236,
|
| 12047 |
+
"grad_norm": 2.9039528369903564,
|
| 12048 |
+
"learning_rate": 3.1061275129926816e-05,
|
| 12049 |
+
"loss": 3.5772,
|
| 12050 |
+
"step": 154300
|
| 12051 |
+
},
|
| 12052 |
+
{
|
| 12053 |
+
"epoch": 0.023442863601605277,
|
| 12054 |
+
"grad_norm": 3.10616397857666,
|
| 12055 |
+
"learning_rate": 3.103979286517943e-05,
|
| 12056 |
+
"loss": 3.58,
|
| 12057 |
+
"step": 154400
|
| 12058 |
+
},
|
| 12059 |
+
{
|
| 12060 |
+
"epoch": 0.0237219453111482,
|
| 12061 |
+
"grad_norm": 3.2507059574127197,
|
| 12062 |
+
"learning_rate": 3.101830586329302e-05,
|
| 12063 |
+
"loss": 3.5788,
|
| 12064 |
+
"step": 154500
|
| 12065 |
+
},
|
| 12066 |
+
{
|
| 12067 |
+
"epoch": 0.024001027020691117,
|
| 12068 |
+
"grad_norm": 3.145289421081543,
|
| 12069 |
+
"learning_rate": 3.099681414112032e-05,
|
| 12070 |
+
"loss": 3.5909,
|
| 12071 |
+
"step": 154600
|
| 12072 |
+
},
|
| 12073 |
+
{
|
| 12074 |
+
"epoch": 0.024280108730234038,
|
| 12075 |
+
"grad_norm": 3.2056355476379395,
|
| 12076 |
+
"learning_rate": 3.097531771551774e-05,
|
| 12077 |
+
"loss": 3.5776,
|
| 12078 |
+
"step": 154700
|
| 12079 |
+
},
|
| 12080 |
+
{
|
| 12081 |
+
"epoch": 0.02455919043977696,
|
| 12082 |
+
"grad_norm": 3.3703291416168213,
|
| 12083 |
+
"learning_rate": 3.095381660334539e-05,
|
| 12084 |
+
"loss": 3.5746,
|
| 12085 |
+
"step": 154800
|
| 12086 |
+
},
|
| 12087 |
+
{
|
| 12088 |
+
"epoch": 0.024838272149319877,
|
| 12089 |
+
"grad_norm": 3.181138277053833,
|
| 12090 |
+
"learning_rate": 3.0932310821467036e-05,
|
| 12091 |
+
"loss": 3.5715,
|
| 12092 |
+
"step": 154900
|
| 12093 |
+
},
|
| 12094 |
+
{
|
| 12095 |
+
"epoch": 0.0251173538588628,
|
| 12096 |
+
"grad_norm": 3.000821590423584,
|
| 12097 |
+
"learning_rate": 3.091080038675015e-05,
|
| 12098 |
+
"loss": 3.5743,
|
| 12099 |
+
"step": 155000
|
| 12100 |
+
},
|
| 12101 |
+
{
|
| 12102 |
+
"epoch": 0.0251173538588628,
|
| 12103 |
+
"eval_loss": 2.1706509590148926,
|
| 12104 |
+
"eval_runtime": 52.3403,
|
| 12105 |
+
"eval_samples_per_second": 194.764,
|
| 12106 |
+
"eval_steps_per_second": 1.528,
|
| 12107 |
+
"step": 155000
|
| 12108 |
+
},
|
| 12109 |
+
{
|
| 12110 |
+
"epoch": 0.025396435568405717,
|
| 12111 |
+
"grad_norm": 2.9740653038024902,
|
| 12112 |
+
"learning_rate": 3.0889285316065806e-05,
|
| 12113 |
+
"loss": 3.5711,
|
| 12114 |
+
"step": 155100
|
| 12115 |
+
},
|
| 12116 |
+
{
|
| 12117 |
+
"epoch": 0.025675517277948638,
|
| 12118 |
+
"grad_norm": 3.1311419010162354,
|
| 12119 |
+
"learning_rate": 3.0867765626288755e-05,
|
| 12120 |
+
"loss": 3.5845,
|
| 12121 |
+
"step": 155200
|
| 12122 |
+
},
|
| 12123 |
+
{
|
| 12124 |
+
"epoch": 0.025954598987491556,
|
| 12125 |
+
"grad_norm": 3.0719974040985107,
|
| 12126 |
+
"learning_rate": 3.084624133429733e-05,
|
| 12127 |
+
"loss": 3.5731,
|
| 12128 |
+
"step": 155300
|
| 12129 |
+
},
|
| 12130 |
+
{
|
| 12131 |
+
"epoch": 0.026233680697034478,
|
| 12132 |
+
"grad_norm": 3.0461819171905518,
|
| 12133 |
+
"learning_rate": 3.082471245697351e-05,
|
| 12134 |
+
"loss": 3.5738,
|
| 12135 |
+
"step": 155400
|
| 12136 |
+
},
|
| 12137 |
+
{
|
| 12138 |
+
"epoch": 0.0265127624065774,
|
| 12139 |
+
"grad_norm": 2.9734132289886475,
|
| 12140 |
+
"learning_rate": 3.080317901120285e-05,
|
| 12141 |
+
"loss": 3.5853,
|
| 12142 |
+
"step": 155500
|
| 12143 |
+
},
|
| 12144 |
+
{
|
| 12145 |
+
"epoch": 0.026791844116120317,
|
| 12146 |
+
"grad_norm": 3.117506980895996,
|
| 12147 |
+
"learning_rate": 3.078164101387449e-05,
|
| 12148 |
+
"loss": 3.5847,
|
| 12149 |
+
"step": 155600
|
| 12150 |
+
},
|
| 12151 |
+
{
|
| 12152 |
+
"epoch": 0.02707092582566324,
|
| 12153 |
+
"grad_norm": 3.141174554824829,
|
| 12154 |
+
"learning_rate": 3.076009848188114e-05,
|
| 12155 |
+
"loss": 3.5861,
|
| 12156 |
+
"step": 155700
|
| 12157 |
+
},
|
| 12158 |
+
{
|
| 12159 |
+
"epoch": 0.027350007535206156,
|
| 12160 |
+
"grad_norm": 3.1444389820098877,
|
| 12161 |
+
"learning_rate": 3.0738551432119086e-05,
|
| 12162 |
+
"loss": 3.5716,
|
| 12163 |
+
"step": 155800
|
| 12164 |
+
},
|
| 12165 |
+
{
|
| 12166 |
+
"epoch": 0.027629089244749078,
|
| 12167 |
+
"grad_norm": 3.2603206634521484,
|
| 12168 |
+
"learning_rate": 3.0716999881488135e-05,
|
| 12169 |
+
"loss": 3.5878,
|
| 12170 |
+
"step": 155900
|
| 12171 |
+
},
|
| 12172 |
+
{
|
| 12173 |
+
"epoch": 0.027908170954292,
|
| 12174 |
+
"grad_norm": 3.119466781616211,
|
| 12175 |
+
"learning_rate": 3.069544384689162e-05,
|
| 12176 |
+
"loss": 3.5913,
|
| 12177 |
+
"step": 156000
|
| 12178 |
+
},
|
| 12179 |
+
{
|
| 12180 |
+
"epoch": 0.027908170954292,
|
| 12181 |
+
"eval_loss": 2.1707427501678467,
|
| 12182 |
+
"eval_runtime": 52.4036,
|
| 12183 |
+
"eval_samples_per_second": 194.529,
|
| 12184 |
+
"eval_steps_per_second": 1.527,
|
| 12185 |
+
"step": 156000
|
| 12186 |
+
},
|
| 12187 |
+
{
|
| 12188 |
+
"epoch": 0.028187252663834917,
|
| 12189 |
+
"grad_norm": 3.3349738121032715,
|
| 12190 |
+
"learning_rate": 3.06738833452364e-05,
|
| 12191 |
+
"loss": 3.5642,
|
| 12192 |
+
"step": 156100
|
| 12193 |
+
},
|
| 12194 |
+
{
|
| 12195 |
+
"epoch": 0.02846633437337784,
|
| 12196 |
+
"grad_norm": 3.151176691055298,
|
| 12197 |
+
"learning_rate": 3.065231839343285e-05,
|
| 12198 |
+
"loss": 3.5908,
|
| 12199 |
+
"step": 156200
|
| 12200 |
+
},
|
| 12201 |
+
{
|
| 12202 |
+
"epoch": 0.028745416082920756,
|
| 12203 |
+
"grad_norm": 3.139475107192993,
|
| 12204 |
+
"learning_rate": 3.0630749008394813e-05,
|
| 12205 |
+
"loss": 3.5672,
|
| 12206 |
+
"step": 156300
|
| 12207 |
+
},
|
| 12208 |
+
{
|
| 12209 |
+
"epoch": 0.029024497792463678,
|
| 12210 |
+
"grad_norm": 3.1396217346191406,
|
| 12211 |
+
"learning_rate": 3.0609175207039636e-05,
|
| 12212 |
+
"loss": 3.5787,
|
| 12213 |
+
"step": 156400
|
| 12214 |
+
},
|
| 12215 |
+
{
|
| 12216 |
+
"epoch": 0.0293035795020066,
|
| 12217 |
+
"grad_norm": 2.9696104526519775,
|
| 12218 |
+
"learning_rate": 3.05875970062881e-05,
|
| 12219 |
+
"loss": 3.573,
|
| 12220 |
+
"step": 156500
|
| 12221 |
+
},
|
| 12222 |
+
{
|
| 12223 |
+
"epoch": 0.029582661211549517,
|
| 12224 |
+
"grad_norm": 3.078080415725708,
|
| 12225 |
+
"learning_rate": 3.056601442306445e-05,
|
| 12226 |
+
"loss": 3.5583,
|
| 12227 |
+
"step": 156600
|
| 12228 |
+
},
|
| 12229 |
+
{
|
| 12230 |
+
"epoch": 0.02986174292109244,
|
| 12231 |
+
"grad_norm": 3.1915009021759033,
|
| 12232 |
+
"learning_rate": 3.054442747429638e-05,
|
| 12233 |
+
"loss": 3.5809,
|
| 12234 |
+
"step": 156700
|
| 12235 |
+
},
|
| 12236 |
+
{
|
| 12237 |
+
"epoch": 0.030140824630635357,
|
| 12238 |
+
"grad_norm": 3.194831132888794,
|
| 12239 |
+
"learning_rate": 3.052283617691499e-05,
|
| 12240 |
+
"loss": 3.5695,
|
| 12241 |
+
"step": 156800
|
| 12242 |
+
},
|
| 12243 |
+
{
|
| 12244 |
+
"epoch": 0.030419906340178278,
|
| 12245 |
+
"grad_norm": 3.0851330757141113,
|
| 12246 |
+
"learning_rate": 3.0501240547854793e-05,
|
| 12247 |
+
"loss": 3.5686,
|
| 12248 |
+
"step": 156900
|
| 12249 |
+
},
|
| 12250 |
+
{
|
| 12251 |
+
"epoch": 0.030698988049721196,
|
| 12252 |
+
"grad_norm": 3.050123929977417,
|
| 12253 |
+
"learning_rate": 3.047964060405371e-05,
|
| 12254 |
+
"loss": 3.5643,
|
| 12255 |
+
"step": 157000
|
| 12256 |
+
},
|
| 12257 |
+
{
|
| 12258 |
+
"epoch": 0.030698988049721196,
|
| 12259 |
+
"eval_loss": 2.165821075439453,
|
| 12260 |
+
"eval_runtime": 52.6341,
|
| 12261 |
+
"eval_samples_per_second": 193.677,
|
| 12262 |
+
"eval_steps_per_second": 1.52,
|
| 12263 |
+
"step": 157000
|
| 12264 |
+
},
|
| 12265 |
+
{
|
| 12266 |
+
"epoch": 0.030978069759264117,
|
| 12267 |
+
"grad_norm": 3.080662727355957,
|
| 12268 |
+
"learning_rate": 3.0458036362453036e-05,
|
| 12269 |
+
"loss": 3.5674,
|
| 12270 |
+
"step": 157100
|
| 12271 |
+
},
|
| 12272 |
+
{
|
| 12273 |
+
"epoch": 0.03125715146880704,
|
| 12274 |
+
"grad_norm": 2.9070680141448975,
|
| 12275 |
+
"learning_rate": 3.0436427839997444e-05,
|
| 12276 |
+
"loss": 3.5709,
|
| 12277 |
+
"step": 157200
|
| 12278 |
+
},
|
| 12279 |
+
{
|
| 12280 |
+
"epoch": 0.03153623317834996,
|
| 12281 |
+
"grad_norm": 3.212815046310425,
|
| 12282 |
+
"learning_rate": 3.0414815053634966e-05,
|
| 12283 |
+
"loss": 3.5596,
|
| 12284 |
+
"step": 157300
|
| 12285 |
+
},
|
| 12286 |
+
{
|
| 12287 |
+
"epoch": 0.031815314887892875,
|
| 12288 |
+
"grad_norm": 2.9851808547973633,
|
| 12289 |
+
"learning_rate": 3.039319802031696e-05,
|
| 12290 |
+
"loss": 3.5877,
|
| 12291 |
+
"step": 157400
|
| 12292 |
+
},
|
| 12293 |
+
{
|
| 12294 |
+
"epoch": 0.0320943965974358,
|
| 12295 |
+
"grad_norm": 3.1525375843048096,
|
| 12296 |
+
"learning_rate": 3.037157675699814e-05,
|
| 12297 |
+
"loss": 3.5742,
|
| 12298 |
+
"step": 157500
|
| 12299 |
+
},
|
| 12300 |
+
{
|
| 12301 |
+
"epoch": 0.03237347830697872,
|
| 12302 |
+
"grad_norm": 3.253023386001587,
|
| 12303 |
+
"learning_rate": 3.034995128063651e-05,
|
| 12304 |
+
"loss": 3.5823,
|
| 12305 |
+
"step": 157600
|
| 12306 |
+
},
|
| 12307 |
+
{
|
| 12308 |
+
"epoch": 0.032652560016521635,
|
| 12309 |
+
"grad_norm": 3.126237154006958,
|
| 12310 |
+
"learning_rate": 3.0328321608193427e-05,
|
| 12311 |
+
"loss": 3.5695,
|
| 12312 |
+
"step": 157700
|
| 12313 |
+
},
|
| 12314 |
+
{
|
| 12315 |
+
"epoch": 0.03293164172606456,
|
| 12316 |
+
"grad_norm": 2.9912712574005127,
|
| 12317 |
+
"learning_rate": 3.030668775663347e-05,
|
| 12318 |
+
"loss": 3.5762,
|
| 12319 |
+
"step": 157800
|
| 12320 |
+
},
|
| 12321 |
+
{
|
| 12322 |
+
"epoch": 0.03321072343560748,
|
| 12323 |
+
"grad_norm": 3.2612810134887695,
|
| 12324 |
+
"learning_rate": 3.0285049742924564e-05,
|
| 12325 |
+
"loss": 3.551,
|
| 12326 |
+
"step": 157900
|
| 12327 |
+
},
|
| 12328 |
+
{
|
| 12329 |
+
"epoch": 0.033489805145150396,
|
| 12330 |
+
"grad_norm": 3.226860761642456,
|
| 12331 |
+
"learning_rate": 3.026340758403785e-05,
|
| 12332 |
+
"loss": 3.5442,
|
| 12333 |
+
"step": 158000
|
| 12334 |
+
},
|
| 12335 |
+
{
|
| 12336 |
+
"epoch": 0.033489805145150396,
|
| 12337 |
+
"eval_loss": 2.1677935123443604,
|
| 12338 |
+
"eval_runtime": 52.5523,
|
| 12339 |
+
"eval_samples_per_second": 193.978,
|
| 12340 |
+
"eval_steps_per_second": 1.522,
|
| 12341 |
+
"step": 158000
|
| 12342 |
+
},
|
| 12343 |
+
{
|
| 12344 |
+
"epoch": 0.033768886854693314,
|
| 12345 |
+
"grad_norm": 3.011373281478882,
|
| 12346 |
+
"learning_rate": 3.024176129694774e-05,
|
| 12347 |
+
"loss": 3.5603,
|
| 12348 |
+
"step": 158100
|
| 12349 |
+
},
|
| 12350 |
+
{
|
| 12351 |
+
"epoch": 0.03404796856423624,
|
| 12352 |
+
"grad_norm": 3.067375898361206,
|
| 12353 |
+
"learning_rate": 3.022011089863187e-05,
|
| 12354 |
+
"loss": 3.5734,
|
| 12355 |
+
"step": 158200
|
| 12356 |
+
},
|
| 12357 |
+
{
|
| 12358 |
+
"epoch": 0.03432705027377916,
|
| 12359 |
+
"grad_norm": 3.1003239154815674,
|
| 12360 |
+
"learning_rate": 3.0198456406071134e-05,
|
| 12361 |
+
"loss": 3.5688,
|
| 12362 |
+
"step": 158300
|
| 12363 |
+
},
|
| 12364 |
+
{
|
| 12365 |
+
"epoch": 0.034606131983322075,
|
| 12366 |
+
"grad_norm": 2.9454071521759033,
|
| 12367 |
+
"learning_rate": 3.017679783624959e-05,
|
| 12368 |
+
"loss": 3.5617,
|
| 12369 |
+
"step": 158400
|
| 12370 |
+
},
|
| 12371 |
+
{
|
| 12372 |
+
"epoch": 0.034885213692865,
|
| 12373 |
+
"grad_norm": 3.2112362384796143,
|
| 12374 |
+
"learning_rate": 3.015513520615455e-05,
|
| 12375 |
+
"loss": 3.5651,
|
| 12376 |
+
"step": 158500
|
| 12377 |
+
},
|
| 12378 |
+
{
|
| 12379 |
+
"epoch": 0.03516429540240792,
|
| 12380 |
+
"grad_norm": 3.0805153846740723,
|
| 12381 |
+
"learning_rate": 3.0133468532776454e-05,
|
| 12382 |
+
"loss": 3.555,
|
| 12383 |
+
"step": 158600
|
| 12384 |
+
},
|
| 12385 |
+
{
|
| 12386 |
+
"epoch": 0.035443377111950836,
|
| 12387 |
+
"grad_norm": 3.0264370441436768,
|
| 12388 |
+
"learning_rate": 3.011179783310894e-05,
|
| 12389 |
+
"loss": 3.5537,
|
| 12390 |
+
"step": 158700
|
| 12391 |
+
},
|
| 12392 |
+
{
|
| 12393 |
+
"epoch": 0.035722458821493754,
|
| 12394 |
+
"grad_norm": 3.0615999698638916,
|
| 12395 |
+
"learning_rate": 3.0090123124148807e-05,
|
| 12396 |
+
"loss": 3.5466,
|
| 12397 |
+
"step": 158800
|
| 12398 |
+
},
|
| 12399 |
+
{
|
| 12400 |
+
"epoch": 0.03600154053103668,
|
| 12401 |
+
"grad_norm": 3.1815524101257324,
|
| 12402 |
+
"learning_rate": 3.0068444422896004e-05,
|
| 12403 |
+
"loss": 3.5535,
|
| 12404 |
+
"step": 158900
|
| 12405 |
+
},
|
| 12406 |
+
{
|
| 12407 |
+
"epoch": 0.036280622240579596,
|
| 12408 |
+
"grad_norm": 3.0562305450439453,
|
| 12409 |
+
"learning_rate": 3.004676174635358e-05,
|
| 12410 |
+
"loss": 3.5663,
|
| 12411 |
+
"step": 159000
|
| 12412 |
+
},
|
| 12413 |
+
{
|
| 12414 |
+
"epoch": 0.036280622240579596,
|
| 12415 |
+
"eval_loss": 2.173013925552368,
|
| 12416 |
+
"eval_runtime": 52.4542,
|
| 12417 |
+
"eval_samples_per_second": 194.341,
|
| 12418 |
+
"eval_steps_per_second": 1.525,
|
| 12419 |
+
"step": 159000
|
| 12420 |
+
},
|
| 12421 |
+
{
|
| 12422 |
+
"epoch": 0.036559703950122514,
|
| 12423 |
+
"grad_norm": 3.14689302444458,
|
| 12424 |
+
"learning_rate": 3.002507511152774e-05,
|
| 12425 |
+
"loss": 3.5568,
|
| 12426 |
+
"step": 159100
|
| 12427 |
+
},
|
| 12428 |
+
{
|
| 12429 |
+
"epoch": 0.03683878565966544,
|
| 12430 |
+
"grad_norm": 3.087399959564209,
|
| 12431 |
+
"learning_rate": 3.0003384535427765e-05,
|
| 12432 |
+
"loss": 3.557,
|
| 12433 |
+
"step": 159200
|
| 12434 |
+
},
|
| 12435 |
+
{
|
| 12436 |
+
"epoch": 0.03711786736920836,
|
| 12437 |
+
"grad_norm": 3.04844331741333,
|
| 12438 |
+
"learning_rate": 2.9981690035066057e-05,
|
| 12439 |
+
"loss": 3.5409,
|
| 12440 |
+
"step": 159300
|
| 12441 |
+
},
|
| 12442 |
+
{
|
| 12443 |
+
"epoch": 0.037396949078751275,
|
| 12444 |
+
"grad_norm": 3.2028706073760986,
|
| 12445 |
+
"learning_rate": 2.995999162745805e-05,
|
| 12446 |
+
"loss": 3.5761,
|
| 12447 |
+
"step": 159400
|
| 12448 |
+
},
|
| 12449 |
+
{
|
| 12450 |
+
"epoch": 0.0376760307882942,
|
| 12451 |
+
"grad_norm": 2.9123711585998535,
|
| 12452 |
+
"learning_rate": 2.99382893296223e-05,
|
| 12453 |
+
"loss": 3.5473,
|
| 12454 |
+
"step": 159500
|
| 12455 |
+
},
|
| 12456 |
+
{
|
| 12457 |
+
"epoch": 0.03795511249783712,
|
| 12458 |
+
"grad_norm": 3.165459156036377,
|
| 12459 |
+
"learning_rate": 2.9916583158580357e-05,
|
| 12460 |
+
"loss": 3.5596,
|
| 12461 |
+
"step": 159600
|
| 12462 |
+
},
|
| 12463 |
+
{
|
| 12464 |
+
"epoch": 0.038234194207380036,
|
| 12465 |
+
"grad_norm": 3.1565003395080566,
|
| 12466 |
+
"learning_rate": 2.989487313135686e-05,
|
| 12467 |
+
"loss": 3.5577,
|
| 12468 |
+
"step": 159700
|
| 12469 |
+
},
|
| 12470 |
+
{
|
| 12471 |
+
"epoch": 0.038513275916922954,
|
| 12472 |
+
"grad_norm": 3.1155638694763184,
|
| 12473 |
+
"learning_rate": 2.9873159264979433e-05,
|
| 12474 |
+
"loss": 3.5572,
|
| 12475 |
+
"step": 159800
|
| 12476 |
+
},
|
| 12477 |
+
{
|
| 12478 |
+
"epoch": 0.03879235762646588,
|
| 12479 |
+
"grad_norm": 3.1283843517303467,
|
| 12480 |
+
"learning_rate": 2.9851441576478734e-05,
|
| 12481 |
+
"loss": 3.5478,
|
| 12482 |
+
"step": 159900
|
| 12483 |
+
},
|
| 12484 |
+
{
|
| 12485 |
+
"epoch": 0.0390714393360088,
|
| 12486 |
+
"grad_norm": 3.2107434272766113,
|
| 12487 |
+
"learning_rate": 2.9829720082888406e-05,
|
| 12488 |
+
"loss": 3.5637,
|
| 12489 |
+
"step": 160000
|
| 12490 |
+
},
|
| 12491 |
+
{
|
| 12492 |
+
"epoch": 0.0390714393360088,
|
| 12493 |
+
"eval_loss": 2.1652143001556396,
|
| 12494 |
+
"eval_runtime": 52.479,
|
| 12495 |
+
"eval_samples_per_second": 194.249,
|
| 12496 |
+
"eval_steps_per_second": 1.524,
|
| 12497 |
+
"step": 160000
|
| 12498 |
+
},
|
| 12499 |
+
{
|
| 12500 |
+
"epoch": 0.00027908170954291995,
|
| 12501 |
+
"grad_norm": 1.561998724937439,
|
| 12502 |
+
"learning_rate": 2.9807994801245094e-05,
|
| 12503 |
+
"loss": 1.7734,
|
| 12504 |
+
"step": 160100
|
| 12505 |
+
},
|
| 12506 |
+
{
|
| 12507 |
+
"epoch": 0.0005581634190858399,
|
| 12508 |
+
"grad_norm": 1.5832147598266602,
|
| 12509 |
+
"learning_rate": 2.9786265748588383e-05,
|
| 12510 |
+
"loss": 1.7793,
|
| 12511 |
+
"step": 160200
|
| 12512 |
+
},
|
| 12513 |
+
{
|
| 12514 |
+
"epoch": 0.0008372451286287599,
|
| 12515 |
+
"grad_norm": 1.5801053047180176,
|
| 12516 |
+
"learning_rate": 2.9764532941960848e-05,
|
| 12517 |
+
"loss": 1.7738,
|
| 12518 |
+
"step": 160300
|
| 12519 |
+
},
|
| 12520 |
+
{
|
| 12521 |
+
"epoch": 0.0011163268381716798,
|
| 12522 |
+
"grad_norm": 1.5710026025772095,
|
| 12523 |
+
"learning_rate": 2.9742796398407996e-05,
|
| 12524 |
+
"loss": 1.7729,
|
| 12525 |
+
"step": 160400
|
| 12526 |
+
},
|
| 12527 |
+
{
|
| 12528 |
+
"epoch": 0.0013954085477146,
|
| 12529 |
+
"grad_norm": 1.5338846445083618,
|
| 12530 |
+
"learning_rate": 2.9721056134978263e-05,
|
| 12531 |
+
"loss": 1.7725,
|
| 12532 |
+
"step": 160500
|
| 12533 |
+
},
|
| 12534 |
+
{
|
| 12535 |
+
"epoch": 0.0016744902572575198,
|
| 12536 |
+
"grad_norm": 1.5545586347579956,
|
| 12537 |
+
"learning_rate": 2.9699312168722998e-05,
|
| 12538 |
+
"loss": 1.7748,
|
| 12539 |
+
"step": 160600
|
| 12540 |
+
},
|
| 12541 |
+
{
|
| 12542 |
+
"epoch": 0.00195357196680044,
|
| 12543 |
+
"grad_norm": 1.5022891759872437,
|
| 12544 |
+
"learning_rate": 2.967756451669646e-05,
|
| 12545 |
+
"loss": 1.7757,
|
| 12546 |
+
"step": 160700
|
| 12547 |
+
},
|
| 12548 |
+
{
|
| 12549 |
+
"epoch": 0.0022326536763433596,
|
| 12550 |
+
"grad_norm": 1.5773268938064575,
|
| 12551 |
+
"learning_rate": 2.9655813195955808e-05,
|
| 12552 |
+
"loss": 1.7746,
|
| 12553 |
+
"step": 160800
|
| 12554 |
+
},
|
| 12555 |
+
{
|
| 12556 |
+
"epoch": 0.0025117353858862797,
|
| 12557 |
+
"grad_norm": 1.5689061880111694,
|
| 12558 |
+
"learning_rate": 2.9634058223561058e-05,
|
| 12559 |
+
"loss": 1.7767,
|
| 12560 |
+
"step": 160900
|
| 12561 |
+
},
|
| 12562 |
+
{
|
| 12563 |
+
"epoch": 0.0027908170954292,
|
| 12564 |
+
"grad_norm": 1.523888349533081,
|
| 12565 |
+
"learning_rate": 2.9612299616575108e-05,
|
| 12566 |
+
"loss": 1.7725,
|
| 12567 |
+
"step": 161000
|
| 12568 |
+
},
|
| 12569 |
+
{
|
| 12570 |
+
"epoch": 0.0027908170954292,
|
| 12571 |
+
"eval_loss": 2.154878616333008,
|
| 12572 |
+
"eval_runtime": 52.3123,
|
| 12573 |
+
"eval_samples_per_second": 194.868,
|
| 12574 |
+
"eval_steps_per_second": 1.529,
|
| 12575 |
+
"step": 161000
|
| 12576 |
+
},
|
| 12577 |
+
{
|
| 12578 |
+
"epoch": 0.00306989880497212,
|
| 12579 |
+
"grad_norm": 1.5541263818740845,
|
| 12580 |
+
"learning_rate": 2.9590537392063693e-05,
|
| 12581 |
+
"loss": 1.775,
|
| 12582 |
+
"step": 161100
|
| 12583 |
+
},
|
| 12584 |
+
{
|
| 12585 |
+
"epoch": 0.0033489805145150396,
|
| 12586 |
+
"grad_norm": 1.542082667350769,
|
| 12587 |
+
"learning_rate": 2.9568771567095403e-05,
|
| 12588 |
+
"loss": 1.775,
|
| 12589 |
+
"step": 161200
|
| 12590 |
+
},
|
| 12591 |
+
{
|
| 12592 |
+
"epoch": 0.0036280622240579597,
|
| 12593 |
+
"grad_norm": 1.6016403436660767,
|
| 12594 |
+
"learning_rate": 2.9547002158741637e-05,
|
| 12595 |
+
"loss": 1.7809,
|
| 12596 |
+
"step": 161300
|
| 12597 |
+
},
|
| 12598 |
+
{
|
| 12599 |
+
"epoch": 0.00390714393360088,
|
| 12600 |
+
"grad_norm": 1.604466438293457,
|
| 12601 |
+
"learning_rate": 2.952522918407661e-05,
|
| 12602 |
+
"loss": 1.7691,
|
| 12603 |
+
"step": 161400
|
| 12604 |
+
},
|
| 12605 |
+
{
|
| 12606 |
+
"epoch": 0.0041862256431437995,
|
| 12607 |
+
"grad_norm": 1.507333755493164,
|
| 12608 |
+
"learning_rate": 2.950345266017732e-05,
|
| 12609 |
+
"loss": 1.7706,
|
| 12610 |
+
"step": 161500
|
| 12611 |
+
},
|
| 12612 |
+
{
|
| 12613 |
+
"epoch": 0.004465307352686719,
|
| 12614 |
+
"grad_norm": 1.6002788543701172,
|
| 12615 |
+
"learning_rate": 2.948167260412358e-05,
|
| 12616 |
+
"loss": 1.7947,
|
| 12617 |
+
"step": 161600
|
| 12618 |
+
},
|
| 12619 |
+
{
|
| 12620 |
+
"epoch": 0.00474438906222964,
|
| 12621 |
+
"grad_norm": 1.5560261011123657,
|
| 12622 |
+
"learning_rate": 2.9459889032997933e-05,
|
| 12623 |
+
"loss": 1.7981,
|
| 12624 |
+
"step": 161700
|
| 12625 |
+
},
|
| 12626 |
+
{
|
| 12627 |
+
"epoch": 0.005023470771772559,
|
| 12628 |
+
"grad_norm": 1.665317177772522,
|
| 12629 |
+
"learning_rate": 2.9438101963885728e-05,
|
| 12630 |
+
"loss": 1.7923,
|
| 12631 |
+
"step": 161800
|
| 12632 |
+
},
|
| 12633 |
+
{
|
| 12634 |
+
"epoch": 0.00530255248131548,
|
| 12635 |
+
"grad_norm": 1.5762344598770142,
|
| 12636 |
+
"learning_rate": 2.9416311413875008e-05,
|
| 12637 |
+
"loss": 1.7968,
|
| 12638 |
+
"step": 161900
|
| 12639 |
+
},
|
| 12640 |
+
{
|
| 12641 |
+
"epoch": 0.0055816341908584,
|
| 12642 |
+
"grad_norm": 1.553109884262085,
|
| 12643 |
+
"learning_rate": 2.9394517400056583e-05,
|
| 12644 |
+
"loss": 1.7948,
|
| 12645 |
+
"step": 162000
|
| 12646 |
+
},
|
| 12647 |
+
{
|
| 12648 |
+
"epoch": 0.0055816341908584,
|
| 12649 |
+
"eval_loss": 2.1695141792297363,
|
| 12650 |
+
"eval_runtime": 51.7758,
|
| 12651 |
+
"eval_samples_per_second": 196.887,
|
| 12652 |
+
"eval_steps_per_second": 1.545,
|
| 12653 |
+
"step": 162000
|
| 12654 |
+
},
|
| 12655 |
+
{
|
| 12656 |
+
"epoch": 0.005860715900401319,
|
| 12657 |
+
"grad_norm": 1.566721796989441,
|
| 12658 |
+
"learning_rate": 2.937271993952395e-05,
|
| 12659 |
+
"loss": 1.7959,
|
| 12660 |
+
"step": 162100
|
| 12661 |
+
},
|
| 12662 |
+
{
|
| 12663 |
+
"epoch": 0.00613979760994424,
|
| 12664 |
+
"grad_norm": 1.5952585935592651,
|
| 12665 |
+
"learning_rate": 2.9350919049373343e-05,
|
| 12666 |
+
"loss": 1.7892,
|
| 12667 |
+
"step": 162200
|
| 12668 |
+
},
|
| 12669 |
+
{
|
| 12670 |
+
"epoch": 0.0064188793194871595,
|
| 12671 |
+
"grad_norm": 1.5383343696594238,
|
| 12672 |
+
"learning_rate": 2.932911474670365e-05,
|
| 12673 |
+
"loss": 1.7918,
|
| 12674 |
+
"step": 162300
|
| 12675 |
+
},
|
| 12676 |
+
{
|
| 12677 |
+
"epoch": 0.006697961029030079,
|
| 12678 |
+
"grad_norm": 1.5342581272125244,
|
| 12679 |
+
"learning_rate": 2.9307307048616468e-05,
|
| 12680 |
+
"loss": 1.7815,
|
| 12681 |
+
"step": 162400
|
| 12682 |
+
},
|
| 12683 |
+
{
|
| 12684 |
+
"epoch": 0.006977042738573,
|
| 12685 |
+
"grad_norm": 1.533451795578003,
|
| 12686 |
+
"learning_rate": 2.9285495972216027e-05,
|
| 12687 |
+
"loss": 1.7834,
|
| 12688 |
+
"step": 162500
|
| 12689 |
+
},
|
| 12690 |
+
{
|
| 12691 |
+
"epoch": 0.0072561244481159195,
|
| 12692 |
+
"grad_norm": 1.5433770418167114,
|
| 12693 |
+
"learning_rate": 2.9263681534609233e-05,
|
| 12694 |
+
"loss": 1.7886,
|
| 12695 |
+
"step": 162600
|
| 12696 |
+
},
|
| 12697 |
+
{
|
| 12698 |
+
"epoch": 0.007535206157658839,
|
| 12699 |
+
"grad_norm": 1.5695182085037231,
|
| 12700 |
+
"learning_rate": 2.924186375290562e-05,
|
| 12701 |
+
"loss": 1.7934,
|
| 12702 |
+
"step": 162700
|
| 12703 |
+
},
|
| 12704 |
+
{
|
| 12705 |
+
"epoch": 0.00781428786720176,
|
| 12706 |
+
"grad_norm": 1.5996042490005493,
|
| 12707 |
+
"learning_rate": 2.922004264421733e-05,
|
| 12708 |
+
"loss": 1.7896,
|
| 12709 |
+
"step": 162800
|
| 12710 |
+
},
|
| 12711 |
+
{
|
| 12712 |
+
"epoch": 0.00809336957674468,
|
| 12713 |
+
"grad_norm": 1.4973070621490479,
|
| 12714 |
+
"learning_rate": 2.919821822565913e-05,
|
| 12715 |
+
"loss": 1.7862,
|
| 12716 |
+
"step": 162900
|
| 12717 |
+
},
|
| 12718 |
+
{
|
| 12719 |
+
"epoch": 0.008372451286287599,
|
| 12720 |
+
"grad_norm": 1.6223851442337036,
|
| 12721 |
+
"learning_rate": 2.9176390514348384e-05,
|
| 12722 |
+
"loss": 1.7797,
|
| 12723 |
+
"step": 163000
|
| 12724 |
+
},
|
| 12725 |
+
{
|
| 12726 |
+
"epoch": 0.008372451286287599,
|
| 12727 |
+
"eval_loss": 2.1567208766937256,
|
| 12728 |
+
"eval_runtime": 51.8158,
|
| 12729 |
+
"eval_samples_per_second": 196.735,
|
| 12730 |
+
"eval_steps_per_second": 1.544,
|
| 12731 |
+
"step": 163000
|
| 12732 |
+
},
|
| 12733 |
+
{
|
| 12734 |
+
"epoch": 0.008651532995830519,
|
| 12735 |
+
"grad_norm": 1.5375715494155884,
|
| 12736 |
+
"learning_rate": 2.915455952740503e-05,
|
| 12737 |
+
"loss": 1.7847,
|
| 12738 |
+
"step": 163100
|
| 12739 |
+
},
|
| 12740 |
+
{
|
| 12741 |
+
"epoch": 0.008930614705373438,
|
| 12742 |
+
"grad_norm": 1.4768236875534058,
|
| 12743 |
+
"learning_rate": 2.9132725281951584e-05,
|
| 12744 |
+
"loss": 1.7804,
|
| 12745 |
+
"step": 163200
|
| 12746 |
+
},
|
| 12747 |
+
{
|
| 12748 |
+
"epoch": 0.00920969641491636,
|
| 12749 |
+
"grad_norm": 1.583068609237671,
|
| 12750 |
+
"learning_rate": 2.9110887795113108e-05,
|
| 12751 |
+
"loss": 1.785,
|
| 12752 |
+
"step": 163300
|
| 12753 |
+
},
|
| 12754 |
+
{
|
| 12755 |
+
"epoch": 0.00948877812445928,
|
| 12756 |
+
"grad_norm": 1.5692275762557983,
|
| 12757 |
+
"learning_rate": 2.9089047084017206e-05,
|
| 12758 |
+
"loss": 1.7824,
|
| 12759 |
+
"step": 163400
|
| 12760 |
+
},
|
| 12761 |
+
{
|
| 12762 |
+
"epoch": 0.0097678598340022,
|
| 12763 |
+
"grad_norm": 1.5128995180130005,
|
| 12764 |
+
"learning_rate": 2.9067203165794028e-05,
|
| 12765 |
+
"loss": 1.7888,
|
| 12766 |
+
"step": 163500
|
| 12767 |
+
},
|
| 12768 |
+
{
|
| 12769 |
+
"epoch": 0.010046941543545119,
|
| 12770 |
+
"grad_norm": 1.5675851106643677,
|
| 12771 |
+
"learning_rate": 2.904535605757622e-05,
|
| 12772 |
+
"loss": 1.7761,
|
| 12773 |
+
"step": 163600
|
| 12774 |
+
},
|
| 12775 |
+
{
|
| 12776 |
+
"epoch": 0.010326023253088039,
|
| 12777 |
+
"grad_norm": 1.5943450927734375,
|
| 12778 |
+
"learning_rate": 2.902350577649894e-05,
|
| 12779 |
+
"loss": 1.7837,
|
| 12780 |
+
"step": 163700
|
| 12781 |
+
},
|
| 12782 |
+
{
|
| 12783 |
+
"epoch": 0.01060510496263096,
|
| 12784 |
+
"grad_norm": 1.4797823429107666,
|
| 12785 |
+
"learning_rate": 2.9001652339699818e-05,
|
| 12786 |
+
"loss": 1.7785,
|
| 12787 |
+
"step": 163800
|
| 12788 |
+
},
|
| 12789 |
+
{
|
| 12790 |
+
"epoch": 0.01088418667217388,
|
| 12791 |
+
"grad_norm": 1.5837212800979614,
|
| 12792 |
+
"learning_rate": 2.8979795764319007e-05,
|
| 12793 |
+
"loss": 1.7769,
|
| 12794 |
+
"step": 163900
|
| 12795 |
+
},
|
| 12796 |
+
{
|
| 12797 |
+
"epoch": 0.0111632683817168,
|
| 12798 |
+
"grad_norm": 1.5809427499771118,
|
| 12799 |
+
"learning_rate": 2.8957936067499054e-05,
|
| 12800 |
+
"loss": 1.7876,
|
| 12801 |
+
"step": 164000
|
| 12802 |
+
},
|
| 12803 |
+
{
|
| 12804 |
+
"epoch": 0.0111632683817168,
|
| 12805 |
+
"eval_loss": 2.157485246658325,
|
| 12806 |
+
"eval_runtime": 51.8186,
|
| 12807 |
+
"eval_samples_per_second": 196.725,
|
| 12808 |
+
"eval_steps_per_second": 1.544,
|
| 12809 |
+
"step": 164000
|
| 12810 |
+
},
|
| 12811 |
+
{
|
| 12812 |
+
"epoch": 0.011442350091259719,
|
| 12813 |
+
"grad_norm": 1.4939141273498535,
|
| 12814 |
+
"learning_rate": 2.8936073266385e-05,
|
| 12815 |
+
"loss": 1.7851,
|
| 12816 |
+
"step": 164100
|
| 12817 |
+
},
|
| 12818 |
+
{
|
| 12819 |
+
"epoch": 0.011721431800802639,
|
| 12820 |
+
"grad_norm": 1.6713926792144775,
|
| 12821 |
+
"learning_rate": 2.8914207378124304e-05,
|
| 12822 |
+
"loss": 1.7852,
|
| 12823 |
+
"step": 164200
|
| 12824 |
+
},
|
| 12825 |
+
{
|
| 12826 |
+
"epoch": 0.012000513510345558,
|
| 12827 |
+
"grad_norm": 1.526655673980713,
|
| 12828 |
+
"learning_rate": 2.889233841986686e-05,
|
| 12829 |
+
"loss": 1.7744,
|
| 12830 |
+
"step": 164300
|
| 12831 |
+
},
|
| 12832 |
+
{
|
| 12833 |
+
"epoch": 0.01227959521988848,
|
| 12834 |
+
"grad_norm": 1.674926996231079,
|
| 12835 |
+
"learning_rate": 2.8870466408764952e-05,
|
| 12836 |
+
"loss": 1.7761,
|
| 12837 |
+
"step": 164400
|
| 12838 |
+
},
|
| 12839 |
+
{
|
| 12840 |
+
"epoch": 0.0125586769294314,
|
| 12841 |
+
"grad_norm": 1.5404378175735474,
|
| 12842 |
+
"learning_rate": 2.8848591361973278e-05,
|
| 12843 |
+
"loss": 1.7889,
|
| 12844 |
+
"step": 164500
|
| 12845 |
+
},
|
| 12846 |
+
{
|
| 12847 |
+
"epoch": 0.012837758638974319,
|
| 12848 |
+
"grad_norm": 1.5554423332214355,
|
| 12849 |
+
"learning_rate": 2.88267132966489e-05,
|
| 12850 |
+
"loss": 1.7876,
|
| 12851 |
+
"step": 164600
|
| 12852 |
+
},
|
| 12853 |
+
{
|
| 12854 |
+
"epoch": 0.013116840348517239,
|
| 12855 |
+
"grad_norm": 1.595197319984436,
|
| 12856 |
+
"learning_rate": 2.880483222995125e-05,
|
| 12857 |
+
"loss": 1.7806,
|
| 12858 |
+
"step": 164700
|
| 12859 |
+
},
|
| 12860 |
+
{
|
| 12861 |
+
"epoch": 0.013395922058060158,
|
| 12862 |
+
"grad_norm": 1.611559510231018,
|
| 12863 |
+
"learning_rate": 2.8782948179042114e-05,
|
| 12864 |
+
"loss": 1.7856,
|
| 12865 |
+
"step": 164800
|
| 12866 |
+
},
|
| 12867 |
+
{
|
| 12868 |
+
"epoch": 0.013675003767603078,
|
| 12869 |
+
"grad_norm": 1.622501015663147,
|
| 12870 |
+
"learning_rate": 2.876106116108564e-05,
|
| 12871 |
+
"loss": 1.7838,
|
| 12872 |
+
"step": 164900
|
| 12873 |
+
},
|
| 12874 |
+
{
|
| 12875 |
+
"epoch": 0.013954085477146,
|
| 12876 |
+
"grad_norm": 1.5229750871658325,
|
| 12877 |
+
"learning_rate": 2.873917119324826e-05,
|
| 12878 |
+
"loss": 1.7851,
|
| 12879 |
+
"step": 165000
|
| 12880 |
+
},
|
| 12881 |
+
{
|
| 12882 |
+
"epoch": 0.013954085477146,
|
| 12883 |
+
"eval_loss": 2.1587064266204834,
|
| 12884 |
+
"eval_runtime": 51.9184,
|
| 12885 |
+
"eval_samples_per_second": 196.347,
|
| 12886 |
+
"eval_steps_per_second": 1.541,
|
| 12887 |
+
"step": 165000
|
| 12888 |
+
},
|
| 12889 |
+
{
|
| 12890 |
+
"epoch": 0.01423316718668892,
|
| 12891 |
+
"grad_norm": 1.5421415567398071,
|
| 12892 |
+
"learning_rate": 2.8717278292698767e-05,
|
| 12893 |
+
"loss": 1.7853,
|
| 12894 |
+
"step": 165100
|
| 12895 |
+
},
|
| 12896 |
+
{
|
| 12897 |
+
"epoch": 0.014512248896231839,
|
| 12898 |
+
"grad_norm": 1.598402976989746,
|
| 12899 |
+
"learning_rate": 2.8695382476608228e-05,
|
| 12900 |
+
"loss": 1.7886,
|
| 12901 |
+
"step": 165200
|
| 12902 |
+
},
|
| 12903 |
+
{
|
| 12904 |
+
"epoch": 0.014791330605774759,
|
| 12905 |
+
"grad_norm": 1.5251154899597168,
|
| 12906 |
+
"learning_rate": 2.867348376215e-05,
|
| 12907 |
+
"loss": 1.7885,
|
| 12908 |
+
"step": 165300
|
| 12909 |
+
},
|
| 12910 |
+
{
|
| 12911 |
+
"epoch": 0.015070412315317678,
|
| 12912 |
+
"grad_norm": 1.554371953010559,
|
| 12913 |
+
"learning_rate": 2.86515821664997e-05,
|
| 12914 |
+
"loss": 1.7831,
|
| 12915 |
+
"step": 165400
|
| 12916 |
+
},
|
| 12917 |
+
{
|
| 12918 |
+
"epoch": 0.015349494024860598,
|
| 12919 |
+
"grad_norm": 1.606136679649353,
|
| 12920 |
+
"learning_rate": 2.8629677706835234e-05,
|
| 12921 |
+
"loss": 1.7672,
|
| 12922 |
+
"step": 165500
|
| 12923 |
+
},
|
| 12924 |
+
{
|
| 12925 |
+
"epoch": 0.01562857573440352,
|
| 12926 |
+
"grad_norm": 1.5520561933517456,
|
| 12927 |
+
"learning_rate": 2.8607770400336738e-05,
|
| 12928 |
+
"loss": 1.7775,
|
| 12929 |
+
"step": 165600
|
| 12930 |
+
},
|
| 12931 |
+
{
|
| 12932 |
+
"epoch": 0.015907657443946437,
|
| 12933 |
+
"grad_norm": 1.5017564296722412,
|
| 12934 |
+
"learning_rate": 2.8585860264186582e-05,
|
| 12935 |
+
"loss": 1.7837,
|
| 12936 |
+
"step": 165700
|
| 12937 |
+
},
|
| 12938 |
+
{
|
| 12939 |
+
"epoch": 0.01618673915348936,
|
| 12940 |
+
"grad_norm": 1.5462771654129028,
|
| 12941 |
+
"learning_rate": 2.8563947315569346e-05,
|
| 12942 |
+
"loss": 1.7757,
|
| 12943 |
+
"step": 165800
|
| 12944 |
+
},
|
| 12945 |
+
{
|
| 12946 |
+
"epoch": 0.01646582086303228,
|
| 12947 |
+
"grad_norm": 1.519423484802246,
|
| 12948 |
+
"learning_rate": 2.8542031571671833e-05,
|
| 12949 |
+
"loss": 1.7737,
|
| 12950 |
+
"step": 165900
|
| 12951 |
+
},
|
| 12952 |
+
{
|
| 12953 |
+
"epoch": 0.016744902572575198,
|
| 12954 |
+
"grad_norm": 1.552426815032959,
|
| 12955 |
+
"learning_rate": 2.852011304968304e-05,
|
| 12956 |
+
"loss": 1.7845,
|
| 12957 |
+
"step": 166000
|
| 12958 |
+
},
|
| 12959 |
+
{
|
| 12960 |
+
"epoch": 0.016744902572575198,
|
| 12961 |
+
"eval_loss": 2.1654627323150635,
|
| 12962 |
+
"eval_runtime": 51.7515,
|
| 12963 |
+
"eval_samples_per_second": 196.98,
|
| 12964 |
+
"eval_steps_per_second": 1.546,
|
| 12965 |
+
"step": 166000
|
| 12966 |
+
},
|
| 12967 |
+
{
|
| 12968 |
+
"epoch": 0.01702398428211812,
|
| 12969 |
+
"grad_norm": 1.6090401411056519,
|
| 12970 |
+
"learning_rate": 2.849819176679412e-05,
|
| 12971 |
+
"loss": 1.7792,
|
| 12972 |
+
"step": 166100
|
| 12973 |
+
},
|
| 12974 |
+
{
|
| 12975 |
+
"epoch": 0.017303065991661037,
|
| 12976 |
+
"grad_norm": 1.4991530179977417,
|
| 12977 |
+
"learning_rate": 2.8476267740198403e-05,
|
| 12978 |
+
"loss": 1.7757,
|
| 12979 |
+
"step": 166200
|
| 12980 |
+
},
|
| 12981 |
+
{
|
| 12982 |
+
"epoch": 0.01758214770120396,
|
| 12983 |
+
"grad_norm": 1.545792579650879,
|
| 12984 |
+
"learning_rate": 2.8454340987091382e-05,
|
| 12985 |
+
"loss": 1.7782,
|
| 12986 |
+
"step": 166300
|
| 12987 |
+
},
|
| 12988 |
+
{
|
| 12989 |
+
"epoch": 0.017861229410746877,
|
| 12990 |
+
"grad_norm": 1.5758668184280396,
|
| 12991 |
+
"learning_rate": 2.8432411524670675e-05,
|
| 12992 |
+
"loss": 1.7627,
|
| 12993 |
+
"step": 166400
|
| 12994 |
+
},
|
| 12995 |
+
{
|
| 12996 |
+
"epoch": 0.018140311120289798,
|
| 12997 |
+
"grad_norm": 1.5638821125030518,
|
| 12998 |
+
"learning_rate": 2.8410479370136035e-05,
|
| 12999 |
+
"loss": 1.7816,
|
| 13000 |
+
"step": 166500
|
| 13001 |
+
},
|
| 13002 |
+
{
|
| 13003 |
+
"epoch": 0.01841939282983272,
|
| 13004 |
+
"grad_norm": 1.6477131843566895,
|
| 13005 |
+
"learning_rate": 2.8388544540689314e-05,
|
| 13006 |
+
"loss": 1.7814,
|
| 13007 |
+
"step": 166600
|
| 13008 |
+
},
|
| 13009 |
+
{
|
| 13010 |
+
"epoch": 0.018698474539375638,
|
| 13011 |
+
"grad_norm": 1.5519869327545166,
|
| 13012 |
+
"learning_rate": 2.836660705353447e-05,
|
| 13013 |
+
"loss": 1.7747,
|
| 13014 |
+
"step": 166700
|
| 13015 |
+
},
|
| 13016 |
+
{
|
| 13017 |
+
"epoch": 0.01897755624891856,
|
| 13018 |
+
"grad_norm": 1.5598399639129639,
|
| 13019 |
+
"learning_rate": 2.8344666925877556e-05,
|
| 13020 |
+
"loss": 1.7778,
|
| 13021 |
+
"step": 166800
|
| 13022 |
+
},
|
| 13023 |
+
{
|
| 13024 |
+
"epoch": 0.019256637958461477,
|
| 13025 |
+
"grad_norm": 1.5361994504928589,
|
| 13026 |
+
"learning_rate": 2.8322724174926664e-05,
|
| 13027 |
+
"loss": 1.7796,
|
| 13028 |
+
"step": 166900
|
| 13029 |
+
},
|
| 13030 |
+
{
|
| 13031 |
+
"epoch": 0.0195357196680044,
|
| 13032 |
+
"grad_norm": 1.5680959224700928,
|
| 13033 |
+
"learning_rate": 2.8300778817891976e-05,
|
| 13034 |
+
"loss": 1.7742,
|
| 13035 |
+
"step": 167000
|
| 13036 |
+
},
|
| 13037 |
+
{
|
| 13038 |
+
"epoch": 0.0195357196680044,
|
| 13039 |
+
"eval_loss": 2.1607890129089355,
|
| 13040 |
+
"eval_runtime": 52.1406,
|
| 13041 |
+
"eval_samples_per_second": 195.51,
|
| 13042 |
+
"eval_steps_per_second": 1.534,
|
| 13043 |
+
"step": 167000
|
| 13044 |
}
|
| 13045 |
],
|
| 13046 |
"logging_steps": 100,
|
|
|
|
| 13060 |
"attributes": {}
|
| 13061 |
}
|
| 13062 |
},
|
| 13063 |
+
"total_flos": 1.4574492479127552e+19,
|
| 13064 |
"train_batch_size": 128,
|
| 13065 |
"trial_name": null,
|
| 13066 |
"trial_params": null
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9318402efc23f8b2e09dec877ba7b88863d76a00aceeef7c22f944e9f6a43e28
|
| 3 |
+
size 5777
|