felipeoes commited on
Commit
d20113e
·
verified ·
1 Parent(s): be76131

Model save

Browse files
README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-3.1-8B-Instruct
3
+ library_name: transformers
4
+ model_name: legis-llama3-1-8b-valid-arandu
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - sft
9
+ license: llama3.1
10
+ ---
11
+
12
+ # Model Card for legis-llama3-1-8b-valid-arandu
13
+
14
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="felipeoes/legis-llama3-1-8b-valid-arandu", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/felipealumni-usp/huggingface/runs/bm7zcvk0)
31
+
32
+ This model was trained with supervised fine-tuning (SFT).
33
+
34
+ ### Framework versions
35
+
36
+ - TRL: 0.12.1
37
+ - Transformers: 4.46.3
38
+ - PyTorch: 2.4.1
39
+ - Datasets: 3.1.0
40
+ - Tokenizers: 0.20.3
41
+
42
+ ## Citations
43
+
44
+
45
+
46
+ Cite TRL as:
47
+
48
+ ```bibtex
49
+ @misc{vonwerra2022trl,
50
+ title = {{TRL: Transformer Reinforcement Learning}},
51
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
52
+ year = 2020,
53
+ journal = {GitHub repository},
54
+ publisher = {GitHub},
55
+ howpublished = {\url{https://github.com/huggingface/trl}}
56
+ }
57
+ ```
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d19ba1fa534d12364c4c4b293afa935c5b3f07c518f21a5dec82873165c4e8bc
3
  size 2269178776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9568b9eff59730f694dac3f3bf24bcc7dc5e810d21f35eadeb2d3aa7c7150d4a
3
  size 2269178776
all_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9995600527936648,
3
+ "eval_samples": 117,
4
+ "total_flos": 7.211600370336793e+18,
5
+ "train_loss": 0.039691918463984004,
6
+ "train_runtime": 9596.3839,
7
+ "train_samples": 116076,
8
+ "train_samples_per_second": 1.895,
9
+ "train_steps_per_second": 0.118
10
+ }
train_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9995600527936648,
3
+ "eval_samples": 117,
4
+ "total_flos": 7.211600370336793e+18,
5
+ "train_loss": 0.039691918463984004,
6
+ "train_runtime": 9596.3839,
7
+ "train_samples": 116076,
8
+ "train_samples_per_second": 1.895,
9
+ "train_steps_per_second": 0.118
10
+ }
trainer_state.json ADDED
@@ -0,0 +1,3447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.439,
3
+ "best_model_checkpoint": "runs/legis-llama3-1-8b-valid-arandu/checkpoint-1120",
4
+ "epoch": 0.9995600527936648,
5
+ "eval_steps": 5,
6
+ "global_step": 1136,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.004399472063352398,
13
+ "grad_norm": 25.937191009521484,
14
+ "learning_rate": 8.771929824561403e-06,
15
+ "loss": 1.0992,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.004399472063352398,
20
+ "eval_loss": 1.1428982019424438,
21
+ "eval_runtime": 29.8805,
22
+ "eval_samples_per_second": 0.569,
23
+ "eval_steps_per_second": 0.301,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 0.008798944126704795,
28
+ "grad_norm": 32.52676773071289,
29
+ "learning_rate": 1.7543859649122806e-05,
30
+ "loss": 1.067,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 0.008798944126704795,
35
+ "eval_loss": 1.0669578313827515,
36
+ "eval_runtime": 28.5282,
37
+ "eval_samples_per_second": 0.596,
38
+ "eval_steps_per_second": 0.315,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.013198416190057193,
43
+ "grad_norm": 78.51001739501953,
44
+ "learning_rate": 2.6315789473684212e-05,
45
+ "loss": 1.0057,
46
+ "step": 15
47
+ },
48
+ {
49
+ "epoch": 0.013198416190057193,
50
+ "eval_loss": 1.0462743043899536,
51
+ "eval_runtime": 28.5697,
52
+ "eval_samples_per_second": 0.595,
53
+ "eval_steps_per_second": 0.315,
54
+ "step": 15
55
+ },
56
+ {
57
+ "epoch": 0.01759788825340959,
58
+ "grad_norm": 21.255964279174805,
59
+ "learning_rate": 3.508771929824561e-05,
60
+ "loss": 0.9236,
61
+ "step": 20
62
+ },
63
+ {
64
+ "epoch": 0.01759788825340959,
65
+ "eval_loss": 0.9604344367980957,
66
+ "eval_runtime": 28.6152,
67
+ "eval_samples_per_second": 0.594,
68
+ "eval_steps_per_second": 0.315,
69
+ "step": 20
70
+ },
71
+ {
72
+ "epoch": 0.02199736031676199,
73
+ "grad_norm": 1.3699233531951904,
74
+ "learning_rate": 4.3859649122807014e-05,
75
+ "loss": 0.8823,
76
+ "step": 25
77
+ },
78
+ {
79
+ "epoch": 0.02199736031676199,
80
+ "eval_loss": 0.9002779126167297,
81
+ "eval_runtime": 28.579,
82
+ "eval_samples_per_second": 0.595,
83
+ "eval_steps_per_second": 0.315,
84
+ "step": 25
85
+ },
86
+ {
87
+ "epoch": 0.026396832380114386,
88
+ "grad_norm": 2.50810170173645,
89
+ "learning_rate": 5.2631578947368424e-05,
90
+ "loss": 0.8144,
91
+ "step": 30
92
+ },
93
+ {
94
+ "epoch": 0.026396832380114386,
95
+ "eval_loss": 0.8441588878631592,
96
+ "eval_runtime": 28.4936,
97
+ "eval_samples_per_second": 0.597,
98
+ "eval_steps_per_second": 0.316,
99
+ "step": 30
100
+ },
101
+ {
102
+ "epoch": 0.030796304443466784,
103
+ "grad_norm": 1.6816316843032837,
104
+ "learning_rate": 6.140350877192983e-05,
105
+ "loss": 0.7829,
106
+ "step": 35
107
+ },
108
+ {
109
+ "epoch": 0.030796304443466784,
110
+ "eval_loss": 0.7928382754325867,
111
+ "eval_runtime": 28.5908,
112
+ "eval_samples_per_second": 0.595,
113
+ "eval_steps_per_second": 0.315,
114
+ "step": 35
115
+ },
116
+ {
117
+ "epoch": 0.03519577650681918,
118
+ "grad_norm": 0.5125584006309509,
119
+ "learning_rate": 7.017543859649122e-05,
120
+ "loss": 0.7075,
121
+ "step": 40
122
+ },
123
+ {
124
+ "epoch": 0.03519577650681918,
125
+ "eval_loss": 0.7538504600524902,
126
+ "eval_runtime": 28.5816,
127
+ "eval_samples_per_second": 0.595,
128
+ "eval_steps_per_second": 0.315,
129
+ "step": 40
130
+ },
131
+ {
132
+ "epoch": 0.039595248570171576,
133
+ "grad_norm": 0.36081045866012573,
134
+ "learning_rate": 7.894736842105263e-05,
135
+ "loss": 0.6776,
136
+ "step": 45
137
+ },
138
+ {
139
+ "epoch": 0.039595248570171576,
140
+ "eval_loss": 0.7313268184661865,
141
+ "eval_runtime": 28.6141,
142
+ "eval_samples_per_second": 0.594,
143
+ "eval_steps_per_second": 0.315,
144
+ "step": 45
145
+ },
146
+ {
147
+ "epoch": 0.04399472063352398,
148
+ "grad_norm": 0.32318177819252014,
149
+ "learning_rate": 8.771929824561403e-05,
150
+ "loss": 0.6499,
151
+ "step": 50
152
+ },
153
+ {
154
+ "epoch": 0.04399472063352398,
155
+ "eval_loss": 0.71351158618927,
156
+ "eval_runtime": 28.5766,
157
+ "eval_samples_per_second": 0.595,
158
+ "eval_steps_per_second": 0.315,
159
+ "step": 50
160
+ },
161
+ {
162
+ "epoch": 0.04839419269687637,
163
+ "grad_norm": 0.34377261996269226,
164
+ "learning_rate": 9.649122807017544e-05,
165
+ "loss": 0.6487,
166
+ "step": 55
167
+ },
168
+ {
169
+ "epoch": 0.04839419269687637,
170
+ "eval_loss": 0.7006722092628479,
171
+ "eval_runtime": 28.6048,
172
+ "eval_samples_per_second": 0.594,
173
+ "eval_steps_per_second": 0.315,
174
+ "step": 55
175
+ },
176
+ {
177
+ "epoch": 0.05279366476022877,
178
+ "grad_norm": 0.4360629618167877,
179
+ "learning_rate": 0.00010526315789473685,
180
+ "loss": 0.6405,
181
+ "step": 60
182
+ },
183
+ {
184
+ "epoch": 0.05279366476022877,
185
+ "eval_loss": 0.6905343532562256,
186
+ "eval_runtime": 28.5257,
187
+ "eval_samples_per_second": 0.596,
188
+ "eval_steps_per_second": 0.316,
189
+ "step": 60
190
+ },
191
+ {
192
+ "epoch": 0.05719313682358117,
193
+ "grad_norm": 0.28764936327934265,
194
+ "learning_rate": 0.00011403508771929824,
195
+ "loss": 0.6352,
196
+ "step": 65
197
+ },
198
+ {
199
+ "epoch": 0.05719313682358117,
200
+ "eval_loss": 0.68143630027771,
201
+ "eval_runtime": 28.6362,
202
+ "eval_samples_per_second": 0.594,
203
+ "eval_steps_per_second": 0.314,
204
+ "step": 65
205
+ },
206
+ {
207
+ "epoch": 0.06159260888693357,
208
+ "grad_norm": 0.34088754653930664,
209
+ "learning_rate": 0.00012280701754385965,
210
+ "loss": 0.6064,
211
+ "step": 70
212
+ },
213
+ {
214
+ "epoch": 0.06159260888693357,
215
+ "eval_loss": 0.6742813587188721,
216
+ "eval_runtime": 28.5667,
217
+ "eval_samples_per_second": 0.595,
218
+ "eval_steps_per_second": 0.315,
219
+ "step": 70
220
+ },
221
+ {
222
+ "epoch": 0.06599208095028597,
223
+ "grad_norm": 0.31284183263778687,
224
+ "learning_rate": 0.00013157894736842108,
225
+ "loss": 0.5924,
226
+ "step": 75
227
+ },
228
+ {
229
+ "epoch": 0.06599208095028597,
230
+ "eval_loss": 0.6679767966270447,
231
+ "eval_runtime": 28.461,
232
+ "eval_samples_per_second": 0.597,
233
+ "eval_steps_per_second": 0.316,
234
+ "step": 75
235
+ },
236
+ {
237
+ "epoch": 0.07039155301363836,
238
+ "grad_norm": 0.30470508337020874,
239
+ "learning_rate": 0.00014035087719298245,
240
+ "loss": 0.5992,
241
+ "step": 80
242
+ },
243
+ {
244
+ "epoch": 0.07039155301363836,
245
+ "eval_loss": 0.6631008386611938,
246
+ "eval_runtime": 28.6891,
247
+ "eval_samples_per_second": 0.593,
248
+ "eval_steps_per_second": 0.314,
249
+ "step": 80
250
+ },
251
+ {
252
+ "epoch": 0.07479102507699076,
253
+ "grad_norm": 0.3255262076854706,
254
+ "learning_rate": 0.00014912280701754387,
255
+ "loss": 0.5704,
256
+ "step": 85
257
+ },
258
+ {
259
+ "epoch": 0.07479102507699076,
260
+ "eval_loss": 0.658618688583374,
261
+ "eval_runtime": 28.6094,
262
+ "eval_samples_per_second": 0.594,
263
+ "eval_steps_per_second": 0.315,
264
+ "step": 85
265
+ },
266
+ {
267
+ "epoch": 0.07919049714034315,
268
+ "grad_norm": 0.31922295689582825,
269
+ "learning_rate": 0.00015789473684210527,
270
+ "loss": 0.6048,
271
+ "step": 90
272
+ },
273
+ {
274
+ "epoch": 0.07919049714034315,
275
+ "eval_loss": 0.6537344455718994,
276
+ "eval_runtime": 28.532,
277
+ "eval_samples_per_second": 0.596,
278
+ "eval_steps_per_second": 0.315,
279
+ "step": 90
280
+ },
281
+ {
282
+ "epoch": 0.08358996920369556,
283
+ "grad_norm": 0.45636337995529175,
284
+ "learning_rate": 0.0001666666666666667,
285
+ "loss": 0.613,
286
+ "step": 95
287
+ },
288
+ {
289
+ "epoch": 0.08358996920369556,
290
+ "eval_loss": 0.6501972079277039,
291
+ "eval_runtime": 28.6568,
292
+ "eval_samples_per_second": 0.593,
293
+ "eval_steps_per_second": 0.314,
294
+ "step": 95
295
+ },
296
+ {
297
+ "epoch": 0.08798944126704795,
298
+ "grad_norm": 0.29334941506385803,
299
+ "learning_rate": 0.00017543859649122806,
300
+ "loss": 0.5799,
301
+ "step": 100
302
+ },
303
+ {
304
+ "epoch": 0.08798944126704795,
305
+ "eval_loss": 0.6471393704414368,
306
+ "eval_runtime": 28.5997,
307
+ "eval_samples_per_second": 0.594,
308
+ "eval_steps_per_second": 0.315,
309
+ "step": 100
310
+ },
311
+ {
312
+ "epoch": 0.09238891333040035,
313
+ "grad_norm": 0.31318825483322144,
314
+ "learning_rate": 0.00018421052631578948,
315
+ "loss": 0.5887,
316
+ "step": 105
317
+ },
318
+ {
319
+ "epoch": 0.09238891333040035,
320
+ "eval_loss": 0.6440868377685547,
321
+ "eval_runtime": 28.6275,
322
+ "eval_samples_per_second": 0.594,
323
+ "eval_steps_per_second": 0.314,
324
+ "step": 105
325
+ },
326
+ {
327
+ "epoch": 0.09678838539375274,
328
+ "grad_norm": 0.27908894419670105,
329
+ "learning_rate": 0.00019298245614035088,
330
+ "loss": 0.5905,
331
+ "step": 110
332
+ },
333
+ {
334
+ "epoch": 0.09678838539375274,
335
+ "eval_loss": 0.6423875689506531,
336
+ "eval_runtime": 28.5491,
337
+ "eval_samples_per_second": 0.595,
338
+ "eval_steps_per_second": 0.315,
339
+ "step": 110
340
+ },
341
+ {
342
+ "epoch": 0.10118785745710515,
343
+ "grad_norm": 0.2715133726596832,
344
+ "learning_rate": 0.00019999952753720356,
345
+ "loss": 0.5902,
346
+ "step": 115
347
+ },
348
+ {
349
+ "epoch": 0.10118785745710515,
350
+ "eval_loss": 0.6415910720825195,
351
+ "eval_runtime": 28.5086,
352
+ "eval_samples_per_second": 0.596,
353
+ "eval_steps_per_second": 0.316,
354
+ "step": 115
355
+ },
356
+ {
357
+ "epoch": 0.10558732952045755,
358
+ "grad_norm": 0.3028790056705475,
359
+ "learning_rate": 0.000199982991808088,
360
+ "loss": 0.5773,
361
+ "step": 120
362
+ },
363
+ {
364
+ "epoch": 0.10558732952045755,
365
+ "eval_loss": 0.6377425789833069,
366
+ "eval_runtime": 28.6438,
367
+ "eval_samples_per_second": 0.593,
368
+ "eval_steps_per_second": 0.314,
369
+ "step": 120
370
+ },
371
+ {
372
+ "epoch": 0.10998680158380994,
373
+ "grad_norm": 0.3071883022785187,
374
+ "learning_rate": 0.00019994283740338306,
375
+ "loss": 0.5598,
376
+ "step": 125
377
+ },
378
+ {
379
+ "epoch": 0.10998680158380994,
380
+ "eval_loss": 0.6367806196212769,
381
+ "eval_runtime": 28.4852,
382
+ "eval_samples_per_second": 0.597,
383
+ "eval_steps_per_second": 0.316,
384
+ "step": 125
385
+ },
386
+ {
387
+ "epoch": 0.11438627364716233,
388
+ "grad_norm": 0.34842655062675476,
389
+ "learning_rate": 0.00019987907380864062,
390
+ "loss": 0.596,
391
+ "step": 130
392
+ },
393
+ {
394
+ "epoch": 0.11438627364716233,
395
+ "eval_loss": 0.6347749829292297,
396
+ "eval_runtime": 28.5908,
397
+ "eval_samples_per_second": 0.595,
398
+ "eval_steps_per_second": 0.315,
399
+ "step": 130
400
+ },
401
+ {
402
+ "epoch": 0.11878574571051474,
403
+ "grad_norm": 0.2854275107383728,
404
+ "learning_rate": 0.00019979171608653924,
405
+ "loss": 0.5733,
406
+ "step": 135
407
+ },
408
+ {
409
+ "epoch": 0.11878574571051474,
410
+ "eval_loss": 0.6301032900810242,
411
+ "eval_runtime": 28.5482,
412
+ "eval_samples_per_second": 0.595,
413
+ "eval_steps_per_second": 0.315,
414
+ "step": 135
415
+ },
416
+ {
417
+ "epoch": 0.12318521777386714,
418
+ "grad_norm": 0.27615901827812195,
419
+ "learning_rate": 0.00019968078487332566,
420
+ "loss": 0.5875,
421
+ "step": 140
422
+ },
423
+ {
424
+ "epoch": 0.12318521777386714,
425
+ "eval_loss": 0.6269793510437012,
426
+ "eval_runtime": 28.4974,
427
+ "eval_samples_per_second": 0.597,
428
+ "eval_steps_per_second": 0.316,
429
+ "step": 140
430
+ },
431
+ {
432
+ "epoch": 0.12758468983721954,
433
+ "grad_norm": 0.2709368169307709,
434
+ "learning_rate": 0.00019954630637394029,
435
+ "loss": 0.5711,
436
+ "step": 145
437
+ },
438
+ {
439
+ "epoch": 0.12758468983721954,
440
+ "eval_loss": 0.6240233182907104,
441
+ "eval_runtime": 28.5264,
442
+ "eval_samples_per_second": 0.596,
443
+ "eval_steps_per_second": 0.315,
444
+ "step": 145
445
+ },
446
+ {
447
+ "epoch": 0.13198416190057194,
448
+ "grad_norm": 0.2877412736415863,
449
+ "learning_rate": 0.00019938831235582672,
450
+ "loss": 0.5885,
451
+ "step": 150
452
+ },
453
+ {
454
+ "epoch": 0.13198416190057194,
455
+ "eval_loss": 0.6206945776939392,
456
+ "eval_runtime": 28.5668,
457
+ "eval_samples_per_second": 0.595,
458
+ "eval_steps_per_second": 0.315,
459
+ "step": 150
460
+ },
461
+ {
462
+ "epoch": 0.13638363396392433,
463
+ "grad_norm": 0.2922605574131012,
464
+ "learning_rate": 0.00019920684014142738,
465
+ "loss": 0.5485,
466
+ "step": 155
467
+ },
468
+ {
469
+ "epoch": 0.13638363396392433,
470
+ "eval_loss": 0.6200662851333618,
471
+ "eval_runtime": 28.5452,
472
+ "eval_samples_per_second": 0.596,
473
+ "eval_steps_per_second": 0.315,
474
+ "step": 155
475
+ },
476
+ {
477
+ "epoch": 0.14078310602727673,
478
+ "grad_norm": 0.28340834379196167,
479
+ "learning_rate": 0.00019900193259936704,
480
+ "loss": 0.5754,
481
+ "step": 160
482
+ },
483
+ {
484
+ "epoch": 0.14078310602727673,
485
+ "eval_loss": 0.6187402606010437,
486
+ "eval_runtime": 28.5939,
487
+ "eval_samples_per_second": 0.595,
488
+ "eval_steps_per_second": 0.315,
489
+ "step": 160
490
+ },
491
+ {
492
+ "epoch": 0.14518257809062912,
493
+ "grad_norm": 0.2796618938446045,
494
+ "learning_rate": 0.0001987736381343261,
495
+ "loss": 0.5535,
496
+ "step": 165
497
+ },
498
+ {
499
+ "epoch": 0.14518257809062912,
500
+ "eval_loss": 0.6156266331672668,
501
+ "eval_runtime": 28.5378,
502
+ "eval_samples_per_second": 0.596,
503
+ "eval_steps_per_second": 0.315,
504
+ "step": 165
505
+ },
506
+ {
507
+ "epoch": 0.14958205015398152,
508
+ "grad_norm": 0.25343528389930725,
509
+ "learning_rate": 0.00019852201067560606,
510
+ "loss": 0.5697,
511
+ "step": 170
512
+ },
513
+ {
514
+ "epoch": 0.14958205015398152,
515
+ "eval_loss": 0.6125033497810364,
516
+ "eval_runtime": 28.5565,
517
+ "eval_samples_per_second": 0.595,
518
+ "eval_steps_per_second": 0.315,
519
+ "step": 170
520
+ },
521
+ {
522
+ "epoch": 0.1539815222173339,
523
+ "grad_norm": 0.23438464105129242,
524
+ "learning_rate": 0.00019824710966438996,
525
+ "loss": 0.5335,
526
+ "step": 175
527
+ },
528
+ {
529
+ "epoch": 0.1539815222173339,
530
+ "eval_loss": 0.6096713542938232,
531
+ "eval_runtime": 28.6017,
532
+ "eval_samples_per_second": 0.594,
533
+ "eval_steps_per_second": 0.315,
534
+ "step": 175
535
+ },
536
+ {
537
+ "epoch": 0.1583809942806863,
538
+ "grad_norm": 0.24729043245315552,
539
+ "learning_rate": 0.00019794900003970077,
540
+ "loss": 0.5702,
541
+ "step": 180
542
+ },
543
+ {
544
+ "epoch": 0.1583809942806863,
545
+ "eval_loss": 0.6071114540100098,
546
+ "eval_runtime": 28.5677,
547
+ "eval_samples_per_second": 0.595,
548
+ "eval_steps_per_second": 0.315,
549
+ "step": 180
550
+ },
551
+ {
552
+ "epoch": 0.16278046634403873,
553
+ "grad_norm": 0.257964551448822,
554
+ "learning_rate": 0.00019762775222306107,
555
+ "loss": 0.5494,
556
+ "step": 185
557
+ },
558
+ {
559
+ "epoch": 0.16278046634403873,
560
+ "eval_loss": 0.6062531471252441,
561
+ "eval_runtime": 28.5933,
562
+ "eval_samples_per_second": 0.595,
563
+ "eval_steps_per_second": 0.315,
564
+ "step": 185
565
+ },
566
+ {
567
+ "epoch": 0.16717993840739112,
568
+ "grad_norm": 0.2648680806159973,
569
+ "learning_rate": 0.0001972834421018576,
570
+ "loss": 0.5379,
571
+ "step": 190
572
+ },
573
+ {
574
+ "epoch": 0.16717993840739112,
575
+ "eval_loss": 0.6054437756538391,
576
+ "eval_runtime": 28.5575,
577
+ "eval_samples_per_second": 0.595,
578
+ "eval_steps_per_second": 0.315,
579
+ "step": 190
580
+ },
581
+ {
582
+ "epoch": 0.17157941047074352,
583
+ "grad_norm": 0.2540712356567383,
584
+ "learning_rate": 0.00019691615101141455,
585
+ "loss": 0.5415,
586
+ "step": 195
587
+ },
588
+ {
589
+ "epoch": 0.17157941047074352,
590
+ "eval_loss": 0.6023730039596558,
591
+ "eval_runtime": 28.5419,
592
+ "eval_samples_per_second": 0.596,
593
+ "eval_steps_per_second": 0.315,
594
+ "step": 195
595
+ },
596
+ {
597
+ "epoch": 0.1759788825340959,
598
+ "grad_norm": 0.2424851357936859,
599
+ "learning_rate": 0.00019652596571578004,
600
+ "loss": 0.5504,
601
+ "step": 200
602
+ },
603
+ {
604
+ "epoch": 0.1759788825340959,
605
+ "eval_loss": 0.5997632145881653,
606
+ "eval_runtime": 28.6422,
607
+ "eval_samples_per_second": 0.594,
608
+ "eval_steps_per_second": 0.314,
609
+ "step": 200
610
+ },
611
+ {
612
+ "epoch": 0.1803783545974483,
613
+ "grad_norm": 0.2573873698711395,
614
+ "learning_rate": 0.0001961129783872301,
615
+ "loss": 0.5418,
616
+ "step": 205
617
+ },
618
+ {
619
+ "epoch": 0.1803783545974483,
620
+ "eval_loss": 0.5976300239562988,
621
+ "eval_runtime": 28.5752,
622
+ "eval_samples_per_second": 0.595,
623
+ "eval_steps_per_second": 0.315,
624
+ "step": 205
625
+ },
626
+ {
627
+ "epoch": 0.1847778266608007,
628
+ "grad_norm": 0.22338183224201202,
629
+ "learning_rate": 0.00019567728658449504,
630
+ "loss": 0.54,
631
+ "step": 210
632
+ },
633
+ {
634
+ "epoch": 0.1847778266608007,
635
+ "eval_loss": 0.5960862040519714,
636
+ "eval_runtime": 28.4685,
637
+ "eval_samples_per_second": 0.597,
638
+ "eval_steps_per_second": 0.316,
639
+ "step": 210
640
+ },
641
+ {
642
+ "epoch": 0.1891772987241531,
643
+ "grad_norm": 0.2706097960472107,
644
+ "learning_rate": 0.00019521899322971352,
645
+ "loss": 0.5522,
646
+ "step": 215
647
+ },
648
+ {
649
+ "epoch": 0.1891772987241531,
650
+ "eval_loss": 0.5958646535873413,
651
+ "eval_runtime": 28.5678,
652
+ "eval_samples_per_second": 0.595,
653
+ "eval_steps_per_second": 0.315,
654
+ "step": 215
655
+ },
656
+ {
657
+ "epoch": 0.1935767707875055,
658
+ "grad_norm": 0.23476411402225494,
659
+ "learning_rate": 0.00019473820658411957,
660
+ "loss": 0.5262,
661
+ "step": 220
662
+ },
663
+ {
664
+ "epoch": 0.1935767707875055,
665
+ "eval_loss": 0.5945417284965515,
666
+ "eval_runtime": 28.5611,
667
+ "eval_samples_per_second": 0.595,
668
+ "eval_steps_per_second": 0.315,
669
+ "step": 220
670
+ },
671
+ {
672
+ "epoch": 0.1979762428508579,
673
+ "grad_norm": 0.23705659806728363,
674
+ "learning_rate": 0.00019423504022246825,
675
+ "loss": 0.5439,
676
+ "step": 225
677
+ },
678
+ {
679
+ "epoch": 0.1979762428508579,
680
+ "eval_loss": 0.5934200286865234,
681
+ "eval_runtime": 28.5955,
682
+ "eval_samples_per_second": 0.594,
683
+ "eval_steps_per_second": 0.315,
684
+ "step": 225
685
+ },
686
+ {
687
+ "epoch": 0.2023757149142103,
688
+ "grad_norm": 0.22662319242954254,
689
+ "learning_rate": 0.00019370961300620637,
690
+ "loss": 0.5262,
691
+ "step": 230
692
+ },
693
+ {
694
+ "epoch": 0.2023757149142103,
695
+ "eval_loss": 0.5928044319152832,
696
+ "eval_runtime": 28.514,
697
+ "eval_samples_per_second": 0.596,
698
+ "eval_steps_per_second": 0.316,
699
+ "step": 230
700
+ },
701
+ {
702
+ "epoch": 0.2067751869775627,
703
+ "grad_norm": 0.24046145379543304,
704
+ "learning_rate": 0.00019316204905539425,
705
+ "loss": 0.5462,
706
+ "step": 235
707
+ },
708
+ {
709
+ "epoch": 0.2067751869775627,
710
+ "eval_loss": 0.5904839038848877,
711
+ "eval_runtime": 28.5557,
712
+ "eval_samples_per_second": 0.595,
713
+ "eval_steps_per_second": 0.315,
714
+ "step": 235
715
+ },
716
+ {
717
+ "epoch": 0.2111746590409151,
718
+ "grad_norm": 0.23923470079898834,
719
+ "learning_rate": 0.000192592477719385,
720
+ "loss": 0.5345,
721
+ "step": 240
722
+ },
723
+ {
724
+ "epoch": 0.2111746590409151,
725
+ "eval_loss": 0.590508759021759,
726
+ "eval_runtime": 28.5204,
727
+ "eval_samples_per_second": 0.596,
728
+ "eval_steps_per_second": 0.316,
729
+ "step": 240
730
+ },
731
+ {
732
+ "epoch": 0.21557413110426749,
733
+ "grad_norm": 0.24345721304416656,
734
+ "learning_rate": 0.00019200103354626892,
735
+ "loss": 0.5478,
736
+ "step": 245
737
+ },
738
+ {
739
+ "epoch": 0.21557413110426749,
740
+ "eval_loss": 0.5882726907730103,
741
+ "eval_runtime": 28.5722,
742
+ "eval_samples_per_second": 0.595,
743
+ "eval_steps_per_second": 0.315,
744
+ "step": 245
745
+ },
746
+ {
747
+ "epoch": 0.21997360316761988,
748
+ "grad_norm": 0.27501732110977173,
749
+ "learning_rate": 0.00019138785625108957,
750
+ "loss": 0.5607,
751
+ "step": 250
752
+ },
753
+ {
754
+ "epoch": 0.21997360316761988,
755
+ "eval_loss": 0.5860432982444763,
756
+ "eval_runtime": 28.503,
757
+ "eval_samples_per_second": 0.596,
758
+ "eval_steps_per_second": 0.316,
759
+ "step": 250
760
+ },
761
+ {
762
+ "epoch": 0.22437307523097227,
763
+ "grad_norm": 0.3151032328605652,
764
+ "learning_rate": 0.0001907530906828393,
765
+ "loss": 0.5479,
766
+ "step": 255
767
+ },
768
+ {
769
+ "epoch": 0.22437307523097227,
770
+ "eval_loss": 0.5846895575523376,
771
+ "eval_runtime": 28.6081,
772
+ "eval_samples_per_second": 0.594,
773
+ "eval_steps_per_second": 0.315,
774
+ "step": 255
775
+ },
776
+ {
777
+ "epoch": 0.22877254729432467,
778
+ "grad_norm": 0.2758755385875702,
779
+ "learning_rate": 0.0001900968867902419,
780
+ "loss": 0.5767,
781
+ "step": 260
782
+ },
783
+ {
784
+ "epoch": 0.22877254729432467,
785
+ "eval_loss": 0.5815722942352295,
786
+ "eval_runtime": 28.5574,
787
+ "eval_samples_per_second": 0.595,
788
+ "eval_steps_per_second": 0.315,
789
+ "step": 260
790
+ },
791
+ {
792
+ "epoch": 0.2331720193576771,
793
+ "grad_norm": 0.25241315364837646,
794
+ "learning_rate": 0.000189419399586331,
795
+ "loss": 0.5568,
796
+ "step": 265
797
+ },
798
+ {
799
+ "epoch": 0.2331720193576771,
800
+ "eval_loss": 0.5822274684906006,
801
+ "eval_runtime": 28.573,
802
+ "eval_samples_per_second": 0.595,
803
+ "eval_steps_per_second": 0.315,
804
+ "step": 265
805
+ },
806
+ {
807
+ "epoch": 0.23757149142102948,
808
+ "grad_norm": 0.316436767578125,
809
+ "learning_rate": 0.00018872078911183146,
810
+ "loss": 0.5385,
811
+ "step": 270
812
+ },
813
+ {
814
+ "epoch": 0.23757149142102948,
815
+ "eval_loss": 0.5809066891670227,
816
+ "eval_runtime": 28.5598,
817
+ "eval_samples_per_second": 0.595,
818
+ "eval_steps_per_second": 0.315,
819
+ "step": 270
820
+ },
821
+ {
822
+ "epoch": 0.24197096348438188,
823
+ "grad_norm": 0.27813801169395447,
824
+ "learning_rate": 0.00018800122039735358,
825
+ "loss": 0.5348,
826
+ "step": 275
827
+ },
828
+ {
829
+ "epoch": 0.24197096348438188,
830
+ "eval_loss": 0.5786107778549194,
831
+ "eval_runtime": 28.546,
832
+ "eval_samples_per_second": 0.596,
833
+ "eval_steps_per_second": 0.315,
834
+ "step": 275
835
+ },
836
+ {
837
+ "epoch": 0.24637043554773427,
838
+ "grad_norm": 0.2552705407142639,
839
+ "learning_rate": 0.00018726086342440846,
840
+ "loss": 0.5207,
841
+ "step": 280
842
+ },
843
+ {
844
+ "epoch": 0.24637043554773427,
845
+ "eval_loss": 0.5768923759460449,
846
+ "eval_runtime": 28.5995,
847
+ "eval_samples_per_second": 0.594,
848
+ "eval_steps_per_second": 0.315,
849
+ "step": 280
850
+ },
851
+ {
852
+ "epoch": 0.2507699076110867,
853
+ "grad_norm": 0.21993091702461243,
854
+ "learning_rate": 0.00018649989308525372,
855
+ "loss": 0.5292,
856
+ "step": 285
857
+ },
858
+ {
859
+ "epoch": 0.2507699076110867,
860
+ "eval_loss": 0.5762263536453247,
861
+ "eval_runtime": 28.4816,
862
+ "eval_samples_per_second": 0.597,
863
+ "eval_steps_per_second": 0.316,
864
+ "step": 285
865
+ },
866
+ {
867
+ "epoch": 0.2551693796744391,
868
+ "grad_norm": 0.27086153626441956,
869
+ "learning_rate": 0.0001857184891415794,
870
+ "loss": 0.5312,
871
+ "step": 290
872
+ },
873
+ {
874
+ "epoch": 0.2551693796744391,
875
+ "eval_loss": 0.5758266448974609,
876
+ "eval_runtime": 28.5295,
877
+ "eval_samples_per_second": 0.596,
878
+ "eval_steps_per_second": 0.315,
879
+ "step": 290
880
+ },
881
+ {
882
+ "epoch": 0.2595688517377915,
883
+ "grad_norm": 0.21816319227218628,
884
+ "learning_rate": 0.0001849168361820431,
885
+ "loss": 0.5223,
886
+ "step": 295
887
+ },
888
+ {
889
+ "epoch": 0.2595688517377915,
890
+ "eval_loss": 0.574447751045227,
891
+ "eval_runtime": 28.5859,
892
+ "eval_samples_per_second": 0.595,
893
+ "eval_steps_per_second": 0.315,
894
+ "step": 295
895
+ },
896
+ {
897
+ "epoch": 0.2639683238011439,
898
+ "grad_norm": 0.24796700477600098,
899
+ "learning_rate": 0.00018409512357866548,
900
+ "loss": 0.5485,
901
+ "step": 300
902
+ },
903
+ {
904
+ "epoch": 0.2639683238011439,
905
+ "eval_loss": 0.573371410369873,
906
+ "eval_runtime": 28.6178,
907
+ "eval_samples_per_second": 0.594,
908
+ "eval_steps_per_second": 0.314,
909
+ "step": 300
910
+ },
911
+ {
912
+ "epoch": 0.2683677958644963,
913
+ "grad_norm": 0.2425287663936615,
914
+ "learning_rate": 0.00018325354544209535,
915
+ "loss": 0.5217,
916
+ "step": 305
917
+ },
918
+ {
919
+ "epoch": 0.2683677958644963,
920
+ "eval_loss": 0.5723298788070679,
921
+ "eval_runtime": 28.5916,
922
+ "eval_samples_per_second": 0.595,
923
+ "eval_steps_per_second": 0.315,
924
+ "step": 305
925
+ },
926
+ {
927
+ "epoch": 0.27276726792784867,
928
+ "grad_norm": 0.21630050241947174,
929
+ "learning_rate": 0.00018239230057575542,
930
+ "loss": 0.5074,
931
+ "step": 310
932
+ },
933
+ {
934
+ "epoch": 0.27276726792784867,
935
+ "eval_loss": 0.5725327134132385,
936
+ "eval_runtime": 28.536,
937
+ "eval_samples_per_second": 0.596,
938
+ "eval_steps_per_second": 0.315,
939
+ "step": 310
940
+ },
941
+ {
942
+ "epoch": 0.27716673999120106,
943
+ "grad_norm": 0.21529468894004822,
944
+ "learning_rate": 0.0001815115924288798,
945
+ "loss": 0.5487,
946
+ "step": 315
947
+ },
948
+ {
949
+ "epoch": 0.27716673999120106,
950
+ "eval_loss": 0.5721793174743652,
951
+ "eval_runtime": 28.6852,
952
+ "eval_samples_per_second": 0.593,
953
+ "eval_steps_per_second": 0.314,
954
+ "step": 315
955
+ },
956
+ {
957
+ "epoch": 0.28156621205455346,
958
+ "grad_norm": 0.21623414754867554,
959
+ "learning_rate": 0.00018061162904845358,
960
+ "loss": 0.5106,
961
+ "step": 320
962
+ },
963
+ {
964
+ "epoch": 0.28156621205455346,
965
+ "eval_loss": 0.5709577202796936,
966
+ "eval_runtime": 28.4592,
967
+ "eval_samples_per_second": 0.597,
968
+ "eval_steps_per_second": 0.316,
969
+ "step": 320
970
+ },
971
+ {
972
+ "epoch": 0.28596568411790585,
973
+ "grad_norm": 0.2219308316707611,
974
+ "learning_rate": 0.0001796926230300667,
975
+ "loss": 0.5218,
976
+ "step": 325
977
+ },
978
+ {
979
+ "epoch": 0.28596568411790585,
980
+ "eval_loss": 0.5698617100715637,
981
+ "eval_runtime": 28.5588,
982
+ "eval_samples_per_second": 0.595,
983
+ "eval_steps_per_second": 0.315,
984
+ "step": 325
985
+ },
986
+ {
987
+ "epoch": 0.29036515618125824,
988
+ "grad_norm": 0.2264701873064041,
989
+ "learning_rate": 0.00017875479146769305,
990
+ "loss": 0.5162,
991
+ "step": 330
992
+ },
993
+ {
994
+ "epoch": 0.29036515618125824,
995
+ "eval_loss": 0.5689781308174133,
996
+ "eval_runtime": 28.6221,
997
+ "eval_samples_per_second": 0.594,
998
+ "eval_steps_per_second": 0.314,
999
+ "step": 330
1000
+ },
1001
+ {
1002
+ "epoch": 0.29476462824461064,
1003
+ "grad_norm": 0.24004362523555756,
1004
+ "learning_rate": 0.000177798355902407,
1005
+ "loss": 0.539,
1006
+ "step": 335
1007
+ },
1008
+ {
1009
+ "epoch": 0.29476462824461064,
1010
+ "eval_loss": 0.5678241848945618,
1011
+ "eval_runtime": 28.5677,
1012
+ "eval_samples_per_second": 0.595,
1013
+ "eval_steps_per_second": 0.315,
1014
+ "step": 335
1015
+ },
1016
+ {
1017
+ "epoch": 0.29916410030796303,
1018
+ "grad_norm": 0.22996000945568085,
1019
+ "learning_rate": 0.00017682354227004963,
1020
+ "loss": 0.5002,
1021
+ "step": 340
1022
+ },
1023
+ {
1024
+ "epoch": 0.29916410030796303,
1025
+ "eval_loss": 0.5670127272605896,
1026
+ "eval_runtime": 28.6425,
1027
+ "eval_samples_per_second": 0.594,
1028
+ "eval_steps_per_second": 0.314,
1029
+ "step": 340
1030
+ },
1031
+ {
1032
+ "epoch": 0.3035635723713154,
1033
+ "grad_norm": 0.23163671791553497,
1034
+ "learning_rate": 0.00017583058084785625,
1035
+ "loss": 0.5175,
1036
+ "step": 345
1037
+ },
1038
+ {
1039
+ "epoch": 0.3035635723713154,
1040
+ "eval_loss": 0.5650352239608765,
1041
+ "eval_runtime": 28.5994,
1042
+ "eval_samples_per_second": 0.594,
1043
+ "eval_steps_per_second": 0.315,
1044
+ "step": 345
1045
+ },
1046
+ {
1047
+ "epoch": 0.3079630444346678,
1048
+ "grad_norm": 0.20120489597320557,
1049
+ "learning_rate": 0.00017481970620005912,
1050
+ "loss": 0.5269,
1051
+ "step": 350
1052
+ },
1053
+ {
1054
+ "epoch": 0.3079630444346678,
1055
+ "eval_loss": 0.5640237927436829,
1056
+ "eval_runtime": 28.5009,
1057
+ "eval_samples_per_second": 0.596,
1058
+ "eval_steps_per_second": 0.316,
1059
+ "step": 350
1060
+ },
1061
+ {
1062
+ "epoch": 0.3123625164980202,
1063
+ "grad_norm": 0.22231583297252655,
1064
+ "learning_rate": 0.00017379115712247675,
1065
+ "loss": 0.5444,
1066
+ "step": 355
1067
+ },
1068
+ {
1069
+ "epoch": 0.3123625164980202,
1070
+ "eval_loss": 0.5634257197380066,
1071
+ "eval_runtime": 28.5722,
1072
+ "eval_samples_per_second": 0.595,
1073
+ "eval_steps_per_second": 0.315,
1074
+ "step": 355
1075
+ },
1076
+ {
1077
+ "epoch": 0.3167619885613726,
1078
+ "grad_norm": 0.216331347823143,
1079
+ "learning_rate": 0.00017274517658610398,
1080
+ "loss": 0.5074,
1081
+ "step": 360
1082
+ },
1083
+ {
1084
+ "epoch": 0.3167619885613726,
1085
+ "eval_loss": 0.5618783831596375,
1086
+ "eval_runtime": 28.6759,
1087
+ "eval_samples_per_second": 0.593,
1088
+ "eval_steps_per_second": 0.314,
1089
+ "step": 360
1090
+ },
1091
+ {
1092
+ "epoch": 0.32116146062472506,
1093
+ "grad_norm": 0.21976010501384735,
1094
+ "learning_rate": 0.0001716820116797158,
1095
+ "loss": 0.5259,
1096
+ "step": 365
1097
+ },
1098
+ {
1099
+ "epoch": 0.32116146062472506,
1100
+ "eval_loss": 0.5602042078971863,
1101
+ "eval_runtime": 28.6019,
1102
+ "eval_samples_per_second": 0.594,
1103
+ "eval_steps_per_second": 0.315,
1104
+ "step": 365
1105
+ },
1106
+ {
1107
+ "epoch": 0.32556093268807745,
1108
+ "grad_norm": 0.22740119695663452,
1109
+ "learning_rate": 0.0001706019135514982,
1110
+ "loss": 0.5158,
1111
+ "step": 370
1112
+ },
1113
+ {
1114
+ "epoch": 0.32556093268807745,
1115
+ "eval_loss": 0.5599080920219421,
1116
+ "eval_runtime": 28.5177,
1117
+ "eval_samples_per_second": 0.596,
1118
+ "eval_steps_per_second": 0.316,
1119
+ "step": 370
1120
+ },
1121
+ {
1122
+ "epoch": 0.32996040475142985,
1123
+ "grad_norm": 0.21888501942157745,
1124
+ "learning_rate": 0.0001695051373497202,
1125
+ "loss": 0.527,
1126
+ "step": 375
1127
+ },
1128
+ {
1129
+ "epoch": 0.32996040475142985,
1130
+ "eval_loss": 0.558814525604248,
1131
+ "eval_runtime": 28.661,
1132
+ "eval_samples_per_second": 0.593,
1133
+ "eval_steps_per_second": 0.314,
1134
+ "step": 375
1135
+ },
1136
+ {
1137
+ "epoch": 0.33435987681478224,
1138
+ "grad_norm": 0.20402850210666656,
1139
+ "learning_rate": 0.00016839194216246108,
1140
+ "loss": 0.5027,
1141
+ "step": 380
1142
+ },
1143
+ {
1144
+ "epoch": 0.33435987681478224,
1145
+ "eval_loss": 0.5578404664993286,
1146
+ "eval_runtime": 28.5421,
1147
+ "eval_samples_per_second": 0.596,
1148
+ "eval_steps_per_second": 0.315,
1149
+ "step": 380
1150
+ },
1151
+ {
1152
+ "epoch": 0.33875934887813464,
1153
+ "grad_norm": 0.20368748903274536,
1154
+ "learning_rate": 0.00016726259095640664,
1155
+ "loss": 0.505,
1156
+ "step": 385
1157
+ },
1158
+ {
1159
+ "epoch": 0.33875934887813464,
1160
+ "eval_loss": 0.5567160844802856,
1161
+ "eval_runtime": 28.6126,
1162
+ "eval_samples_per_second": 0.594,
1163
+ "eval_steps_per_second": 0.315,
1164
+ "step": 385
1165
+ },
1166
+ {
1167
+ "epoch": 0.34315882094148703,
1168
+ "grad_norm": 0.2069130390882492,
1169
+ "learning_rate": 0.0001661173505147295,
1170
+ "loss": 0.5086,
1171
+ "step": 390
1172
+ },
1173
+ {
1174
+ "epoch": 0.34315882094148703,
1175
+ "eval_loss": 0.55617755651474,
1176
+ "eval_runtime": 28.4879,
1177
+ "eval_samples_per_second": 0.597,
1178
+ "eval_steps_per_second": 0.316,
1179
+ "step": 390
1180
+ },
1181
+ {
1182
+ "epoch": 0.3475582930048394,
1183
+ "grad_norm": 0.23644201457500458,
1184
+ "learning_rate": 0.00016495649137406772,
1185
+ "loss": 0.5412,
1186
+ "step": 395
1187
+ },
1188
+ {
1189
+ "epoch": 0.3475582930048394,
1190
+ "eval_loss": 0.5556927919387817,
1191
+ "eval_runtime": 28.6713,
1192
+ "eval_samples_per_second": 0.593,
1193
+ "eval_steps_per_second": 0.314,
1194
+ "step": 395
1195
+ },
1196
+ {
1197
+ "epoch": 0.3519577650681918,
1198
+ "grad_norm": 0.21997737884521484,
1199
+ "learning_rate": 0.00016378028776061667,
1200
+ "loss": 0.4908,
1201
+ "step": 400
1202
+ },
1203
+ {
1204
+ "epoch": 0.3519577650681918,
1205
+ "eval_loss": 0.5555915832519531,
1206
+ "eval_runtime": 28.596,
1207
+ "eval_samples_per_second": 0.594,
1208
+ "eval_steps_per_second": 0.315,
1209
+ "step": 400
1210
+ },
1211
+ {
1212
+ "epoch": 0.3563572371315442,
1213
+ "grad_norm": 0.22075805068016052,
1214
+ "learning_rate": 0.00016258901752534948,
1215
+ "loss": 0.5155,
1216
+ "step": 405
1217
+ },
1218
+ {
1219
+ "epoch": 0.3563572371315442,
1220
+ "eval_loss": 0.5552019476890564,
1221
+ "eval_runtime": 28.595,
1222
+ "eval_samples_per_second": 0.595,
1223
+ "eval_steps_per_second": 0.315,
1224
+ "step": 405
1225
+ },
1226
+ {
1227
+ "epoch": 0.3607567091948966,
1228
+ "grad_norm": 0.5917304158210754,
1229
+ "learning_rate": 0.00016138296207838127,
1230
+ "loss": 0.4991,
1231
+ "step": 410
1232
+ },
1233
+ {
1234
+ "epoch": 0.3607567091948966,
1235
+ "eval_loss": 0.5550567507743835,
1236
+ "eval_runtime": 28.6222,
1237
+ "eval_samples_per_second": 0.594,
1238
+ "eval_steps_per_second": 0.314,
1239
+ "step": 410
1240
+ },
1241
+ {
1242
+ "epoch": 0.365156181258249,
1243
+ "grad_norm": 0.21421152353286743,
1244
+ "learning_rate": 0.00016016240632249224,
1245
+ "loss": 0.4769,
1246
+ "step": 415
1247
+ },
1248
+ {
1249
+ "epoch": 0.365156181258249,
1250
+ "eval_loss": 0.5548796653747559,
1251
+ "eval_runtime": 28.5933,
1252
+ "eval_samples_per_second": 0.595,
1253
+ "eval_steps_per_second": 0.315,
1254
+ "step": 415
1255
+ },
1256
+ {
1257
+ "epoch": 0.3695556533216014,
1258
+ "grad_norm": 0.201774463057518,
1259
+ "learning_rate": 0.0001589276385858262,
1260
+ "loss": 0.4914,
1261
+ "step": 420
1262
+ },
1263
+ {
1264
+ "epoch": 0.3695556533216014,
1265
+ "eval_loss": 0.5546624064445496,
1266
+ "eval_runtime": 28.5213,
1267
+ "eval_samples_per_second": 0.596,
1268
+ "eval_steps_per_second": 0.316,
1269
+ "step": 420
1270
+ },
1271
+ {
1272
+ "epoch": 0.3739551253849538,
1273
+ "grad_norm": 0.22172759473323822,
1274
+ "learning_rate": 0.0001576789505537795,
1275
+ "loss": 0.4726,
1276
+ "step": 425
1277
+ },
1278
+ {
1279
+ "epoch": 0.3739551253849538,
1280
+ "eval_loss": 0.5535080432891846,
1281
+ "eval_runtime": 28.6645,
1282
+ "eval_samples_per_second": 0.593,
1283
+ "eval_steps_per_second": 0.314,
1284
+ "step": 425
1285
+ },
1286
+ {
1287
+ "epoch": 0.3783545974483062,
1288
+ "grad_norm": 0.23269815742969513,
1289
+ "learning_rate": 0.00015641663720009733,
1290
+ "loss": 0.5076,
1291
+ "step": 430
1292
+ },
1293
+ {
1294
+ "epoch": 0.3783545974483062,
1295
+ "eval_loss": 0.5522862076759338,
1296
+ "eval_runtime": 28.5697,
1297
+ "eval_samples_per_second": 0.595,
1298
+ "eval_steps_per_second": 0.315,
1299
+ "step": 430
1300
+ },
1301
+ {
1302
+ "epoch": 0.3827540695116586,
1303
+ "grad_norm": 0.23303498327732086,
1304
+ "learning_rate": 0.00015514099671719268,
1305
+ "loss": 0.5064,
1306
+ "step": 435
1307
+ },
1308
+ {
1309
+ "epoch": 0.3827540695116586,
1310
+ "eval_loss": 0.5502522587776184,
1311
+ "eval_runtime": 28.5369,
1312
+ "eval_samples_per_second": 0.596,
1313
+ "eval_steps_per_second": 0.315,
1314
+ "step": 435
1315
+ },
1316
+ {
1317
+ "epoch": 0.387153541575011,
1318
+ "grad_norm": 0.24087387323379517,
1319
+ "learning_rate": 0.00015385233044570555,
1320
+ "loss": 0.5361,
1321
+ "step": 440
1322
+ },
1323
+ {
1324
+ "epoch": 0.387153541575011,
1325
+ "eval_loss": 0.5471201539039612,
1326
+ "eval_runtime": 28.5791,
1327
+ "eval_samples_per_second": 0.595,
1328
+ "eval_steps_per_second": 0.315,
1329
+ "step": 440
1330
+ },
1331
+ {
1332
+ "epoch": 0.3915530136383634,
1333
+ "grad_norm": 0.20800553262233734,
1334
+ "learning_rate": 0.00015255094280331797,
1335
+ "loss": 0.5169,
1336
+ "step": 445
1337
+ },
1338
+ {
1339
+ "epoch": 0.3915530136383634,
1340
+ "eval_loss": 0.5466722846031189,
1341
+ "eval_runtime": 28.6339,
1342
+ "eval_samples_per_second": 0.594,
1343
+ "eval_steps_per_second": 0.314,
1344
+ "step": 445
1345
+ },
1346
+ {
1347
+ "epoch": 0.3959524857017158,
1348
+ "grad_norm": 0.37092360854148865,
1349
+ "learning_rate": 0.0001512371412128424,
1350
+ "loss": 0.5362,
1351
+ "step": 450
1352
+ },
1353
+ {
1354
+ "epoch": 0.3959524857017158,
1355
+ "eval_loss": 0.5455148220062256,
1356
+ "eval_runtime": 28.637,
1357
+ "eval_samples_per_second": 0.594,
1358
+ "eval_steps_per_second": 0.314,
1359
+ "step": 450
1360
+ },
1361
+ {
1362
+ "epoch": 0.4003519577650682,
1363
+ "grad_norm": 0.20706337690353394,
1364
+ "learning_rate": 0.00014991123602960018,
1365
+ "loss": 0.4994,
1366
+ "step": 455
1367
+ },
1368
+ {
1369
+ "epoch": 0.4003519577650682,
1370
+ "eval_loss": 0.5440109968185425,
1371
+ "eval_runtime": 28.5672,
1372
+ "eval_samples_per_second": 0.595,
1373
+ "eval_steps_per_second": 0.315,
1374
+ "step": 455
1375
+ },
1376
+ {
1377
+ "epoch": 0.4047514298284206,
1378
+ "grad_norm": 0.2135256677865982,
1379
+ "learning_rate": 0.00014857354046810732,
1380
+ "loss": 0.5005,
1381
+ "step": 460
1382
+ },
1383
+ {
1384
+ "epoch": 0.4047514298284206,
1385
+ "eval_loss": 0.5431147813796997,
1386
+ "eval_runtime": 28.4835,
1387
+ "eval_samples_per_second": 0.597,
1388
+ "eval_steps_per_second": 0.316,
1389
+ "step": 460
1390
+ },
1391
+ {
1392
+ "epoch": 0.409150901891773,
1393
+ "grad_norm": 0.5737074613571167,
1394
+ "learning_rate": 0.00014722437052808472,
1395
+ "loss": 0.5208,
1396
+ "step": 465
1397
+ },
1398
+ {
1399
+ "epoch": 0.409150901891773,
1400
+ "eval_loss": 0.541969358921051,
1401
+ "eval_runtime": 28.6004,
1402
+ "eval_samples_per_second": 0.594,
1403
+ "eval_steps_per_second": 0.315,
1404
+ "step": 465
1405
+ },
1406
+ {
1407
+ "epoch": 0.4135503739551254,
1408
+ "grad_norm": 0.24099959433078766,
1409
+ "learning_rate": 0.00014586404491981052,
1410
+ "loss": 0.5074,
1411
+ "step": 470
1412
+ },
1413
+ {
1414
+ "epoch": 0.4135503739551254,
1415
+ "eval_loss": 0.5449388027191162,
1416
+ "eval_runtime": 28.658,
1417
+ "eval_samples_per_second": 0.593,
1418
+ "eval_steps_per_second": 0.314,
1419
+ "step": 470
1420
+ },
1421
+ {
1422
+ "epoch": 0.4179498460184778,
1423
+ "grad_norm": 0.2046642154455185,
1424
+ "learning_rate": 0.0001444928849888321,
1425
+ "loss": 0.5052,
1426
+ "step": 475
1427
+ },
1428
+ {
1429
+ "epoch": 0.4179498460184778,
1430
+ "eval_loss": 0.5407991409301758,
1431
+ "eval_runtime": 28.5688,
1432
+ "eval_samples_per_second": 0.595,
1433
+ "eval_steps_per_second": 0.315,
1434
+ "step": 475
1435
+ },
1436
+ {
1437
+ "epoch": 0.4223493180818302,
1438
+ "grad_norm": 0.2824171185493469,
1439
+ "learning_rate": 0.00014311121464005583,
1440
+ "loss": 0.5179,
1441
+ "step": 480
1442
+ },
1443
+ {
1444
+ "epoch": 0.4223493180818302,
1445
+ "eval_loss": 0.54000324010849,
1446
+ "eval_runtime": 28.7144,
1447
+ "eval_samples_per_second": 0.592,
1448
+ "eval_steps_per_second": 0.313,
1449
+ "step": 480
1450
+ },
1451
+ {
1452
+ "epoch": 0.4267487901451826,
1453
+ "grad_norm": 0.2045980840921402,
1454
+ "learning_rate": 0.00014171936026123168,
1455
+ "loss": 0.4634,
1456
+ "step": 485
1457
+ },
1458
+ {
1459
+ "epoch": 0.4267487901451826,
1460
+ "eval_loss": 0.5398800373077393,
1461
+ "eval_runtime": 28.5209,
1462
+ "eval_samples_per_second": 0.596,
1463
+ "eval_steps_per_second": 0.316,
1464
+ "step": 485
1465
+ },
1466
+ {
1467
+ "epoch": 0.43114826220853497,
1468
+ "grad_norm": 0.2092169225215912,
1469
+ "learning_rate": 0.00014031765064585197,
1470
+ "loss": 0.4802,
1471
+ "step": 490
1472
+ },
1473
+ {
1474
+ "epoch": 0.43114826220853497,
1475
+ "eval_loss": 0.5395181179046631,
1476
+ "eval_runtime": 28.5086,
1477
+ "eval_samples_per_second": 0.596,
1478
+ "eval_steps_per_second": 0.316,
1479
+ "step": 490
1480
+ },
1481
+ {
1482
+ "epoch": 0.43554773427188737,
1483
+ "grad_norm": 0.20700140297412872,
1484
+ "learning_rate": 0.00013890641691548114,
1485
+ "loss": 0.4962,
1486
+ "step": 495
1487
+ },
1488
+ {
1489
+ "epoch": 0.43554773427188737,
1490
+ "eval_loss": 0.5390854477882385,
1491
+ "eval_runtime": 28.5682,
1492
+ "eval_samples_per_second": 0.595,
1493
+ "eval_steps_per_second": 0.315,
1494
+ "step": 495
1495
+ },
1496
+ {
1497
+ "epoch": 0.43994720633523976,
1498
+ "grad_norm": 0.19903522729873657,
1499
+ "learning_rate": 0.00013748599244153633,
1500
+ "loss": 0.4841,
1501
+ "step": 500
1502
+ },
1503
+ {
1504
+ "epoch": 0.43994720633523976,
1505
+ "eval_loss": 0.5381758213043213,
1506
+ "eval_runtime": 29.4274,
1507
+ "eval_samples_per_second": 0.578,
1508
+ "eval_steps_per_second": 0.306,
1509
+ "step": 500
1510
+ },
1511
+ {
1512
+ "epoch": 0.44434667839859215,
1513
+ "grad_norm": 0.4766729474067688,
1514
+ "learning_rate": 0.00013605671276653567,
1515
+ "loss": 0.5252,
1516
+ "step": 505
1517
+ },
1518
+ {
1519
+ "epoch": 0.44434667839859215,
1520
+ "eval_loss": 0.5368968844413757,
1521
+ "eval_runtime": 28.6474,
1522
+ "eval_samples_per_second": 0.593,
1523
+ "eval_steps_per_second": 0.314,
1524
+ "step": 505
1525
+ },
1526
+ {
1527
+ "epoch": 0.44874615046194455,
1528
+ "grad_norm": 0.21688155829906464,
1529
+ "learning_rate": 0.00013461891552483444,
1530
+ "loss": 0.515,
1531
+ "step": 510
1532
+ },
1533
+ {
1534
+ "epoch": 0.44874615046194455,
1535
+ "eval_loss": 0.5366407036781311,
1536
+ "eval_runtime": 28.5352,
1537
+ "eval_samples_per_second": 0.596,
1538
+ "eval_steps_per_second": 0.315,
1539
+ "step": 510
1540
+ },
1541
+ {
1542
+ "epoch": 0.45314562252529694,
1543
+ "grad_norm": 0.20375116169452667,
1544
+ "learning_rate": 0.00013317294036286644,
1545
+ "loss": 0.4887,
1546
+ "step": 515
1547
+ },
1548
+ {
1549
+ "epoch": 0.45314562252529694,
1550
+ "eval_loss": 0.5360764861106873,
1551
+ "eval_runtime": 28.6533,
1552
+ "eval_samples_per_second": 0.593,
1553
+ "eval_steps_per_second": 0.314,
1554
+ "step": 515
1555
+ },
1556
+ {
1557
+ "epoch": 0.45754509458864934,
1558
+ "grad_norm": 0.1958196461200714,
1559
+ "learning_rate": 0.00013171912885891063,
1560
+ "loss": 0.4868,
1561
+ "step": 520
1562
+ },
1563
+ {
1564
+ "epoch": 0.45754509458864934,
1565
+ "eval_loss": 0.5356424450874329,
1566
+ "eval_runtime": 28.5027,
1567
+ "eval_samples_per_second": 0.596,
1568
+ "eval_steps_per_second": 0.316,
1569
+ "step": 520
1570
+ },
1571
+ {
1572
+ "epoch": 0.4619445666520018,
1573
+ "grad_norm": 0.22040507197380066,
1574
+ "learning_rate": 0.00013025782444240087,
1575
+ "loss": 0.5086,
1576
+ "step": 525
1577
+ },
1578
+ {
1579
+ "epoch": 0.4619445666520018,
1580
+ "eval_loss": 0.5351347327232361,
1581
+ "eval_runtime": 28.6428,
1582
+ "eval_samples_per_second": 0.594,
1583
+ "eval_steps_per_second": 0.314,
1584
+ "step": 525
1585
+ },
1586
+ {
1587
+ "epoch": 0.4663440387153542,
1588
+ "grad_norm": 0.19495758414268494,
1589
+ "learning_rate": 0.00012878937231279892,
1590
+ "loss": 0.5113,
1591
+ "step": 530
1592
+ },
1593
+ {
1594
+ "epoch": 0.4663440387153542,
1595
+ "eval_loss": 0.5347647070884705,
1596
+ "eval_runtime": 28.6252,
1597
+ "eval_samples_per_second": 0.594,
1598
+ "eval_steps_per_second": 0.314,
1599
+ "step": 530
1600
+ },
1601
+ {
1602
+ "epoch": 0.4707435107787066,
1603
+ "grad_norm": 0.21149738132953644,
1604
+ "learning_rate": 0.0001273141193580488,
1605
+ "loss": 0.483,
1606
+ "step": 535
1607
+ },
1608
+ {
1609
+ "epoch": 0.4707435107787066,
1610
+ "eval_loss": 0.5339221954345703,
1611
+ "eval_runtime": 28.6055,
1612
+ "eval_samples_per_second": 0.594,
1613
+ "eval_steps_per_second": 0.315,
1614
+ "step": 535
1615
+ },
1616
+ {
1617
+ "epoch": 0.47514298284205897,
1618
+ "grad_norm": 0.20391018688678741,
1619
+ "learning_rate": 0.0001258324140726326,
1620
+ "loss": 0.4728,
1621
+ "step": 540
1622
+ },
1623
+ {
1624
+ "epoch": 0.47514298284205897,
1625
+ "eval_loss": 0.5337977409362793,
1626
+ "eval_runtime": 28.5842,
1627
+ "eval_samples_per_second": 0.595,
1628
+ "eval_steps_per_second": 0.315,
1629
+ "step": 540
1630
+ },
1631
+ {
1632
+ "epoch": 0.47954245490541136,
1633
+ "grad_norm": 0.20913545787334442,
1634
+ "learning_rate": 0.00012434460647524676,
1635
+ "loss": 0.5016,
1636
+ "step": 545
1637
+ },
1638
+ {
1639
+ "epoch": 0.47954245490541136,
1640
+ "eval_loss": 0.532899022102356,
1641
+ "eval_runtime": 28.4759,
1642
+ "eval_samples_per_second": 0.597,
1643
+ "eval_steps_per_second": 0.316,
1644
+ "step": 545
1645
+ },
1646
+ {
1647
+ "epoch": 0.48394192696876376,
1648
+ "grad_norm": 0.19410260021686554,
1649
+ "learning_rate": 0.00012285104802611812,
1650
+ "loss": 0.5103,
1651
+ "step": 550
1652
+ },
1653
+ {
1654
+ "epoch": 0.48394192696876376,
1655
+ "eval_loss": 0.5321294665336609,
1656
+ "eval_runtime": 28.5662,
1657
+ "eval_samples_per_second": 0.595,
1658
+ "eval_steps_per_second": 0.315,
1659
+ "step": 550
1660
+ },
1661
+ {
1662
+ "epoch": 0.48834139903211615,
1663
+ "grad_norm": 0.2097245752811432,
1664
+ "learning_rate": 0.00012135209154397962,
1665
+ "loss": 0.4954,
1666
+ "step": 555
1667
+ },
1668
+ {
1669
+ "epoch": 0.48834139903211615,
1670
+ "eval_loss": 0.532034695148468,
1671
+ "eval_runtime": 28.652,
1672
+ "eval_samples_per_second": 0.593,
1673
+ "eval_steps_per_second": 0.314,
1674
+ "step": 555
1675
+ },
1676
+ {
1677
+ "epoch": 0.49274087109546855,
1678
+ "grad_norm": 0.21518121659755707,
1679
+ "learning_rate": 0.00011984809112272495,
1680
+ "loss": 0.4999,
1681
+ "step": 560
1682
+ },
1683
+ {
1684
+ "epoch": 0.49274087109546855,
1685
+ "eval_loss": 0.5313233733177185,
1686
+ "eval_runtime": 28.5662,
1687
+ "eval_samples_per_second": 0.595,
1688
+ "eval_steps_per_second": 0.315,
1689
+ "step": 560
1690
+ },
1691
+ {
1692
+ "epoch": 0.49714034315882094,
1693
+ "grad_norm": 0.19571034610271454,
1694
+ "learning_rate": 0.00011833940204776209,
1695
+ "loss": 0.4931,
1696
+ "step": 565
1697
+ },
1698
+ {
1699
+ "epoch": 0.49714034315882094,
1700
+ "eval_loss": 0.5311394333839417,
1701
+ "eval_runtime": 28.5352,
1702
+ "eval_samples_per_second": 0.596,
1703
+ "eval_steps_per_second": 0.315,
1704
+ "step": 565
1705
+ },
1706
+ {
1707
+ "epoch": 0.5015398152221734,
1708
+ "grad_norm": 0.20554794371128082,
1709
+ "learning_rate": 0.00011682638071208533,
1710
+ "loss": 0.4833,
1711
+ "step": 570
1712
+ },
1713
+ {
1714
+ "epoch": 0.5015398152221734,
1715
+ "eval_loss": 0.5300410389900208,
1716
+ "eval_runtime": 28.5679,
1717
+ "eval_samples_per_second": 0.595,
1718
+ "eval_steps_per_second": 0.315,
1719
+ "step": 570
1720
+ },
1721
+ {
1722
+ "epoch": 0.5059392872855257,
1723
+ "grad_norm": 0.20373423397541046,
1724
+ "learning_rate": 0.00011530938453208559,
1725
+ "loss": 0.5057,
1726
+ "step": 575
1727
+ },
1728
+ {
1729
+ "epoch": 0.5059392872855257,
1730
+ "eval_loss": 0.5300309658050537,
1731
+ "eval_runtime": 28.5821,
1732
+ "eval_samples_per_second": 0.595,
1733
+ "eval_steps_per_second": 0.315,
1734
+ "step": 575
1735
+ },
1736
+ {
1737
+ "epoch": 0.5103387593488782,
1738
+ "grad_norm": 0.1982477903366089,
1739
+ "learning_rate": 0.00011378877186311912,
1740
+ "loss": 0.4754,
1741
+ "step": 580
1742
+ },
1743
+ {
1744
+ "epoch": 0.5103387593488782,
1745
+ "eval_loss": 0.5292160511016846,
1746
+ "eval_runtime": 28.5256,
1747
+ "eval_samples_per_second": 0.596,
1748
+ "eval_steps_per_second": 0.316,
1749
+ "step": 580
1750
+ },
1751
+ {
1752
+ "epoch": 0.5147382314122305,
1753
+ "grad_norm": 0.20576219260692596,
1754
+ "learning_rate": 0.00011226490191485421,
1755
+ "loss": 0.4991,
1756
+ "step": 585
1757
+ },
1758
+ {
1759
+ "epoch": 0.5147382314122305,
1760
+ "eval_loss": 0.5280917882919312,
1761
+ "eval_runtime": 28.6835,
1762
+ "eval_samples_per_second": 0.593,
1763
+ "eval_steps_per_second": 0.314,
1764
+ "step": 585
1765
+ },
1766
+ {
1767
+ "epoch": 0.519137703475583,
1768
+ "grad_norm": 0.2154638022184372,
1769
+ "learning_rate": 0.00011073813466641632,
1770
+ "loss": 0.4811,
1771
+ "step": 590
1772
+ },
1773
+ {
1774
+ "epoch": 0.519137703475583,
1775
+ "eval_loss": 0.5274674296379089,
1776
+ "eval_runtime": 28.4766,
1777
+ "eval_samples_per_second": 0.597,
1778
+ "eval_steps_per_second": 0.316,
1779
+ "step": 590
1780
+ },
1781
+ {
1782
+ "epoch": 0.5235371755389353,
1783
+ "grad_norm": 0.2037007063627243,
1784
+ "learning_rate": 0.00010920883078135117,
1785
+ "loss": 0.4717,
1786
+ "step": 595
1787
+ },
1788
+ {
1789
+ "epoch": 0.5235371755389353,
1790
+ "eval_loss": 0.5270927548408508,
1791
+ "eval_runtime": 28.5377,
1792
+ "eval_samples_per_second": 0.596,
1793
+ "eval_steps_per_second": 0.315,
1794
+ "step": 595
1795
+ },
1796
+ {
1797
+ "epoch": 0.5279366476022878,
1798
+ "grad_norm": 0.21386198699474335,
1799
+ "learning_rate": 0.00010767735152242649,
1800
+ "loss": 0.4776,
1801
+ "step": 600
1802
+ },
1803
+ {
1804
+ "epoch": 0.5279366476022878,
1805
+ "eval_loss": 0.526791512966156,
1806
+ "eval_runtime": 28.596,
1807
+ "eval_samples_per_second": 0.594,
1808
+ "eval_steps_per_second": 0.315,
1809
+ "step": 600
1810
+ },
1811
+ {
1812
+ "epoch": 0.5323361196656401,
1813
+ "grad_norm": 0.1984720528125763,
1814
+ "learning_rate": 0.0001061440586662917,
1815
+ "loss": 0.4708,
1816
+ "step": 605
1817
+ },
1818
+ {
1819
+ "epoch": 0.5323361196656401,
1820
+ "eval_loss": 0.5266034007072449,
1821
+ "eval_runtime": 28.6491,
1822
+ "eval_samples_per_second": 0.593,
1823
+ "eval_steps_per_second": 0.314,
1824
+ "step": 605
1825
+ },
1826
+ {
1827
+ "epoch": 0.5367355917289925,
1828
+ "grad_norm": 0.19453096389770508,
1829
+ "learning_rate": 0.000104609314418017,
1830
+ "loss": 0.4659,
1831
+ "step": 610
1832
+ },
1833
+ {
1834
+ "epoch": 0.5367355917289925,
1835
+ "eval_loss": 0.5267328023910522,
1836
+ "eval_runtime": 28.6358,
1837
+ "eval_samples_per_second": 0.594,
1838
+ "eval_steps_per_second": 0.314,
1839
+ "step": 610
1840
+ },
1841
+ {
1842
+ "epoch": 0.5411350637923449,
1843
+ "grad_norm": 0.2048104703426361,
1844
+ "learning_rate": 0.00010307348132553025,
1845
+ "loss": 0.5138,
1846
+ "step": 615
1847
+ },
1848
+ {
1849
+ "epoch": 0.5411350637923449,
1850
+ "eval_loss": 0.5270944833755493,
1851
+ "eval_runtime": 28.5902,
1852
+ "eval_samples_per_second": 0.595,
1853
+ "eval_steps_per_second": 0.315,
1854
+ "step": 615
1855
+ },
1856
+ {
1857
+ "epoch": 0.5455345358556973,
1858
+ "grad_norm": 0.1899915337562561,
1859
+ "learning_rate": 0.00010153692219397387,
1860
+ "loss": 0.4797,
1861
+ "step": 620
1862
+ },
1863
+ {
1864
+ "epoch": 0.5455345358556973,
1865
+ "eval_loss": 0.5260502099990845,
1866
+ "eval_runtime": 28.5533,
1867
+ "eval_samples_per_second": 0.595,
1868
+ "eval_steps_per_second": 0.315,
1869
+ "step": 620
1870
+ },
1871
+ {
1872
+ "epoch": 0.5499340079190497,
1873
+ "grad_norm": 0.18520919978618622,
1874
+ "learning_rate": 0.0001,
1875
+ "loss": 0.5068,
1876
+ "step": 625
1877
+ },
1878
+ {
1879
+ "epoch": 0.5499340079190497,
1880
+ "eval_loss": 0.5251287817955017,
1881
+ "eval_runtime": 28.4846,
1882
+ "eval_samples_per_second": 0.597,
1883
+ "eval_steps_per_second": 0.316,
1884
+ "step": 625
1885
+ },
1886
+ {
1887
+ "epoch": 0.5543334799824021,
1888
+ "grad_norm": 0.21325986087322235,
1889
+ "learning_rate": 9.84630778060262e-05,
1890
+ "loss": 0.4799,
1891
+ "step": 630
1892
+ },
1893
+ {
1894
+ "epoch": 0.5543334799824021,
1895
+ "eval_loss": 0.524385929107666,
1896
+ "eval_runtime": 28.5917,
1897
+ "eval_samples_per_second": 0.595,
1898
+ "eval_steps_per_second": 0.315,
1899
+ "step": 630
1900
+ },
1901
+ {
1902
+ "epoch": 0.5587329520457545,
1903
+ "grad_norm": 0.20572926104068756,
1904
+ "learning_rate": 9.692651867446973e-05,
1905
+ "loss": 0.49,
1906
+ "step": 635
1907
+ },
1908
+ {
1909
+ "epoch": 0.5587329520457545,
1910
+ "eval_loss": 0.523975133895874,
1911
+ "eval_runtime": 28.6052,
1912
+ "eval_samples_per_second": 0.594,
1913
+ "eval_steps_per_second": 0.315,
1914
+ "step": 635
1915
+ },
1916
+ {
1917
+ "epoch": 0.5631324241091069,
1918
+ "grad_norm": 0.20347937941551208,
1919
+ "learning_rate": 9.539068558198304e-05,
1920
+ "loss": 0.4702,
1921
+ "step": 640
1922
+ },
1923
+ {
1924
+ "epoch": 0.5631324241091069,
1925
+ "eval_loss": 0.5229539275169373,
1926
+ "eval_runtime": 28.6223,
1927
+ "eval_samples_per_second": 0.594,
1928
+ "eval_steps_per_second": 0.314,
1929
+ "step": 640
1930
+ },
1931
+ {
1932
+ "epoch": 0.5675318961724594,
1933
+ "grad_norm": 0.21256154775619507,
1934
+ "learning_rate": 9.38559413337083e-05,
1935
+ "loss": 0.4736,
1936
+ "step": 645
1937
+ },
1938
+ {
1939
+ "epoch": 0.5675318961724594,
1940
+ "eval_loss": 0.5221072435379028,
1941
+ "eval_runtime": 28.6189,
1942
+ "eval_samples_per_second": 0.594,
1943
+ "eval_steps_per_second": 0.314,
1944
+ "step": 645
1945
+ },
1946
+ {
1947
+ "epoch": 0.5719313682358117,
1948
+ "grad_norm": 0.2260565459728241,
1949
+ "learning_rate": 9.232264847757357e-05,
1950
+ "loss": 0.5065,
1951
+ "step": 650
1952
+ },
1953
+ {
1954
+ "epoch": 0.5719313682358117,
1955
+ "eval_loss": 0.5213314890861511,
1956
+ "eval_runtime": 28.6771,
1957
+ "eval_samples_per_second": 0.593,
1958
+ "eval_steps_per_second": 0.314,
1959
+ "step": 650
1960
+ },
1961
+ {
1962
+ "epoch": 0.5763308402991641,
1963
+ "grad_norm": 0.21002529561519623,
1964
+ "learning_rate": 9.079116921864884e-05,
1965
+ "loss": 0.4796,
1966
+ "step": 655
1967
+ },
1968
+ {
1969
+ "epoch": 0.5763308402991641,
1970
+ "eval_loss": 0.5214037299156189,
1971
+ "eval_runtime": 28.6202,
1972
+ "eval_samples_per_second": 0.594,
1973
+ "eval_steps_per_second": 0.314,
1974
+ "step": 655
1975
+ },
1976
+ {
1977
+ "epoch": 0.5807303123625165,
1978
+ "grad_norm": 0.19340470433235168,
1979
+ "learning_rate": 8.92618653335837e-05,
1980
+ "loss": 0.4788,
1981
+ "step": 660
1982
+ },
1983
+ {
1984
+ "epoch": 0.5807303123625165,
1985
+ "eval_loss": 0.5211138725280762,
1986
+ "eval_runtime": 28.6313,
1987
+ "eval_samples_per_second": 0.594,
1988
+ "eval_steps_per_second": 0.314,
1989
+ "step": 660
1990
+ },
1991
+ {
1992
+ "epoch": 0.5851297844258689,
1993
+ "grad_norm": 0.19035720825195312,
1994
+ "learning_rate": 8.773509808514581e-05,
1995
+ "loss": 0.468,
1996
+ "step": 665
1997
+ },
1998
+ {
1999
+ "epoch": 0.5851297844258689,
2000
+ "eval_loss": 0.5191999077796936,
2001
+ "eval_runtime": 28.0607,
2002
+ "eval_samples_per_second": 0.606,
2003
+ "eval_steps_per_second": 0.321,
2004
+ "step": 665
2005
+ },
2006
+ {
2007
+ "epoch": 0.5895292564892213,
2008
+ "grad_norm": 0.19168096780776978,
2009
+ "learning_rate": 8.62112281368809e-05,
2010
+ "loss": 0.5066,
2011
+ "step": 670
2012
+ },
2013
+ {
2014
+ "epoch": 0.5895292564892213,
2015
+ "eval_loss": 0.5176913142204285,
2016
+ "eval_runtime": 28.5375,
2017
+ "eval_samples_per_second": 0.596,
2018
+ "eval_steps_per_second": 0.315,
2019
+ "step": 670
2020
+ },
2021
+ {
2022
+ "epoch": 0.5939287285525737,
2023
+ "grad_norm": 0.19758321344852448,
2024
+ "learning_rate": 8.469061546791442e-05,
2025
+ "loss": 0.51,
2026
+ "step": 675
2027
+ },
2028
+ {
2029
+ "epoch": 0.5939287285525737,
2030
+ "eval_loss": 0.517296314239502,
2031
+ "eval_runtime": 28.5712,
2032
+ "eval_samples_per_second": 0.595,
2033
+ "eval_steps_per_second": 0.315,
2034
+ "step": 675
2035
+ },
2036
+ {
2037
+ "epoch": 0.5983282006159261,
2038
+ "grad_norm": 0.19562241435050964,
2039
+ "learning_rate": 8.317361928791469e-05,
2040
+ "loss": 0.4932,
2041
+ "step": 680
2042
+ },
2043
+ {
2044
+ "epoch": 0.5983282006159261,
2045
+ "eval_loss": 0.5170657634735107,
2046
+ "eval_runtime": 28.4877,
2047
+ "eval_samples_per_second": 0.597,
2048
+ "eval_steps_per_second": 0.316,
2049
+ "step": 680
2050
+ },
2051
+ {
2052
+ "epoch": 0.6027276726792785,
2053
+ "grad_norm": 0.18590031564235687,
2054
+ "learning_rate": 8.166059795223794e-05,
2055
+ "loss": 0.5055,
2056
+ "step": 685
2057
+ },
2058
+ {
2059
+ "epoch": 0.6027276726792785,
2060
+ "eval_loss": 0.5166193842887878,
2061
+ "eval_runtime": 28.625,
2062
+ "eval_samples_per_second": 0.594,
2063
+ "eval_steps_per_second": 0.314,
2064
+ "step": 685
2065
+ },
2066
+ {
2067
+ "epoch": 0.6071271447426309,
2068
+ "grad_norm": 0.2049984484910965,
2069
+ "learning_rate": 8.015190887727509e-05,
2070
+ "loss": 0.4846,
2071
+ "step": 690
2072
+ },
2073
+ {
2074
+ "epoch": 0.6071271447426309,
2075
+ "eval_loss": 0.5160765647888184,
2076
+ "eval_runtime": 28.5582,
2077
+ "eval_samples_per_second": 0.595,
2078
+ "eval_steps_per_second": 0.315,
2079
+ "step": 690
2080
+ },
2081
+ {
2082
+ "epoch": 0.6115266168059833,
2083
+ "grad_norm": 0.19373777508735657,
2084
+ "learning_rate": 7.864790845602039e-05,
2085
+ "loss": 0.4862,
2086
+ "step": 695
2087
+ },
2088
+ {
2089
+ "epoch": 0.6115266168059833,
2090
+ "eval_loss": 0.5157306790351868,
2091
+ "eval_runtime": 28.6078,
2092
+ "eval_samples_per_second": 0.594,
2093
+ "eval_steps_per_second": 0.315,
2094
+ "step": 695
2095
+ },
2096
+ {
2097
+ "epoch": 0.6159260888693356,
2098
+ "grad_norm": 0.20326727628707886,
2099
+ "learning_rate": 7.714895197388189e-05,
2100
+ "loss": 0.5064,
2101
+ "step": 700
2102
+ },
2103
+ {
2104
+ "epoch": 0.6159260888693356,
2105
+ "eval_loss": 0.5153770446777344,
2106
+ "eval_runtime": 28.6597,
2107
+ "eval_samples_per_second": 0.593,
2108
+ "eval_steps_per_second": 0.314,
2109
+ "step": 700
2110
+ },
2111
+ {
2112
+ "epoch": 0.6203255609326881,
2113
+ "grad_norm": 0.19425565004348755,
2114
+ "learning_rate": 7.565539352475326e-05,
2115
+ "loss": 0.5018,
2116
+ "step": 705
2117
+ },
2118
+ {
2119
+ "epoch": 0.6203255609326881,
2120
+ "eval_loss": 0.5147074460983276,
2121
+ "eval_runtime": 28.5261,
2122
+ "eval_samples_per_second": 0.596,
2123
+ "eval_steps_per_second": 0.316,
2124
+ "step": 705
2125
+ },
2126
+ {
2127
+ "epoch": 0.6247250329960404,
2128
+ "grad_norm": 0.19491039216518402,
2129
+ "learning_rate": 7.416758592736744e-05,
2130
+ "loss": 0.482,
2131
+ "step": 710
2132
+ },
2133
+ {
2134
+ "epoch": 0.6247250329960404,
2135
+ "eval_loss": 0.5144516229629517,
2136
+ "eval_runtime": 28.533,
2137
+ "eval_samples_per_second": 0.596,
2138
+ "eval_steps_per_second": 0.315,
2139
+ "step": 710
2140
+ },
2141
+ {
2142
+ "epoch": 0.6291245050593929,
2143
+ "grad_norm": 0.1957363337278366,
2144
+ "learning_rate": 7.268588064195122e-05,
2145
+ "loss": 0.4883,
2146
+ "step": 715
2147
+ },
2148
+ {
2149
+ "epoch": 0.6291245050593929,
2150
+ "eval_loss": 0.5139791965484619,
2151
+ "eval_runtime": 28.5313,
2152
+ "eval_samples_per_second": 0.596,
2153
+ "eval_steps_per_second": 0.315,
2154
+ "step": 715
2155
+ },
2156
+ {
2157
+ "epoch": 0.6335239771227452,
2158
+ "grad_norm": 0.21253836154937744,
2159
+ "learning_rate": 7.12106276872011e-05,
2160
+ "loss": 0.4768,
2161
+ "step": 720
2162
+ },
2163
+ {
2164
+ "epoch": 0.6335239771227452,
2165
+ "eval_loss": 0.5137556195259094,
2166
+ "eval_runtime": 28.6307,
2167
+ "eval_samples_per_second": 0.594,
2168
+ "eval_steps_per_second": 0.314,
2169
+ "step": 720
2170
+ },
2171
+ {
2172
+ "epoch": 0.6379234491860977,
2173
+ "grad_norm": 0.1721029132604599,
2174
+ "learning_rate": 6.974217555759915e-05,
2175
+ "loss": 0.4816,
2176
+ "step": 725
2177
+ },
2178
+ {
2179
+ "epoch": 0.6379234491860977,
2180
+ "eval_loss": 0.5133811831474304,
2181
+ "eval_runtime": 28.5925,
2182
+ "eval_samples_per_second": 0.595,
2183
+ "eval_steps_per_second": 0.315,
2184
+ "step": 725
2185
+ },
2186
+ {
2187
+ "epoch": 0.6423229212494501,
2188
+ "grad_norm": 0.19211679697036743,
2189
+ "learning_rate": 6.82808711410894e-05,
2190
+ "loss": 0.5035,
2191
+ "step": 730
2192
+ },
2193
+ {
2194
+ "epoch": 0.6423229212494501,
2195
+ "eval_loss": 0.5132091641426086,
2196
+ "eval_runtime": 28.5078,
2197
+ "eval_samples_per_second": 0.596,
2198
+ "eval_steps_per_second": 0.316,
2199
+ "step": 730
2200
+ },
2201
+ {
2202
+ "epoch": 0.6467223933128025,
2203
+ "grad_norm": 0.19252945482730865,
2204
+ "learning_rate": 6.682705963713356e-05,
2205
+ "loss": 0.4822,
2206
+ "step": 735
2207
+ },
2208
+ {
2209
+ "epoch": 0.6467223933128025,
2210
+ "eval_loss": 0.5131357908248901,
2211
+ "eval_runtime": 28.6326,
2212
+ "eval_samples_per_second": 0.594,
2213
+ "eval_steps_per_second": 0.314,
2214
+ "step": 735
2215
+ },
2216
+ {
2217
+ "epoch": 0.6511218653761549,
2218
+ "grad_norm": 0.1986207813024521,
2219
+ "learning_rate": 6.538108447516558e-05,
2220
+ "loss": 0.4612,
2221
+ "step": 740
2222
+ },
2223
+ {
2224
+ "epoch": 0.6511218653761549,
2225
+ "eval_loss": 0.5128303170204163,
2226
+ "eval_runtime": 28.6066,
2227
+ "eval_samples_per_second": 0.594,
2228
+ "eval_steps_per_second": 0.315,
2229
+ "step": 740
2230
+ },
2231
+ {
2232
+ "epoch": 0.6555213374395072,
2233
+ "grad_norm": 0.19202682375907898,
2234
+ "learning_rate": 6.394328723346434e-05,
2235
+ "loss": 0.4578,
2236
+ "step": 745
2237
+ },
2238
+ {
2239
+ "epoch": 0.6555213374395072,
2240
+ "eval_loss": 0.5124692916870117,
2241
+ "eval_runtime": 28.6064,
2242
+ "eval_samples_per_second": 0.594,
2243
+ "eval_steps_per_second": 0.315,
2244
+ "step": 745
2245
+ },
2246
+ {
2247
+ "epoch": 0.6599208095028597,
2248
+ "grad_norm": 0.198526531457901,
2249
+ "learning_rate": 6.251400755846372e-05,
2250
+ "loss": 0.5176,
2251
+ "step": 750
2252
+ },
2253
+ {
2254
+ "epoch": 0.6599208095028597,
2255
+ "eval_loss": 0.5121349096298218,
2256
+ "eval_runtime": 28.5313,
2257
+ "eval_samples_per_second": 0.596,
2258
+ "eval_steps_per_second": 0.315,
2259
+ "step": 750
2260
+ },
2261
+ {
2262
+ "epoch": 0.664320281566212,
2263
+ "grad_norm": 0.19058994948863983,
2264
+ "learning_rate": 6.109358308451885e-05,
2265
+ "loss": 0.4877,
2266
+ "step": 755
2267
+ },
2268
+ {
2269
+ "epoch": 0.664320281566212,
2270
+ "eval_loss": 0.5118634700775146,
2271
+ "eval_runtime": 28.5287,
2272
+ "eval_samples_per_second": 0.596,
2273
+ "eval_steps_per_second": 0.315,
2274
+ "step": 755
2275
+ },
2276
+ {
2277
+ "epoch": 0.6687197536295645,
2278
+ "grad_norm": 0.1798192411661148,
2279
+ "learning_rate": 5.968234935414807e-05,
2280
+ "loss": 0.4805,
2281
+ "step": 760
2282
+ },
2283
+ {
2284
+ "epoch": 0.6687197536295645,
2285
+ "eval_loss": 0.5116167664527893,
2286
+ "eval_runtime": 28.5918,
2287
+ "eval_samples_per_second": 0.595,
2288
+ "eval_steps_per_second": 0.315,
2289
+ "step": 760
2290
+ },
2291
+ {
2292
+ "epoch": 0.6731192256929168,
2293
+ "grad_norm": 0.18448549509048462,
2294
+ "learning_rate": 5.828063973876834e-05,
2295
+ "loss": 0.4993,
2296
+ "step": 765
2297
+ },
2298
+ {
2299
+ "epoch": 0.6731192256929168,
2300
+ "eval_loss": 0.5111361742019653,
2301
+ "eval_runtime": 28.5586,
2302
+ "eval_samples_per_second": 0.595,
2303
+ "eval_steps_per_second": 0.315,
2304
+ "step": 765
2305
+ },
2306
+ {
2307
+ "epoch": 0.6775186977562693,
2308
+ "grad_norm": 0.18624383211135864,
2309
+ "learning_rate": 5.688878535994421e-05,
2310
+ "loss": 0.4844,
2311
+ "step": 770
2312
+ },
2313
+ {
2314
+ "epoch": 0.6775186977562693,
2315
+ "eval_loss": 0.5107051134109497,
2316
+ "eval_runtime": 28.5748,
2317
+ "eval_samples_per_second": 0.595,
2318
+ "eval_steps_per_second": 0.315,
2319
+ "step": 770
2320
+ },
2321
+ {
2322
+ "epoch": 0.6819181698196216,
2323
+ "grad_norm": 0.18364666402339935,
2324
+ "learning_rate": 5.550711501116789e-05,
2325
+ "loss": 0.4674,
2326
+ "step": 775
2327
+ },
2328
+ {
2329
+ "epoch": 0.6819181698196216,
2330
+ "eval_loss": 0.5101103186607361,
2331
+ "eval_runtime": 28.5159,
2332
+ "eval_samples_per_second": 0.596,
2333
+ "eval_steps_per_second": 0.316,
2334
+ "step": 775
2335
+ },
2336
+ {
2337
+ "epoch": 0.6863176418829741,
2338
+ "grad_norm": 0.23952247202396393,
2339
+ "learning_rate": 5.413595508018952e-05,
2340
+ "loss": 0.4943,
2341
+ "step": 780
2342
+ },
2343
+ {
2344
+ "epoch": 0.6863176418829741,
2345
+ "eval_loss": 0.5096238255500793,
2346
+ "eval_runtime": 28.516,
2347
+ "eval_samples_per_second": 0.596,
2348
+ "eval_steps_per_second": 0.316,
2349
+ "step": 780
2350
+ },
2351
+ {
2352
+ "epoch": 0.6907171139463264,
2353
+ "grad_norm": 0.20105206966400146,
2354
+ "learning_rate": 5.27756294719153e-05,
2355
+ "loss": 0.4924,
2356
+ "step": 785
2357
+ },
2358
+ {
2359
+ "epoch": 0.6907171139463264,
2360
+ "eval_loss": 0.5093135237693787,
2361
+ "eval_runtime": 28.5941,
2362
+ "eval_samples_per_second": 0.595,
2363
+ "eval_steps_per_second": 0.315,
2364
+ "step": 785
2365
+ },
2366
+ {
2367
+ "epoch": 0.6951165860096788,
2368
+ "grad_norm": 0.19826586544513702,
2369
+ "learning_rate": 5.1426459531892714e-05,
2370
+ "loss": 0.4986,
2371
+ "step": 790
2372
+ },
2373
+ {
2374
+ "epoch": 0.6951165860096788,
2375
+ "eval_loss": 0.5086015462875366,
2376
+ "eval_runtime": 28.6207,
2377
+ "eval_samples_per_second": 0.594,
2378
+ "eval_steps_per_second": 0.314,
2379
+ "step": 790
2380
+ },
2381
+ {
2382
+ "epoch": 0.6995160580730312,
2383
+ "grad_norm": 0.17991924285888672,
2384
+ "learning_rate": 5.008876397039983e-05,
2385
+ "loss": 0.4698,
2386
+ "step": 795
2387
+ },
2388
+ {
2389
+ "epoch": 0.6995160580730312,
2390
+ "eval_loss": 0.5082879662513733,
2391
+ "eval_runtime": 28.6587,
2392
+ "eval_samples_per_second": 0.593,
2393
+ "eval_steps_per_second": 0.314,
2394
+ "step": 795
2395
+ },
2396
+ {
2397
+ "epoch": 0.7039155301363836,
2398
+ "grad_norm": 0.19232523441314697,
2399
+ "learning_rate": 4.876285878715764e-05,
2400
+ "loss": 0.4981,
2401
+ "step": 800
2402
+ },
2403
+ {
2404
+ "epoch": 0.7039155301363836,
2405
+ "eval_loss": 0.5078893899917603,
2406
+ "eval_runtime": 28.5038,
2407
+ "eval_samples_per_second": 0.596,
2408
+ "eval_steps_per_second": 0.316,
2409
+ "step": 800
2410
+ },
2411
+ {
2412
+ "epoch": 0.708315002199736,
2413
+ "grad_norm": 0.19006720185279846,
2414
+ "learning_rate": 4.744905719668207e-05,
2415
+ "loss": 0.4758,
2416
+ "step": 805
2417
+ },
2418
+ {
2419
+ "epoch": 0.708315002199736,
2420
+ "eval_loss": 0.5076141357421875,
2421
+ "eval_runtime": 28.6324,
2422
+ "eval_samples_per_second": 0.594,
2423
+ "eval_steps_per_second": 0.314,
2424
+ "step": 805
2425
+ },
2426
+ {
2427
+ "epoch": 0.7127144742630884,
2428
+ "grad_norm": 0.19002890586853027,
2429
+ "learning_rate": 4.614766955429447e-05,
2430
+ "loss": 0.4642,
2431
+ "step": 810
2432
+ },
2433
+ {
2434
+ "epoch": 0.7127144742630884,
2435
+ "eval_loss": 0.507789671421051,
2436
+ "eval_runtime": 28.6356,
2437
+ "eval_samples_per_second": 0.594,
2438
+ "eval_steps_per_second": 0.314,
2439
+ "step": 810
2440
+ },
2441
+ {
2442
+ "epoch": 0.7171139463264409,
2443
+ "grad_norm": 0.2051495909690857,
2444
+ "learning_rate": 4.485900328280731e-05,
2445
+ "loss": 0.4669,
2446
+ "step": 815
2447
+ },
2448
+ {
2449
+ "epoch": 0.7171139463264409,
2450
+ "eval_loss": 0.5073484182357788,
2451
+ "eval_runtime": 28.5748,
2452
+ "eval_samples_per_second": 0.595,
2453
+ "eval_steps_per_second": 0.315,
2454
+ "step": 815
2455
+ },
2456
+ {
2457
+ "epoch": 0.7215134183897932,
2458
+ "grad_norm": 0.6378114223480225,
2459
+ "learning_rate": 4.358336279990268e-05,
2460
+ "loss": 0.4711,
2461
+ "step": 820
2462
+ },
2463
+ {
2464
+ "epoch": 0.7215134183897932,
2465
+ "eval_loss": 0.5070581436157227,
2466
+ "eval_runtime": 28.6233,
2467
+ "eval_samples_per_second": 0.594,
2468
+ "eval_steps_per_second": 0.314,
2469
+ "step": 820
2470
+ },
2471
+ {
2472
+ "epoch": 0.7259128904531457,
2473
+ "grad_norm": 0.181978240609169,
2474
+ "learning_rate": 4.2321049446220505e-05,
2475
+ "loss": 0.4704,
2476
+ "step": 825
2477
+ },
2478
+ {
2479
+ "epoch": 0.7259128904531457,
2480
+ "eval_loss": 0.5068845748901367,
2481
+ "eval_runtime": 28.5225,
2482
+ "eval_samples_per_second": 0.596,
2483
+ "eval_steps_per_second": 0.316,
2484
+ "step": 825
2485
+ },
2486
+ {
2487
+ "epoch": 0.730312362516498,
2488
+ "grad_norm": 0.1777966171503067,
2489
+ "learning_rate": 4.107236141417382e-05,
2490
+ "loss": 0.4752,
2491
+ "step": 830
2492
+ },
2493
+ {
2494
+ "epoch": 0.730312362516498,
2495
+ "eval_loss": 0.5066249966621399,
2496
+ "eval_runtime": 28.5423,
2497
+ "eval_samples_per_second": 0.596,
2498
+ "eval_steps_per_second": 0.315,
2499
+ "step": 830
2500
+ },
2501
+ {
2502
+ "epoch": 0.7347118345798505,
2503
+ "grad_norm": 0.18686190247535706,
2504
+ "learning_rate": 3.9837593677507726e-05,
2505
+ "loss": 0.4621,
2506
+ "step": 835
2507
+ },
2508
+ {
2509
+ "epoch": 0.7347118345798505,
2510
+ "eval_loss": 0.5066962242126465,
2511
+ "eval_runtime": 28.428,
2512
+ "eval_samples_per_second": 0.598,
2513
+ "eval_steps_per_second": 0.317,
2514
+ "step": 835
2515
+ },
2516
+ {
2517
+ "epoch": 0.7391113066432028,
2518
+ "grad_norm": 0.18854567408561707,
2519
+ "learning_rate": 3.8617037921618705e-05,
2520
+ "loss": 0.4748,
2521
+ "step": 840
2522
+ },
2523
+ {
2524
+ "epoch": 0.7391113066432028,
2525
+ "eval_loss": 0.50632643699646,
2526
+ "eval_runtime": 28.5075,
2527
+ "eval_samples_per_second": 0.596,
2528
+ "eval_steps_per_second": 0.316,
2529
+ "step": 840
2530
+ },
2531
+ {
2532
+ "epoch": 0.7435107787065552,
2533
+ "grad_norm": 0.19204109907150269,
2534
+ "learning_rate": 3.741098247465049e-05,
2535
+ "loss": 0.4948,
2536
+ "step": 845
2537
+ },
2538
+ {
2539
+ "epoch": 0.7435107787065552,
2540
+ "eval_loss": 0.5060507655143738,
2541
+ "eval_runtime": 28.5753,
2542
+ "eval_samples_per_second": 0.595,
2543
+ "eval_steps_per_second": 0.315,
2544
+ "step": 845
2545
+ },
2546
+ {
2547
+ "epoch": 0.7479102507699076,
2548
+ "grad_norm": 0.19182614982128143,
2549
+ "learning_rate": 3.621971223938334e-05,
2550
+ "loss": 0.4832,
2551
+ "step": 850
2552
+ },
2553
+ {
2554
+ "epoch": 0.7479102507699076,
2555
+ "eval_loss": 0.5058286190032959,
2556
+ "eval_runtime": 28.5184,
2557
+ "eval_samples_per_second": 0.596,
2558
+ "eval_steps_per_second": 0.316,
2559
+ "step": 850
2560
+ },
2561
+ {
2562
+ "epoch": 0.75230972283326,
2563
+ "grad_norm": 0.18205444514751434,
2564
+ "learning_rate": 3.504350862593231e-05,
2565
+ "loss": 0.4642,
2566
+ "step": 855
2567
+ },
2568
+ {
2569
+ "epoch": 0.75230972283326,
2570
+ "eval_loss": 0.505698025226593,
2571
+ "eval_runtime": 28.6382,
2572
+ "eval_samples_per_second": 0.594,
2573
+ "eval_steps_per_second": 0.314,
2574
+ "step": 855
2575
+ },
2576
+ {
2577
+ "epoch": 0.7567091948966124,
2578
+ "grad_norm": 0.20196740329265594,
2579
+ "learning_rate": 3.388264948527052e-05,
2580
+ "loss": 0.4877,
2581
+ "step": 860
2582
+ },
2583
+ {
2584
+ "epoch": 0.7567091948966124,
2585
+ "eval_loss": 0.5052359700202942,
2586
+ "eval_runtime": 28.5347,
2587
+ "eval_samples_per_second": 0.596,
2588
+ "eval_steps_per_second": 0.315,
2589
+ "step": 860
2590
+ },
2591
+ {
2592
+ "epoch": 0.7611086669599648,
2593
+ "grad_norm": 0.18125030398368835,
2594
+ "learning_rate": 3.2737409043593405e-05,
2595
+ "loss": 0.4727,
2596
+ "step": 865
2597
+ },
2598
+ {
2599
+ "epoch": 0.7611086669599648,
2600
+ "eval_loss": 0.504954993724823,
2601
+ "eval_runtime": 28.5976,
2602
+ "eval_samples_per_second": 0.594,
2603
+ "eval_steps_per_second": 0.315,
2604
+ "step": 865
2605
+ },
2606
+ {
2607
+ "epoch": 0.7655081390233172,
2608
+ "grad_norm": 0.18927669525146484,
2609
+ "learning_rate": 3.160805783753897e-05,
2610
+ "loss": 0.4691,
2611
+ "step": 870
2612
+ },
2613
+ {
2614
+ "epoch": 0.7655081390233172,
2615
+ "eval_loss": 0.5047942399978638,
2616
+ "eval_runtime": 28.5051,
2617
+ "eval_samples_per_second": 0.596,
2618
+ "eval_steps_per_second": 0.316,
2619
+ "step": 870
2620
+ },
2621
+ {
2622
+ "epoch": 0.7699076110866696,
2623
+ "grad_norm": 0.18508534133434296,
2624
+ "learning_rate": 3.0494862650279822e-05,
2625
+ "loss": 0.5292,
2626
+ "step": 875
2627
+ },
2628
+ {
2629
+ "epoch": 0.7699076110866696,
2630
+ "eval_loss": 0.5046341419219971,
2631
+ "eval_runtime": 28.5445,
2632
+ "eval_samples_per_second": 0.596,
2633
+ "eval_steps_per_second": 0.315,
2634
+ "step": 875
2635
+ },
2636
+ {
2637
+ "epoch": 0.774307083150022,
2638
+ "grad_norm": 0.18230414390563965,
2639
+ "learning_rate": 2.939808644850184e-05,
2640
+ "loss": 0.4708,
2641
+ "step": 880
2642
+ },
2643
+ {
2644
+ "epoch": 0.774307083150022,
2645
+ "eval_loss": 0.5046290755271912,
2646
+ "eval_runtime": 28.6138,
2647
+ "eval_samples_per_second": 0.594,
2648
+ "eval_steps_per_second": 0.315,
2649
+ "step": 880
2650
+ },
2651
+ {
2652
+ "epoch": 0.7787065552133744,
2653
+ "grad_norm": 0.17352643609046936,
2654
+ "learning_rate": 2.8317988320284228e-05,
2655
+ "loss": 0.4863,
2656
+ "step": 885
2657
+ },
2658
+ {
2659
+ "epoch": 0.7787065552133744,
2660
+ "eval_loss": 0.5044691562652588,
2661
+ "eval_runtime": 28.6321,
2662
+ "eval_samples_per_second": 0.594,
2663
+ "eval_steps_per_second": 0.314,
2664
+ "step": 885
2665
+ },
2666
+ {
2667
+ "epoch": 0.7831060272767268,
2668
+ "grad_norm": 0.1845002919435501,
2669
+ "learning_rate": 2.7254823413896058e-05,
2670
+ "loss": 0.5006,
2671
+ "step": 890
2672
+ },
2673
+ {
2674
+ "epoch": 0.7831060272767268,
2675
+ "eval_loss": 0.5042091012001038,
2676
+ "eval_runtime": 28.6132,
2677
+ "eval_samples_per_second": 0.594,
2678
+ "eval_steps_per_second": 0.315,
2679
+ "step": 890
2680
+ },
2681
+ {
2682
+ "epoch": 0.7875054993400792,
2683
+ "grad_norm": 0.17883773148059845,
2684
+ "learning_rate": 2.6208842877523278e-05,
2685
+ "loss": 0.4887,
2686
+ "step": 895
2687
+ },
2688
+ {
2689
+ "epoch": 0.7875054993400792,
2690
+ "eval_loss": 0.5039156675338745,
2691
+ "eval_runtime": 28.5693,
2692
+ "eval_samples_per_second": 0.595,
2693
+ "eval_steps_per_second": 0.315,
2694
+ "step": 895
2695
+ },
2696
+ {
2697
+ "epoch": 0.7919049714034316,
2698
+ "grad_norm": 0.19202597439289093,
2699
+ "learning_rate": 2.518029379994089e-05,
2700
+ "loss": 0.4867,
2701
+ "step": 900
2702
+ },
2703
+ {
2704
+ "epoch": 0.7919049714034316,
2705
+ "eval_loss": 0.5037320852279663,
2706
+ "eval_runtime": 28.549,
2707
+ "eval_samples_per_second": 0.595,
2708
+ "eval_steps_per_second": 0.315,
2709
+ "step": 900
2710
+ },
2711
+ {
2712
+ "epoch": 0.796304443466784,
2713
+ "grad_norm": 0.18246056139469147,
2714
+ "learning_rate": 2.4169419152143768e-05,
2715
+ "loss": 0.4662,
2716
+ "step": 905
2717
+ },
2718
+ {
2719
+ "epoch": 0.796304443466784,
2720
+ "eval_loss": 0.5035374164581299,
2721
+ "eval_runtime": 28.6042,
2722
+ "eval_samples_per_second": 0.594,
2723
+ "eval_steps_per_second": 0.315,
2724
+ "step": 905
2725
+ },
2726
+ {
2727
+ "epoch": 0.8007039155301364,
2728
+ "grad_norm": 0.18989378213882446,
2729
+ "learning_rate": 2.317645772995042e-05,
2730
+ "loss": 0.4744,
2731
+ "step": 910
2732
+ },
2733
+ {
2734
+ "epoch": 0.8007039155301364,
2735
+ "eval_loss": 0.5033923387527466,
2736
+ "eval_runtime": 28.4795,
2737
+ "eval_samples_per_second": 0.597,
2738
+ "eval_steps_per_second": 0.316,
2739
+ "step": 910
2740
+ },
2741
+ {
2742
+ "epoch": 0.8051033875934888,
2743
+ "grad_norm": 0.19525018334388733,
2744
+ "learning_rate": 2.220164409759299e-05,
2745
+ "loss": 0.5159,
2746
+ "step": 915
2747
+ },
2748
+ {
2749
+ "epoch": 0.8051033875934888,
2750
+ "eval_loss": 0.503151535987854,
2751
+ "eval_runtime": 28.6198,
2752
+ "eval_samples_per_second": 0.594,
2753
+ "eval_steps_per_second": 0.314,
2754
+ "step": 915
2755
+ },
2756
+ {
2757
+ "epoch": 0.8095028596568412,
2758
+ "grad_norm": 0.18840977549552917,
2759
+ "learning_rate": 2.124520853230697e-05,
2760
+ "loss": 0.4848,
2761
+ "step": 920
2762
+ },
2763
+ {
2764
+ "epoch": 0.8095028596568412,
2765
+ "eval_loss": 0.5029481649398804,
2766
+ "eval_runtime": 28.614,
2767
+ "eval_samples_per_second": 0.594,
2768
+ "eval_steps_per_second": 0.315,
2769
+ "step": 920
2770
+ },
2771
+ {
2772
+ "epoch": 0.8139023317201936,
2773
+ "grad_norm": 0.18055056035518646,
2774
+ "learning_rate": 2.03073769699333e-05,
2775
+ "loss": 0.4648,
2776
+ "step": 925
2777
+ },
2778
+ {
2779
+ "epoch": 0.8139023317201936,
2780
+ "eval_loss": 0.5028063654899597,
2781
+ "eval_runtime": 28.5662,
2782
+ "eval_samples_per_second": 0.595,
2783
+ "eval_steps_per_second": 0.315,
2784
+ "step": 925
2785
+ },
2786
+ {
2787
+ "epoch": 0.818301803783546,
2788
+ "grad_norm": 0.18352611362934113,
2789
+ "learning_rate": 1.9388370951546432e-05,
2790
+ "loss": 0.4733,
2791
+ "step": 930
2792
+ },
2793
+ {
2794
+ "epoch": 0.818301803783546,
2795
+ "eval_loss": 0.5027296543121338,
2796
+ "eval_runtime": 28.5532,
2797
+ "eval_samples_per_second": 0.595,
2798
+ "eval_steps_per_second": 0.315,
2799
+ "step": 930
2800
+ },
2801
+ {
2802
+ "epoch": 0.8227012758468983,
2803
+ "grad_norm": 0.18161964416503906,
2804
+ "learning_rate": 1.848840757112019e-05,
2805
+ "loss": 0.4556,
2806
+ "step": 935
2807
+ },
2808
+ {
2809
+ "epoch": 0.8227012758468983,
2810
+ "eval_loss": 0.5025849342346191,
2811
+ "eval_runtime": 28.6672,
2812
+ "eval_samples_per_second": 0.593,
2813
+ "eval_steps_per_second": 0.314,
2814
+ "step": 935
2815
+ },
2816
+ {
2817
+ "epoch": 0.8271007479102508,
2818
+ "grad_norm": 0.19485127925872803,
2819
+ "learning_rate": 1.7607699424244585e-05,
2820
+ "loss": 0.4973,
2821
+ "step": 940
2822
+ },
2823
+ {
2824
+ "epoch": 0.8271007479102508,
2825
+ "eval_loss": 0.5023777484893799,
2826
+ "eval_runtime": 28.5856,
2827
+ "eval_samples_per_second": 0.595,
2828
+ "eval_steps_per_second": 0.315,
2829
+ "step": 940
2830
+ },
2831
+ {
2832
+ "epoch": 0.8315002199736031,
2833
+ "grad_norm": 0.19218072295188904,
2834
+ "learning_rate": 1.674645455790468e-05,
2835
+ "loss": 0.4708,
2836
+ "step": 945
2837
+ },
2838
+ {
2839
+ "epoch": 0.8315002199736031,
2840
+ "eval_loss": 0.5024308562278748,
2841
+ "eval_runtime": 28.6001,
2842
+ "eval_samples_per_second": 0.594,
2843
+ "eval_steps_per_second": 0.315,
2844
+ "step": 945
2845
+ },
2846
+ {
2847
+ "epoch": 0.8358996920369556,
2848
+ "grad_norm": 0.18270643055438995,
2849
+ "learning_rate": 1.5904876421334536e-05,
2850
+ "loss": 0.4547,
2851
+ "step": 950
2852
+ },
2853
+ {
2854
+ "epoch": 0.8358996920369556,
2855
+ "eval_loss": 0.5024178624153137,
2856
+ "eval_runtime": 28.5464,
2857
+ "eval_samples_per_second": 0.596,
2858
+ "eval_steps_per_second": 0.315,
2859
+ "step": 950
2860
+ },
2861
+ {
2862
+ "epoch": 0.8402991641003079,
2863
+ "grad_norm": 0.18350371718406677,
2864
+ "learning_rate": 1.5083163817956914e-05,
2865
+ "loss": 0.4663,
2866
+ "step": 955
2867
+ },
2868
+ {
2869
+ "epoch": 0.8402991641003079,
2870
+ "eval_loss": 0.5021481513977051,
2871
+ "eval_runtime": 28.5783,
2872
+ "eval_samples_per_second": 0.595,
2873
+ "eval_steps_per_second": 0.315,
2874
+ "step": 955
2875
+ },
2876
+ {
2877
+ "epoch": 0.8446986361636604,
2878
+ "grad_norm": 0.18115630745887756,
2879
+ "learning_rate": 1.4281510858420632e-05,
2880
+ "loss": 0.4857,
2881
+ "step": 960
2882
+ },
2883
+ {
2884
+ "epoch": 0.8446986361636604,
2885
+ "eval_loss": 0.5019457340240479,
2886
+ "eval_runtime": 28.5976,
2887
+ "eval_samples_per_second": 0.594,
2888
+ "eval_steps_per_second": 0.315,
2889
+ "step": 960
2890
+ },
2891
+ {
2892
+ "epoch": 0.8490981082270127,
2893
+ "grad_norm": 0.1744571477174759,
2894
+ "learning_rate": 1.350010691474629e-05,
2895
+ "loss": 0.4633,
2896
+ "step": 965
2897
+ },
2898
+ {
2899
+ "epoch": 0.8490981082270127,
2900
+ "eval_loss": 0.5019629597663879,
2901
+ "eval_runtime": 28.5207,
2902
+ "eval_samples_per_second": 0.596,
2903
+ "eval_steps_per_second": 0.316,
2904
+ "step": 965
2905
+ },
2906
+ {
2907
+ "epoch": 0.8534975802903652,
2908
+ "grad_norm": 0.18827442824840546,
2909
+ "learning_rate": 1.2739136575591581e-05,
2910
+ "loss": 0.4723,
2911
+ "step": 970
2912
+ },
2913
+ {
2914
+ "epoch": 0.8534975802903652,
2915
+ "eval_loss": 0.5018792748451233,
2916
+ "eval_runtime": 28.4515,
2917
+ "eval_samples_per_second": 0.598,
2918
+ "eval_steps_per_second": 0.316,
2919
+ "step": 970
2920
+ },
2921
+ {
2922
+ "epoch": 0.8578970523537176,
2923
+ "grad_norm": 0.18166576325893402,
2924
+ "learning_rate": 1.1998779602646437e-05,
2925
+ "loss": 0.4691,
2926
+ "step": 975
2927
+ },
2928
+ {
2929
+ "epoch": 0.8578970523537176,
2930
+ "eval_loss": 0.5017500519752502,
2931
+ "eval_runtime": 28.5978,
2932
+ "eval_samples_per_second": 0.594,
2933
+ "eval_steps_per_second": 0.315,
2934
+ "step": 975
2935
+ },
2936
+ {
2937
+ "epoch": 0.8622965244170699,
2938
+ "grad_norm": 0.18091408908367157,
2939
+ "learning_rate": 1.1279210888168546e-05,
2940
+ "loss": 0.4874,
2941
+ "step": 980
2942
+ },
2943
+ {
2944
+ "epoch": 0.8622965244170699,
2945
+ "eval_loss": 0.5017052888870239,
2946
+ "eval_runtime": 28.7541,
2947
+ "eval_samples_per_second": 0.591,
2948
+ "eval_steps_per_second": 0.313,
2949
+ "step": 980
2950
+ },
2951
+ {
2952
+ "epoch": 0.8666959964804224,
2953
+ "grad_norm": 0.182442307472229,
2954
+ "learning_rate": 1.0580600413668984e-05,
2955
+ "loss": 0.4773,
2956
+ "step": 985
2957
+ },
2958
+ {
2959
+ "epoch": 0.8666959964804224,
2960
+ "eval_loss": 0.5016083121299744,
2961
+ "eval_runtime": 28.5972,
2962
+ "eval_samples_per_second": 0.594,
2963
+ "eval_steps_per_second": 0.315,
2964
+ "step": 985
2965
+ },
2966
+ {
2967
+ "epoch": 0.8710954685437747,
2968
+ "grad_norm": 0.18171900510787964,
2969
+ "learning_rate": 9.903113209758096e-06,
2970
+ "loss": 0.4806,
2971
+ "step": 990
2972
+ },
2973
+ {
2974
+ "epoch": 0.8710954685437747,
2975
+ "eval_loss": 0.5015130043029785,
2976
+ "eval_runtime": 28.5707,
2977
+ "eval_samples_per_second": 0.595,
2978
+ "eval_steps_per_second": 0.315,
2979
+ "step": 990
2980
+ },
2981
+ {
2982
+ "epoch": 0.8754949406071272,
2983
+ "grad_norm": 0.1896371841430664,
2984
+ "learning_rate": 9.246909317160746e-06,
2985
+ "loss": 0.4512,
2986
+ "step": 995
2987
+ },
2988
+ {
2989
+ "epoch": 0.8754949406071272,
2990
+ "eval_loss": 0.5013110637664795,
2991
+ "eval_runtime": 28.6509,
2992
+ "eval_samples_per_second": 0.593,
2993
+ "eval_steps_per_second": 0.314,
2994
+ "step": 995
2995
+ },
2996
+ {
2997
+ "epoch": 0.8798944126704795,
2998
+ "grad_norm": 0.1779976189136505,
2999
+ "learning_rate": 8.612143748910451e-06,
3000
+ "loss": 0.4561,
3001
+ "step": 1000
3002
+ },
3003
+ {
3004
+ "epoch": 0.8798944126704795,
3005
+ "eval_loss": 0.5013135075569153,
3006
+ "eval_runtime": 28.8047,
3007
+ "eval_samples_per_second": 0.59,
3008
+ "eval_steps_per_second": 0.312,
3009
+ "step": 1000
3010
+ },
3011
+ {
3012
+ "epoch": 0.884293884733832,
3013
+ "grad_norm": 0.17416957020759583,
3014
+ "learning_rate": 7.998966453731094e-06,
3015
+ "loss": 0.4637,
3016
+ "step": 1005
3017
+ },
3018
+ {
3019
+ "epoch": 0.884293884733832,
3020
+ "eval_loss": 0.5013565421104431,
3021
+ "eval_runtime": 28.5911,
3022
+ "eval_samples_per_second": 0.595,
3023
+ "eval_steps_per_second": 0.315,
3024
+ "step": 1005
3025
+ },
3026
+ {
3027
+ "epoch": 0.8886933567971843,
3028
+ "grad_norm": 0.1769402176141739,
3029
+ "learning_rate": 7.40752228061502e-06,
3030
+ "loss": 0.4527,
3031
+ "step": 1010
3032
+ },
3033
+ {
3034
+ "epoch": 0.8886933567971843,
3035
+ "eval_loss": 0.5010828375816345,
3036
+ "eval_runtime": 28.5203,
3037
+ "eval_samples_per_second": 0.596,
3038
+ "eval_steps_per_second": 0.316,
3039
+ "step": 1010
3040
+ },
3041
+ {
3042
+ "epoch": 0.8930928288605368,
3043
+ "grad_norm": 0.17784808576107025,
3044
+ "learning_rate": 6.8379509446057644e-06,
3045
+ "loss": 0.4903,
3046
+ "step": 1015
3047
+ },
3048
+ {
3049
+ "epoch": 0.8930928288605368,
3050
+ "eval_loss": 0.5012202262878418,
3051
+ "eval_runtime": 27.8441,
3052
+ "eval_samples_per_second": 0.611,
3053
+ "eval_steps_per_second": 0.323,
3054
+ "step": 1015
3055
+ },
3056
+ {
3057
+ "epoch": 0.8974923009238891,
3058
+ "grad_norm": 0.18067394196987152,
3059
+ "learning_rate": 6.290386993793618e-06,
3060
+ "loss": 0.4689,
3061
+ "step": 1020
3062
+ },
3063
+ {
3064
+ "epoch": 0.8974923009238891,
3065
+ "eval_loss": 0.5012267231941223,
3066
+ "eval_runtime": 28.517,
3067
+ "eval_samples_per_second": 0.596,
3068
+ "eval_steps_per_second": 0.316,
3069
+ "step": 1020
3070
+ },
3071
+ {
3072
+ "epoch": 0.9018917729872415,
3073
+ "grad_norm": 0.17478391528129578,
3074
+ "learning_rate": 5.764959777531776e-06,
3075
+ "loss": 0.4589,
3076
+ "step": 1025
3077
+ },
3078
+ {
3079
+ "epoch": 0.9018917729872415,
3080
+ "eval_loss": 0.5011836290359497,
3081
+ "eval_runtime": 28.6023,
3082
+ "eval_samples_per_second": 0.594,
3083
+ "eval_steps_per_second": 0.315,
3084
+ "step": 1025
3085
+ },
3086
+ {
3087
+ "epoch": 0.9062912450505939,
3088
+ "grad_norm": 0.185857892036438,
3089
+ "learning_rate": 5.261793415880456e-06,
3090
+ "loss": 0.4528,
3091
+ "step": 1030
3092
+ },
3093
+ {
3094
+ "epoch": 0.9062912450505939,
3095
+ "eval_loss": 0.501183807849884,
3096
+ "eval_runtime": 28.5159,
3097
+ "eval_samples_per_second": 0.596,
3098
+ "eval_steps_per_second": 0.316,
3099
+ "step": 1030
3100
+ },
3101
+ {
3102
+ "epoch": 0.9106907171139463,
3103
+ "grad_norm": 0.17951223254203796,
3104
+ "learning_rate": 4.781006770286478e-06,
3105
+ "loss": 0.4845,
3106
+ "step": 1035
3107
+ },
3108
+ {
3109
+ "epoch": 0.9106907171139463,
3110
+ "eval_loss": 0.5011433959007263,
3111
+ "eval_runtime": 28.6072,
3112
+ "eval_samples_per_second": 0.594,
3113
+ "eval_steps_per_second": 0.315,
3114
+ "step": 1035
3115
+ },
3116
+ {
3117
+ "epoch": 0.9150901891772987,
3118
+ "grad_norm": 0.18096089363098145,
3119
+ "learning_rate": 4.322713415504975e-06,
3120
+ "loss": 0.4578,
3121
+ "step": 1040
3122
+ },
3123
+ {
3124
+ "epoch": 0.9150901891772987,
3125
+ "eval_loss": 0.5011703968048096,
3126
+ "eval_runtime": 28.6287,
3127
+ "eval_samples_per_second": 0.594,
3128
+ "eval_steps_per_second": 0.314,
3129
+ "step": 1040
3130
+ },
3131
+ {
3132
+ "epoch": 0.9194896612406511,
3133
+ "grad_norm": 0.2069099247455597,
3134
+ "learning_rate": 3.887021612769936e-06,
3135
+ "loss": 0.5027,
3136
+ "step": 1045
3137
+ },
3138
+ {
3139
+ "epoch": 0.9194896612406511,
3140
+ "eval_loss": 0.5011240839958191,
3141
+ "eval_runtime": 29.0514,
3142
+ "eval_samples_per_second": 0.585,
3143
+ "eval_steps_per_second": 0.31,
3144
+ "step": 1045
3145
+ },
3146
+ {
3147
+ "epoch": 0.9238891333040036,
3148
+ "grad_norm": 0.18762987852096558,
3149
+ "learning_rate": 3.4740342842199956e-06,
3150
+ "loss": 0.4695,
3151
+ "step": 1050
3152
+ },
3153
+ {
3154
+ "epoch": 0.9238891333040036,
3155
+ "eval_loss": 0.5010772347450256,
3156
+ "eval_runtime": 28.5655,
3157
+ "eval_samples_per_second": 0.595,
3158
+ "eval_steps_per_second": 0.315,
3159
+ "step": 1050
3160
+ },
3161
+ {
3162
+ "epoch": 0.9282886053673559,
3163
+ "grad_norm": 0.178373321890831,
3164
+ "learning_rate": 3.0838489885854805e-06,
3165
+ "loss": 0.484,
3166
+ "step": 1055
3167
+ },
3168
+ {
3169
+ "epoch": 0.9282886053673559,
3170
+ "eval_loss": 0.5010451674461365,
3171
+ "eval_runtime": 28.6083,
3172
+ "eval_samples_per_second": 0.594,
3173
+ "eval_steps_per_second": 0.315,
3174
+ "step": 1055
3175
+ },
3176
+ {
3177
+ "epoch": 0.9326880774307084,
3178
+ "grad_norm": 0.1794215440750122,
3179
+ "learning_rate": 2.7165578981424357e-06,
3180
+ "loss": 0.4784,
3181
+ "step": 1060
3182
+ },
3183
+ {
3184
+ "epoch": 0.9326880774307084,
3185
+ "eval_loss": 0.5010905265808105,
3186
+ "eval_runtime": 28.5675,
3187
+ "eval_samples_per_second": 0.595,
3188
+ "eval_steps_per_second": 0.315,
3189
+ "step": 1060
3190
+ },
3191
+ {
3192
+ "epoch": 0.9370875494940607,
3193
+ "grad_norm": 0.17699354887008667,
3194
+ "learning_rate": 2.3722477769389517e-06,
3195
+ "loss": 0.4698,
3196
+ "step": 1065
3197
+ },
3198
+ {
3199
+ "epoch": 0.9370875494940607,
3200
+ "eval_loss": 0.5010352730751038,
3201
+ "eval_runtime": 28.6041,
3202
+ "eval_samples_per_second": 0.594,
3203
+ "eval_steps_per_second": 0.315,
3204
+ "step": 1065
3205
+ },
3206
+ {
3207
+ "epoch": 0.9414870215574132,
3208
+ "grad_norm": 0.17208220064640045,
3209
+ "learning_rate": 2.0509999602992493e-06,
3210
+ "loss": 0.4517,
3211
+ "step": 1070
3212
+ },
3213
+ {
3214
+ "epoch": 0.9414870215574132,
3215
+ "eval_loss": 0.5010344982147217,
3216
+ "eval_runtime": 28.5865,
3217
+ "eval_samples_per_second": 0.595,
3218
+ "eval_steps_per_second": 0.315,
3219
+ "step": 1070
3220
+ },
3221
+ {
3222
+ "epoch": 0.9458864936207655,
3223
+ "grad_norm": 0.1774464249610901,
3224
+ "learning_rate": 1.7528903356100469e-06,
3225
+ "loss": 0.4846,
3226
+ "step": 1075
3227
+ },
3228
+ {
3229
+ "epoch": 0.9458864936207655,
3230
+ "eval_loss": 0.5010223388671875,
3231
+ "eval_runtime": 28.5634,
3232
+ "eval_samples_per_second": 0.595,
3233
+ "eval_steps_per_second": 0.315,
3234
+ "step": 1075
3235
+ },
3236
+ {
3237
+ "epoch": 0.9502859656841179,
3238
+ "grad_norm": 0.1773741990327835,
3239
+ "learning_rate": 1.4779893243939359e-06,
3240
+ "loss": 0.4402,
3241
+ "step": 1080
3242
+ },
3243
+ {
3244
+ "epoch": 0.9502859656841179,
3245
+ "eval_loss": 0.5009992718696594,
3246
+ "eval_runtime": 28.5952,
3247
+ "eval_samples_per_second": 0.595,
3248
+ "eval_steps_per_second": 0.315,
3249
+ "step": 1080
3250
+ },
3251
+ {
3252
+ "epoch": 0.9546854377474703,
3253
+ "grad_norm": 0.18979211151599884,
3254
+ "learning_rate": 1.2263618656739084e-06,
3255
+ "loss": 0.5013,
3256
+ "step": 1085
3257
+ },
3258
+ {
3259
+ "epoch": 0.9546854377474703,
3260
+ "eval_loss": 0.501004159450531,
3261
+ "eval_runtime": 28.614,
3262
+ "eval_samples_per_second": 0.594,
3263
+ "eval_steps_per_second": 0.315,
3264
+ "step": 1085
3265
+ },
3266
+ {
3267
+ "epoch": 0.9590849098108227,
3268
+ "grad_norm": 0.1895236372947693,
3269
+ "learning_rate": 9.98067400632985e-07,
3270
+ "loss": 0.4588,
3271
+ "step": 1090
3272
+ },
3273
+ {
3274
+ "epoch": 0.9590849098108227,
3275
+ "eval_loss": 0.5009981393814087,
3276
+ "eval_runtime": 28.5601,
3277
+ "eval_samples_per_second": 0.595,
3278
+ "eval_steps_per_second": 0.315,
3279
+ "step": 1090
3280
+ },
3281
+ {
3282
+ "epoch": 0.9634843818741751,
3283
+ "grad_norm": 0.17328618466854095,
3284
+ "learning_rate": 7.931598585726563e-07,
3285
+ "loss": 0.4712,
3286
+ "step": 1095
3287
+ },
3288
+ {
3289
+ "epoch": 0.9634843818741751,
3290
+ "eval_loss": 0.500961184501648,
3291
+ "eval_runtime": 28.574,
3292
+ "eval_samples_per_second": 0.595,
3293
+ "eval_steps_per_second": 0.315,
3294
+ "step": 1095
3295
+ },
3296
+ {
3297
+ "epoch": 0.9678838539375275,
3298
+ "grad_norm": 0.18122579157352448,
3299
+ "learning_rate": 6.116876441733088e-07,
3300
+ "loss": 0.4534,
3301
+ "step": 1100
3302
+ },
3303
+ {
3304
+ "epoch": 0.9678838539375275,
3305
+ "eval_loss": 0.5009814500808716,
3306
+ "eval_runtime": 28.5934,
3307
+ "eval_samples_per_second": 0.595,
3308
+ "eval_steps_per_second": 0.315,
3309
+ "step": 1100
3310
+ },
3311
+ {
3312
+ "epoch": 0.9722833260008799,
3313
+ "grad_norm": 0.18148748576641083,
3314
+ "learning_rate": 4.536936260597258e-07,
3315
+ "loss": 0.4587,
3316
+ "step": 1105
3317
+ },
3318
+ {
3319
+ "epoch": 0.9722833260008799,
3320
+ "eval_loss": 0.5009997487068176,
3321
+ "eval_runtime": 28.5275,
3322
+ "eval_samples_per_second": 0.596,
3323
+ "eval_steps_per_second": 0.315,
3324
+ "step": 1105
3325
+ },
3326
+ {
3327
+ "epoch": 0.9766827980642323,
3328
+ "grad_norm": 0.18024764955043793,
3329
+ "learning_rate": 3.192151266743548e-07,
3330
+ "loss": 0.4783,
3331
+ "step": 1110
3332
+ },
3333
+ {
3334
+ "epoch": 0.9766827980642323,
3335
+ "eval_loss": 0.5009670853614807,
3336
+ "eval_runtime": 28.5688,
3337
+ "eval_samples_per_second": 0.595,
3338
+ "eval_steps_per_second": 0.315,
3339
+ "step": 1110
3340
+ },
3341
+ {
3342
+ "epoch": 0.9810822701275846,
3343
+ "grad_norm": 0.18152055144309998,
3344
+ "learning_rate": 2.082839134607828e-07,
3345
+ "loss": 0.4623,
3346
+ "step": 1115
3347
+ },
3348
+ {
3349
+ "epoch": 0.9810822701275846,
3350
+ "eval_loss": 0.5009202361106873,
3351
+ "eval_runtime": 28.6066,
3352
+ "eval_samples_per_second": 0.594,
3353
+ "eval_steps_per_second": 0.315,
3354
+ "step": 1115
3355
+ },
3356
+ {
3357
+ "epoch": 0.9854817421909371,
3358
+ "grad_norm": 0.17324087023735046,
3359
+ "learning_rate": 1.2092619135937177e-07,
3360
+ "loss": 0.439,
3361
+ "step": 1120
3362
+ },
3363
+ {
3364
+ "epoch": 0.9854817421909371,
3365
+ "eval_loss": 0.5010377168655396,
3366
+ "eval_runtime": 28.5308,
3367
+ "eval_samples_per_second": 0.596,
3368
+ "eval_steps_per_second": 0.315,
3369
+ "step": 1120
3370
+ },
3371
+ {
3372
+ "epoch": 0.9898812142542894,
3373
+ "grad_norm": 0.17685554921627045,
3374
+ "learning_rate": 5.716259661695533e-08,
3375
+ "loss": 0.4629,
3376
+ "step": 1125
3377
+ },
3378
+ {
3379
+ "epoch": 0.9898812142542894,
3380
+ "eval_loss": 0.5009082555770874,
3381
+ "eval_runtime": 28.6259,
3382
+ "eval_samples_per_second": 0.594,
3383
+ "eval_steps_per_second": 0.314,
3384
+ "step": 1125
3385
+ },
3386
+ {
3387
+ "epoch": 0.9942806863176419,
3388
+ "grad_norm": 0.17675389349460602,
3389
+ "learning_rate": 1.7008191912004646e-08,
3390
+ "loss": 0.4716,
3391
+ "step": 1130
3392
+ },
3393
+ {
3394
+ "epoch": 0.9942806863176419,
3395
+ "eval_loss": 0.5009535551071167,
3396
+ "eval_runtime": 28.626,
3397
+ "eval_samples_per_second": 0.594,
3398
+ "eval_steps_per_second": 0.314,
3399
+ "step": 1130
3400
+ },
3401
+ {
3402
+ "epoch": 0.9986801583809943,
3403
+ "grad_norm": 0.18398317694664001,
3404
+ "learning_rate": 4.724627964303175e-10,
3405
+ "loss": 0.4832,
3406
+ "step": 1135
3407
+ },
3408
+ {
3409
+ "epoch": 0.9986801583809943,
3410
+ "eval_loss": 0.5010104179382324,
3411
+ "eval_runtime": 28.6106,
3412
+ "eval_samples_per_second": 0.594,
3413
+ "eval_steps_per_second": 0.315,
3414
+ "step": 1135
3415
+ },
3416
+ {
3417
+ "epoch": 0.9995600527936648,
3418
+ "step": 1136,
3419
+ "total_flos": 7.211600370336793e+18,
3420
+ "train_loss": 0.039691918463984004,
3421
+ "train_runtime": 9596.3839,
3422
+ "train_samples_per_second": 1.895,
3423
+ "train_steps_per_second": 0.118
3424
+ }
3425
+ ],
3426
+ "logging_steps": 5,
3427
+ "max_steps": 1136,
3428
+ "num_input_tokens_seen": 0,
3429
+ "num_train_epochs": 1,
3430
+ "save_steps": 5,
3431
+ "stateful_callbacks": {
3432
+ "TrainerControl": {
3433
+ "args": {
3434
+ "should_epoch_stop": false,
3435
+ "should_evaluate": false,
3436
+ "should_log": false,
3437
+ "should_save": true,
3438
+ "should_training_stop": true
3439
+ },
3440
+ "attributes": {}
3441
+ }
3442
+ },
3443
+ "total_flos": 7.211600370336793e+18,
3444
+ "train_batch_size": 8,
3445
+ "trial_name": null,
3446
+ "trial_params": null
3447
+ }