zhangchen1991 committed
Commit 86d3038 · verified · 1 Parent(s): b8c5d5c

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,75 @@
- ---
- license: apache-2.0
- ---
+ ---
+ library_name: transformers
+ license: other
+ base_model: Qwen/Qwen2-7B-Instruct
+ tags:
+ - llama-factory
+ - full
+ - generated_from_trainer
+ model-index:
+ - name: ideaassistant_qwen2_7b
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # ideaassistant_qwen2_7b
+
+ This model is a fine-tuned version of [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) on the idea_sharegpt_format dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.4394
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 1e-05
+ - train_batch_size: 2
+ - eval_batch_size: 1
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 64
+ - total_eval_batch_size: 8
+ - optimizer: adamw_torch with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 1.0
+
+ ### Training results
+
+ | Training Loss | Epoch  | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 1.0484        | 0.0955 | 200  | 1.0037          |
+ | 0.9828        | 0.1910 | 400  | 0.9110          |
+ | 0.8785        | 0.2865 | 600  | 0.8167          |
+ | 0.7863        | 0.3820 | 800  | 0.7238          |
+ | 0.7073        | 0.4776 | 1000 | 0.6407          |
+ | 0.6143        | 0.5731 | 1200 | 0.5672          |
+ | 0.5726        | 0.6686 | 1400 | 0.5096          |
+ | 0.5623        | 0.7641 | 1600 | 0.4683          |
+ | 0.5206        | 0.8596 | 1800 | 0.4465          |
+ | 0.5054        | 0.9551 | 2000 | 0.4396          |
+
+
+ ### Framework versions
+
+ - Transformers 4.50.0
+ - Pytorch 2.6.0+cu124
+ - Datasets 3.4.1
+ - Tokenizers 0.21.0
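The effective batch size of 64 above follows from train_batch_size 2 × num_devices 8 × gradient_accumulation_steps 4. The card omits a usage snippet; below is a minimal inference sketch using the standard transformers chat workflow (the repo id is hypothetical; substitute the actual upload path):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "zhangchen1991/ideaassistant_qwen2_7b"  # hypothetical repo id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

messages = [{"role": "user", "content": "Propose a research idea on data selection for SFT."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# Sampling defaults (temperature 0.7, top_p 0.8, top_k 20) ship in generation_config.json below.
output = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```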
added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "<|endoftext|>": 151643,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644
+ }
all_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "epoch": 1.0,
+   "eval_loss": 0.4393579959869385,
+   "eval_runtime": 534.2651,
+   "eval_samples_per_second": 27.87,
+   "eval_steps_per_second": 3.485,
+   "total_flos": 985274787299328.0,
+   "train_loss": 0.7358792713970487,
+   "train_runtime": 28849.9475,
+   "train_samples_per_second": 4.645,
+   "train_steps_per_second": 0.073
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "epoch": 1.0,
+   "eval_loss": 0.4393579959869385,
+   "eval_runtime": 534.2651,
+   "eval_samples_per_second": 27.87,
+   "eval_steps_per_second": 3.485
+ }
generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.05,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "4.50.0"
+ }
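These sampling defaults load with the model automatically; a sketch of building the same configuration explicitly via the standard transformers GenerationConfig API, e.g. to override a single field per call (values mirror the JSON above):

```python
from transformers import GenerationConfig

gen_config = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    repetition_penalty=1.05,
    bos_token_id=151643,
    eos_token_id=[151645, 151643],
    pad_token_id=151643,
)
# usage: model.generate(input_ids, generation_config=gen_config)
```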
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c6aee5d20fa9a0dc1bfef370d2fd400837714c7cf586fb748b2a2d656ec4bad
+ size 4877660776
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab06a6a05df0ce149a2e2ceef9cce715d4b2caadbca077a6b700a96c340a87fc
+ size 4932751008
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf9b9da015353abee9a4903dd91c3d0fb086cd3f46f5aa733bddc2d119d146b3
+ size 4330865200
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bcbb423005c5cb3dd483c10af534638f640637ddbb75f0ff02d8c5026a283787
+ size 1089994880
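The four entries above are Git LFS pointer files rather than the weights themselves: `version` names the pointer spec, `oid` is the SHA-256 of the actual shard, and `size` is its byte count. A sketch for verifying a downloaded shard against its pointer (file name and oid taken from shard 1 above):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file through SHA-256 so multi-GB shards never load whole."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

expected = "1c6aee5d20fa9a0dc1bfef370d2fd400837714c7cf586fb748b2a2d656ec4bad"
assert sha256_of("model-00001-of-00004.safetensors") == expected
```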
model.safetensors.index.json ADDED
@@ -0,0 +1,346 @@
+ {
+   "metadata": {
+     "total_size": 15231233024
+   },
+   "weight_map": {
+     "lm_head.weight": "model-00004-of-00004.safetensors",
+     "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.norm.weight": "model-00003-of-00004.safetensors"
+   }
+ }
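The `total_size` of 15,231,233,024 bytes is consistent with roughly 7.6B parameters in 16-bit precision, and the `weight_map` tells loaders which shard holds each tensor so only the needed file is opened. A sketch of resolving one tensor through the index (paths assume the shards sit in the current directory):

```python
import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.0.mlp.down_proj.weight"
shard = index["weight_map"][name]  # "model-00001-of-00004.safetensors"

with safe_open(shard, framework="pt", device="cpu") as f:
    tensor = f.get_tensor(name)
print(tensor.shape)
```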
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
+ size 11418266
tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
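The `chat_template` above is plain ChatML: it injects a default system prompt when the conversation does not start with one, wraps every turn in `<|im_start|>`/`<|im_end|>`, and appends an open assistant turn when generation is requested. A sketch of rendering it to text, using the base model's tokenizer (assumed to ship the same template):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
text = tok.apply_chat_template(
    [{"role": "user", "content": "Hello"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(text)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello<|im_end|>
# <|im_start|>assistant
```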
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 1.0,
+   "total_flos": 985274787299328.0,
+   "train_loss": 0.7358792713970487,
+   "train_runtime": 28849.9475,
+   "train_samples_per_second": 4.645,
+   "train_steps_per_second": 0.073
+ }
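The trainer log that follows is JSON Lines: one record every 10 optimizer steps, plus an `eval_loss` record at each 200-step evaluation (0.073 steps/s over the 28,850 s run is consistent with the 2,094 total steps). A sketch for pulling out the two loss curves:

```python
import json

train_curve, eval_curve = [], []
with open("trainer_log.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        if "eval_loss" in rec:
            eval_curve.append((rec["current_steps"], rec["eval_loss"]))
        elif "loss" in rec:
            train_curve.append((rec["current_steps"], rec["loss"]))

print(eval_curve[:3])  # [(200, 1.0037...), (400, 0.9110...), (600, 0.8167...)]
```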
trainer_log.jsonl ADDED
@@ -0,0 +1,220 @@
+ {"current_steps": 10, "total_steps": 2094, "loss": 1.1507, "lr": 4.7619047619047623e-07, "epoch": 0.004775549188156638, "percentage": 0.48, "elapsed_time": "0:01:52", "remaining_time": "6:29:13"}
+ {"current_steps": 20, "total_steps": 2094, "loss": 1.1084, "lr": 9.523809523809525e-07, "epoch": 0.009551098376313277, "percentage": 0.96, "elapsed_time": "0:03:43", "remaining_time": "6:26:27"}
+ {"current_steps": 30, "total_steps": 2094, "loss": 1.0874, "lr": 1.4285714285714286e-06, "epoch": 0.014326647564469915, "percentage": 1.43, "elapsed_time": "0:05:35", "remaining_time": "6:24:42"}
+ {"current_steps": 40, "total_steps": 2094, "loss": 1.1068, "lr": 1.904761904761905e-06, "epoch": 0.019102196752626553, "percentage": 1.91, "elapsed_time": "0:07:29", "remaining_time": "6:24:35"}
+ {"current_steps": 50, "total_steps": 2094, "loss": 1.1152, "lr": 2.380952380952381e-06, "epoch": 0.02387774594078319, "percentage": 2.39, "elapsed_time": "0:09:14", "remaining_time": "6:17:32"}
+ {"current_steps": 60, "total_steps": 2094, "loss": 1.0964, "lr": 2.8571428571428573e-06, "epoch": 0.02865329512893983, "percentage": 2.87, "elapsed_time": "0:11:00", "remaining_time": "6:13:18"}
+ {"current_steps": 70, "total_steps": 2094, "loss": 1.1005, "lr": 3.3333333333333333e-06, "epoch": 0.033428844317096466, "percentage": 3.34, "elapsed_time": "0:12:52", "remaining_time": "6:12:11"}
+ {"current_steps": 80, "total_steps": 2094, "loss": 1.0933, "lr": 3.80952380952381e-06, "epoch": 0.038204393505253106, "percentage": 3.82, "elapsed_time": "0:14:41", "remaining_time": "6:09:49"}
+ {"current_steps": 90, "total_steps": 2094, "loss": 1.0397, "lr": 4.2857142857142855e-06, "epoch": 0.04297994269340974, "percentage": 4.3, "elapsed_time": "0:16:39", "remaining_time": "6:11:03"}
+ {"current_steps": 100, "total_steps": 2094, "loss": 1.0723, "lr": 4.761904761904762e-06, "epoch": 0.04775549188156638, "percentage": 4.78, "elapsed_time": "0:18:33", "remaining_time": "6:10:08"}
+ {"current_steps": 110, "total_steps": 2094, "loss": 1.0753, "lr": 5.2380952380952384e-06, "epoch": 0.05253104106972302, "percentage": 5.25, "elapsed_time": "0:20:22", "remaining_time": "6:07:36"}
+ {"current_steps": 120, "total_steps": 2094, "loss": 1.0604, "lr": 5.7142857142857145e-06, "epoch": 0.05730659025787966, "percentage": 5.73, "elapsed_time": "0:22:03", "remaining_time": "6:02:43"}
+ {"current_steps": 130, "total_steps": 2094, "loss": 1.0844, "lr": 6.1904761904761914e-06, "epoch": 0.06208213944603629, "percentage": 6.21, "elapsed_time": "0:23:55", "remaining_time": "6:01:23"}
+ {"current_steps": 140, "total_steps": 2094, "loss": 1.0738, "lr": 6.666666666666667e-06, "epoch": 0.06685768863419293, "percentage": 6.69, "elapsed_time": "0:25:53", "remaining_time": "6:01:26"}
+ {"current_steps": 150, "total_steps": 2094, "loss": 1.0638, "lr": 7.1428571428571436e-06, "epoch": 0.07163323782234957, "percentage": 7.16, "elapsed_time": "0:27:48", "remaining_time": "6:00:23"}
+ {"current_steps": 160, "total_steps": 2094, "loss": 1.089, "lr": 7.61904761904762e-06, "epoch": 0.07640878701050621, "percentage": 7.64, "elapsed_time": "0:29:41", "remaining_time": "5:58:52"}
+ {"current_steps": 170, "total_steps": 2094, "loss": 1.0215, "lr": 8.095238095238097e-06, "epoch": 0.08118433619866285, "percentage": 8.12, "elapsed_time": "0:31:32", "remaining_time": "5:57:01"}
+ {"current_steps": 180, "total_steps": 2094, "loss": 1.0489, "lr": 8.571428571428571e-06, "epoch": 0.08595988538681948, "percentage": 8.6, "elapsed_time": "0:33:24", "remaining_time": "5:55:10"}
+ {"current_steps": 190, "total_steps": 2094, "loss": 1.047, "lr": 9.047619047619049e-06, "epoch": 0.09073543457497613, "percentage": 9.07, "elapsed_time": "0:35:19", "remaining_time": "5:53:56"}
+ {"current_steps": 200, "total_steps": 2094, "loss": 1.0484, "lr": 9.523809523809525e-06, "epoch": 0.09551098376313276, "percentage": 9.55, "elapsed_time": "0:37:13", "remaining_time": "5:52:32"}
+ {"current_steps": 200, "total_steps": 2094, "eval_loss": 1.0037423372268677, "epoch": 0.09551098376313276, "percentage": 9.55, "elapsed_time": "0:46:07", "remaining_time": "7:16:52"}
+ {"current_steps": 210, "total_steps": 2094, "loss": 1.0786, "lr": 1e-05, "epoch": 0.10028653295128939, "percentage": 10.03, "elapsed_time": "0:48:26", "remaining_time": "7:14:37"}
+ {"current_steps": 220, "total_steps": 2094, "loss": 1.0568, "lr": 9.99930486701988e-06, "epoch": 0.10506208213944604, "percentage": 10.51, "elapsed_time": "0:50:23", "remaining_time": "7:09:14"}
+ {"current_steps": 230, "total_steps": 2094, "loss": 1.0264, "lr": 9.99721966136347e-06, "epoch": 0.10983763132760267, "percentage": 10.98, "elapsed_time": "0:52:22", "remaining_time": "7:04:26"}
+ {"current_steps": 240, "total_steps": 2094, "loss": 1.0093, "lr": 9.99374496282885e-06, "epoch": 0.11461318051575932, "percentage": 11.46, "elapsed_time": "0:54:10", "remaining_time": "6:58:32"}
+ {"current_steps": 250, "total_steps": 2094, "loss": 1.0172, "lr": 9.988881737567046e-06, "epoch": 0.11938872970391595, "percentage": 11.94, "elapsed_time": "0:56:04", "remaining_time": "6:53:35"}
+ {"current_steps": 260, "total_steps": 2094, "loss": 1.0369, "lr": 9.982631337813363e-06, "epoch": 0.12416427889207259, "percentage": 12.42, "elapsed_time": "0:57:54", "remaining_time": "6:48:28"}
+ {"current_steps": 270, "total_steps": 2094, "loss": 0.998, "lr": 9.974995501511404e-06, "epoch": 0.12893982808022922, "percentage": 12.89, "elapsed_time": "0:59:43", "remaining_time": "6:43:26"}
+ {"current_steps": 280, "total_steps": 2094, "loss": 1.0232, "lr": 9.965976351829827e-06, "epoch": 0.13371537726838587, "percentage": 13.37, "elapsed_time": "1:01:30", "remaining_time": "6:38:26"}
+ {"current_steps": 290, "total_steps": 2094, "loss": 1.0093, "lr": 9.95557639657199e-06, "epoch": 0.1384909264565425, "percentage": 13.85, "elapsed_time": "1:03:20", "remaining_time": "6:34:04"}
+ {"current_steps": 300, "total_steps": 2094, "loss": 1.0048, "lr": 9.943798527478652e-06, "epoch": 0.14326647564469913, "percentage": 14.33, "elapsed_time": "1:05:19", "remaining_time": "6:30:38"}
+ {"current_steps": 310, "total_steps": 2094, "loss": 0.9838, "lr": 9.930646019423909e-06, "epoch": 0.14804202483285578, "percentage": 14.8, "elapsed_time": "1:07:07", "remaining_time": "6:26:19"}
+ {"current_steps": 320, "total_steps": 2094, "loss": 0.9935, "lr": 9.916122529504605e-06, "epoch": 0.15281757402101243, "percentage": 15.28, "elapsed_time": "1:08:56", "remaining_time": "6:22:08"}
+ {"current_steps": 330, "total_steps": 2094, "loss": 0.9834, "lr": 9.900232096023478e-06, "epoch": 0.15759312320916904, "percentage": 15.76, "elapsed_time": "1:10:46", "remaining_time": "6:18:17"}
+ {"current_steps": 340, "total_steps": 2094, "loss": 0.9638, "lr": 9.882979137366275e-06, "epoch": 0.1623686723973257, "percentage": 16.24, "elapsed_time": "1:12:37", "remaining_time": "6:14:38"}
+ {"current_steps": 350, "total_steps": 2094, "loss": 0.983, "lr": 9.864368450773227e-06, "epoch": 0.16714422158548234, "percentage": 16.71, "elapsed_time": "1:14:31", "remaining_time": "6:11:22"}
+ {"current_steps": 360, "total_steps": 2094, "loss": 0.9759, "lr": 9.844405211005145e-06, "epoch": 0.17191977077363896, "percentage": 17.19, "elapsed_time": "1:16:21", "remaining_time": "6:07:47"}
+ {"current_steps": 370, "total_steps": 2094, "loss": 0.9927, "lr": 9.823094968904572e-06, "epoch": 0.1766953199617956, "percentage": 17.67, "elapsed_time": "1:18:09", "remaining_time": "6:04:09"}
+ {"current_steps": 380, "total_steps": 2094, "loss": 0.985, "lr": 9.800443649852347e-06, "epoch": 0.18147086914995225, "percentage": 18.15, "elapsed_time": "1:20:01", "remaining_time": "6:00:57"}
+ {"current_steps": 390, "total_steps": 2094, "loss": 0.9578, "lr": 9.776457552120034e-06, "epoch": 0.18624641833810887, "percentage": 18.62, "elapsed_time": "1:21:50", "remaining_time": "5:57:35"}
+ {"current_steps": 400, "total_steps": 2094, "loss": 0.9828, "lr": 9.751143345118675e-06, "epoch": 0.19102196752626552, "percentage": 19.1, "elapsed_time": "1:23:38", "remaining_time": "5:54:11"}
+ {"current_steps": 400, "total_steps": 2094, "eval_loss": 0.911033034324646, "epoch": 0.19102196752626552, "percentage": 19.1, "elapsed_time": "1:32:32", "remaining_time": "6:31:53"}
+ {"current_steps": 410, "total_steps": 2094, "loss": 0.9593, "lr": 9.724508067544328e-06, "epoch": 0.19579751671442217, "percentage": 19.58, "elapsed_time": "1:34:50", "remaining_time": "6:29:32"}
+ {"current_steps": 420, "total_steps": 2094, "loss": 0.9342, "lr": 9.696559125420949e-06, "epoch": 0.20057306590257878, "percentage": 20.06, "elapsed_time": "1:36:42", "remaining_time": "6:25:26"}
+ {"current_steps": 430, "total_steps": 2094, "loss": 0.9182, "lr": 9.667304290041102e-06, "epoch": 0.20534861509073543, "percentage": 20.53, "elapsed_time": "1:38:39", "remaining_time": "6:21:47"}
+ {"current_steps": 440, "total_steps": 2094, "loss": 0.9399, "lr": 9.636751695805154e-06, "epoch": 0.21012416427889208, "percentage": 21.01, "elapsed_time": "1:40:29", "remaining_time": "6:17:43"}
+ {"current_steps": 450, "total_steps": 2094, "loss": 0.9546, "lr": 9.604909837959456e-06, "epoch": 0.2148997134670487, "percentage": 21.49, "elapsed_time": "1:42:14", "remaining_time": "6:13:33"}
+ {"current_steps": 460, "total_steps": 2094, "loss": 0.9493, "lr": 9.57178757023422e-06, "epoch": 0.21967526265520534, "percentage": 21.97, "elapsed_time": "1:43:58", "remaining_time": "6:09:21"}
+ {"current_steps": 470, "total_steps": 2094, "loss": 0.951, "lr": 9.537394102381719e-06, "epoch": 0.224450811843362, "percentage": 22.45, "elapsed_time": "1:45:52", "remaining_time": "6:05:48"}
+ {"current_steps": 480, "total_steps": 2094, "loss": 0.902, "lr": 9.501738997615471e-06, "epoch": 0.22922636103151864, "percentage": 22.92, "elapsed_time": "1:47:42", "remaining_time": "6:02:10"}
+ {"current_steps": 490, "total_steps": 2094, "loss": 0.9121, "lr": 9.464832169951171e-06, "epoch": 0.23400191021967526, "percentage": 23.4, "elapsed_time": "1:49:36", "remaining_time": "5:58:47"}
+ {"current_steps": 500, "total_steps": 2094, "loss": 0.9149, "lr": 9.426683881450058e-06, "epoch": 0.2387774594078319, "percentage": 23.88, "elapsed_time": "1:51:23", "remaining_time": "5:55:05"}
+ {"current_steps": 510, "total_steps": 2094, "loss": 0.9141, "lr": 9.387304739365524e-06, "epoch": 0.24355300859598855, "percentage": 24.36, "elapsed_time": "1:53:23", "remaining_time": "5:52:10"}
+ {"current_steps": 520, "total_steps": 2094, "loss": 0.9046, "lr": 9.346705693193722e-06, "epoch": 0.24832855778414517, "percentage": 24.83, "elapsed_time": "1:55:12", "remaining_time": "5:48:44"}
+ {"current_steps": 530, "total_steps": 2094, "loss": 0.907, "lr": 9.304898031629038e-06, "epoch": 0.2531041069723018, "percentage": 25.31, "elapsed_time": "1:57:02", "remaining_time": "5:45:23"}
+ {"current_steps": 540, "total_steps": 2094, "loss": 0.9095, "lr": 9.261893379425218e-06, "epoch": 0.25787965616045844, "percentage": 25.79, "elapsed_time": "1:58:48", "remaining_time": "5:41:54"}
+ {"current_steps": 550, "total_steps": 2094, "loss": 0.8811, "lr": 9.217703694163083e-06, "epoch": 0.2626552053486151, "percentage": 26.27, "elapsed_time": "2:00:41", "remaining_time": "5:38:48"}
+ {"current_steps": 560, "total_steps": 2094, "loss": 0.8743, "lr": 9.172341262925675e-06, "epoch": 0.26743075453677173, "percentage": 26.74, "elapsed_time": "2:02:34", "remaining_time": "5:35:47"}
+ {"current_steps": 570, "total_steps": 2094, "loss": 0.8659, "lr": 9.125818698881798e-06, "epoch": 0.2722063037249284, "percentage": 27.22, "elapsed_time": "2:04:24", "remaining_time": "5:32:36"}
+ {"current_steps": 580, "total_steps": 2094, "loss": 0.906, "lr": 9.078148937778889e-06, "epoch": 0.276981852913085, "percentage": 27.7, "elapsed_time": "2:06:14", "remaining_time": "5:29:31"}
+ {"current_steps": 590, "total_steps": 2094, "loss": 0.8859, "lr": 9.029345234346183e-06, "epoch": 0.2817574021012416, "percentage": 28.18, "elapsed_time": "2:08:07", "remaining_time": "5:26:36"}
+ {"current_steps": 600, "total_steps": 2094, "loss": 0.8785, "lr": 8.979421158609206e-06, "epoch": 0.28653295128939826, "percentage": 28.65, "elapsed_time": "2:09:55", "remaining_time": "5:23:31"}
+ {"current_steps": 600, "total_steps": 2094, "eval_loss": 0.8167237639427185, "epoch": 0.28653295128939826, "percentage": 28.65, "elapsed_time": "2:18:50", "remaining_time": "5:45:41"}
+ {"current_steps": 610, "total_steps": 2094, "loss": 0.8539, "lr": 8.928390592116576e-06, "epoch": 0.2913085004775549, "percentage": 29.13, "elapsed_time": "2:21:01", "remaining_time": "5:43:05"}
+ {"current_steps": 620, "total_steps": 2094, "loss": 0.8527, "lr": 8.876267724080197e-06, "epoch": 0.29608404966571156, "percentage": 29.61, "elapsed_time": "2:22:47", "remaining_time": "5:39:29"}
+ {"current_steps": 630, "total_steps": 2094, "loss": 0.8683, "lr": 8.823067047429908e-06, "epoch": 0.3008595988538682, "percentage": 30.09, "elapsed_time": "2:24:35", "remaining_time": "5:35:59"}
+ {"current_steps": 640, "total_steps": 2094, "loss": 0.8649, "lr": 8.768803354783668e-06, "epoch": 0.30563514804202485, "percentage": 30.56, "elapsed_time": "2:26:19", "remaining_time": "5:32:26"}
+ {"current_steps": 650, "total_steps": 2094, "loss": 0.8622, "lr": 8.71349173433443e-06, "epoch": 0.3104106972301815, "percentage": 31.04, "elapsed_time": "2:28:06", "remaining_time": "5:29:01"}
+ {"current_steps": 660, "total_steps": 2094, "loss": 0.8359, "lr": 8.65714756565482e-06, "epoch": 0.3151862464183381, "percentage": 31.52, "elapsed_time": "2:29:51", "remaining_time": "5:25:35"}
+ {"current_steps": 670, "total_steps": 2094, "loss": 0.8569, "lr": 8.599786515420789e-06, "epoch": 0.31996179560649474, "percentage": 32.0, "elapsed_time": "2:31:42", "remaining_time": "5:22:26"}
+ {"current_steps": 680, "total_steps": 2094, "loss": 0.8458, "lr": 8.541424533055455e-06, "epoch": 0.3247373447946514, "percentage": 32.47, "elapsed_time": "2:33:32", "remaining_time": "5:19:16"}
+ {"current_steps": 690, "total_steps": 2094, "loss": 0.8447, "lr": 8.48207784629431e-06, "epoch": 0.32951289398280803, "percentage": 32.95, "elapsed_time": "2:35:23", "remaining_time": "5:16:11"}
+ {"current_steps": 700, "total_steps": 2094, "loss": 0.8365, "lr": 8.421762956673043e-06, "epoch": 0.3342884431709647, "percentage": 33.43, "elapsed_time": "2:37:16", "remaining_time": "5:13:13"}
+ {"current_steps": 710, "total_steps": 2094, "loss": 0.8335, "lr": 8.360496634939243e-06, "epoch": 0.3390639923591213, "percentage": 33.91, "elapsed_time": "2:39:07", "remaining_time": "5:10:11"}
+ {"current_steps": 720, "total_steps": 2094, "loss": 0.8458, "lr": 8.298295916389234e-06, "epoch": 0.3438395415472779, "percentage": 34.38, "elapsed_time": "2:41:01", "remaining_time": "5:07:17"}
+ {"current_steps": 730, "total_steps": 2094, "loss": 0.8185, "lr": 8.235178096131355e-06, "epoch": 0.34861509073543456, "percentage": 34.86, "elapsed_time": "2:42:50", "remaining_time": "5:04:16"}
+ {"current_steps": 740, "total_steps": 2094, "loss": 0.8009, "lr": 8.171160724277005e-06, "epoch": 0.3533906399235912, "percentage": 35.34, "elapsed_time": "2:44:38", "remaining_time": "5:01:15"}
+ {"current_steps": 750, "total_steps": 2094, "loss": 0.8277, "lr": 8.106261601060773e-06, "epoch": 0.35816618911174786, "percentage": 35.82, "elapsed_time": "2:46:27", "remaining_time": "4:58:16"}
+ {"current_steps": 760, "total_steps": 2094, "loss": 0.8432, "lr": 8.040498771891031e-06, "epoch": 0.3629417382999045, "percentage": 36.29, "elapsed_time": "2:48:16", "remaining_time": "4:55:21"}
+ {"current_steps": 770, "total_steps": 2094, "loss": 0.7933, "lr": 7.973890522332348e-06, "epoch": 0.36771728748806115, "percentage": 36.77, "elapsed_time": "2:50:04", "remaining_time": "4:52:26"}
+ {"current_steps": 780, "total_steps": 2094, "loss": 0.8122, "lr": 7.90645537302113e-06, "epoch": 0.37249283667621774, "percentage": 37.25, "elapsed_time": "2:52:03", "remaining_time": "4:49:51"}
+ {"current_steps": 790, "total_steps": 2094, "loss": 0.7713, "lr": 7.838212074515899e-06, "epoch": 0.3772683858643744, "percentage": 37.73, "elapsed_time": "2:53:57", "remaining_time": "4:47:08"}
+ {"current_steps": 800, "total_steps": 2094, "loss": 0.7863, "lr": 7.769179602083642e-06, "epoch": 0.38204393505253104, "percentage": 38.2, "elapsed_time": "2:55:49", "remaining_time": "4:44:24"}
+ {"current_steps": 800, "total_steps": 2094, "eval_loss": 0.7238086462020874, "epoch": 0.38204393505253104, "percentage": 38.2, "elapsed_time": "3:04:44", "remaining_time": "4:58:48"}
+ {"current_steps": 810, "total_steps": 2094, "loss": 0.7703, "lr": 7.699377150423673e-06, "epoch": 0.3868194842406877, "percentage": 38.68, "elapsed_time": "3:07:00", "remaining_time": "4:56:26"}
+ {"current_steps": 820, "total_steps": 2094, "loss": 0.7651, "lr": 7.628824128330485e-06, "epoch": 0.39159503342884433, "percentage": 39.16, "elapsed_time": "3:08:49", "remaining_time": "4:53:22"}
+ {"current_steps": 830, "total_steps": 2094, "loss": 0.777, "lr": 7.557540153297086e-06, "epoch": 0.396370582617001, "percentage": 39.64, "elapsed_time": "3:10:32", "remaining_time": "4:50:10"}
+ {"current_steps": 840, "total_steps": 2094, "loss": 0.7659, "lr": 7.485545046060272e-06, "epoch": 0.40114613180515757, "percentage": 40.11, "elapsed_time": "3:12:26", "remaining_time": "4:47:17"}
+ {"current_steps": 850, "total_steps": 2094, "loss": 0.7422, "lr": 7.412858825089423e-06, "epoch": 0.4059216809933142, "percentage": 40.59, "elapsed_time": "3:14:14", "remaining_time": "4:44:16"}
+ {"current_steps": 860, "total_steps": 2094, "loss": 0.7812, "lr": 7.3395017010202965e-06, "epoch": 0.41069723018147086, "percentage": 41.07, "elapsed_time": "3:16:05", "remaining_time": "4:41:21"}
+ {"current_steps": 870, "total_steps": 2094, "loss": 0.7461, "lr": 7.265494071035401e-06, "epoch": 0.4154727793696275, "percentage": 41.55, "elapsed_time": "3:17:56", "remaining_time": "4:38:28"}
+ {"current_steps": 880, "total_steps": 2094, "loss": 0.7475, "lr": 7.19085651319249e-06, "epoch": 0.42024832855778416, "percentage": 42.02, "elapsed_time": "3:19:50", "remaining_time": "4:35:40"}
+ {"current_steps": 890, "total_steps": 2094, "loss": 0.7485, "lr": 7.115609780702767e-06, "epoch": 0.4250238777459408, "percentage": 42.5, "elapsed_time": "3:21:46", "remaining_time": "4:32:57"}
+ {"current_steps": 900, "total_steps": 2094, "loss": 0.7502, "lr": 7.039774796160391e-06, "epoch": 0.4297994269340974, "percentage": 42.98, "elapsed_time": "3:23:38", "remaining_time": "4:30:10"}
+ {"current_steps": 910, "total_steps": 2094, "loss": 0.7307, "lr": 6.9633726457248864e-06, "epoch": 0.43457497612225404, "percentage": 43.46, "elapsed_time": "3:25:34", "remaining_time": "4:27:29"}
+ {"current_steps": 920, "total_steps": 2094, "loss": 0.7407, "lr": 6.886424573258057e-06, "epoch": 0.4393505253104107, "percentage": 43.94, "elapsed_time": "3:27:26", "remaining_time": "4:24:43"}
+ {"current_steps": 930, "total_steps": 2094, "loss": 0.7232, "lr": 6.808951974417077e-06, "epoch": 0.44412607449856734, "percentage": 44.41, "elapsed_time": "3:29:15", "remaining_time": "4:21:55"}
+ {"current_steps": 940, "total_steps": 2094, "loss": 0.7217, "lr": 6.73097639070535e-06, "epoch": 0.448901623686724, "percentage": 44.89, "elapsed_time": "3:31:04", "remaining_time": "4:19:07"}
+ {"current_steps": 950, "total_steps": 2094, "loss": 0.7275, "lr": 6.652519503482829e-06, "epoch": 0.45367717287488063, "percentage": 45.37, "elapsed_time": "3:32:55", "remaining_time": "4:16:24"}
+ {"current_steps": 960, "total_steps": 2094, "loss": 0.7244, "lr": 6.573603127937443e-06, "epoch": 0.4584527220630373, "percentage": 45.85, "elapsed_time": "3:34:42", "remaining_time": "4:13:37"}
+ {"current_steps": 970, "total_steps": 2094, "loss": 0.7184, "lr": 6.494249207019317e-06, "epoch": 0.46322827125119387, "percentage": 46.32, "elapsed_time": "3:36:41", "remaining_time": "4:11:05"}
+ {"current_steps": 980, "total_steps": 2094, "loss": 0.6887, "lr": 6.414479805339465e-06, "epoch": 0.4680038204393505, "percentage": 46.8, "elapsed_time": "3:38:35", "remaining_time": "4:08:29"}
+ {"current_steps": 990, "total_steps": 2094, "loss": 0.6858, "lr": 6.3343171030346525e-06, "epoch": 0.47277936962750716, "percentage": 47.28, "elapsed_time": "3:40:16", "remaining_time": "4:05:38"}
+ {"current_steps": 1000, "total_steps": 2094, "loss": 0.7073, "lr": 6.253783389600136e-06, "epoch": 0.4775549188156638, "percentage": 47.76, "elapsed_time": "3:42:08", "remaining_time": "4:03:01"}
+ {"current_steps": 1000, "total_steps": 2094, "eval_loss": 0.6407110095024109, "epoch": 0.4775549188156638, "percentage": 47.76, "elapsed_time": "3:51:02", "remaining_time": "4:12:46"}
+ {"current_steps": 1010, "total_steps": 2094, "loss": 0.7207, "lr": 6.172901057692007e-06, "epoch": 0.48233046800382046, "percentage": 48.23, "elapsed_time": "3:53:18", "remaining_time": "4:10:24"}
+ {"current_steps": 1020, "total_steps": 2094, "loss": 0.7363, "lr": 6.0916925969008275e-06, "epoch": 0.4871060171919771, "percentage": 48.71, "elapsed_time": "3:55:05", "remaining_time": "4:07:32"}
+ {"current_steps": 1030, "total_steps": 2094, "loss": 0.6729, "lr": 6.010180587498347e-06, "epoch": 0.4918815663801337, "percentage": 49.19, "elapsed_time": "3:56:53", "remaining_time": "4:04:42"}
+ {"current_steps": 1040, "total_steps": 2094, "loss": 0.6956, "lr": 5.928387694158968e-06, "epoch": 0.49665711556829034, "percentage": 49.67, "elapsed_time": "3:58:47", "remaining_time": "4:02:00"}
+ {"current_steps": 1050, "total_steps": 2094, "loss": 0.6896, "lr": 5.8463366596577706e-06, "epoch": 0.501432664756447, "percentage": 50.14, "elapsed_time": "4:00:34", "remaining_time": "3:59:12"}
+ {"current_steps": 1060, "total_steps": 2094, "loss": 0.6861, "lr": 5.764050298546808e-06, "epoch": 0.5062082139446036, "percentage": 50.62, "elapsed_time": "4:02:19", "remaining_time": "3:56:22"}
+ {"current_steps": 1070, "total_steps": 2094, "loss": 0.6762, "lr": 5.68155149081145e-06, "epoch": 0.5109837631327603, "percentage": 51.1, "elapsed_time": "4:04:10", "remaining_time": "3:53:40"}
+ {"current_steps": 1080, "total_steps": 2094, "loss": 0.6717, "lr": 5.598863175508526e-06, "epoch": 0.5157593123209169, "percentage": 51.58, "elapsed_time": "4:06:02", "remaining_time": "3:50:59"}
+ {"current_steps": 1090, "total_steps": 2094, "loss": 0.6825, "lr": 5.516008344388053e-06, "epoch": 0.5205348615090736, "percentage": 52.05, "elapsed_time": "4:07:55", "remaining_time": "3:48:21"}
+ {"current_steps": 1100, "total_steps": 2094, "loss": 0.6771, "lr": 5.433010035500299e-06, "epoch": 0.5253104106972302, "percentage": 52.53, "elapsed_time": "4:09:48", "remaining_time": "3:45:44"}
+ {"current_steps": 1110, "total_steps": 2094, "loss": 0.674, "lr": 5.3498913267899864e-06, "epoch": 0.5300859598853869, "percentage": 53.01, "elapsed_time": "4:11:44", "remaining_time": "3:43:09"}
+ {"current_steps": 1120, "total_steps": 2094, "loss": 0.6662, "lr": 5.2666753296793895e-06, "epoch": 0.5348615090735435, "percentage": 53.49, "elapsed_time": "4:13:26", "remaining_time": "3:40:23"}
118
+ {"current_steps": 1130, "total_steps": 2094, "loss": 0.6765, "lr": 5.183385182642136e-06, "epoch": 0.5396370582617, "percentage": 53.96, "elapsed_time": "4:15:16", "remaining_time": "3:37:46"}
119
+ {"current_steps": 1140, "total_steps": 2094, "loss": 0.6682, "lr": 5.100044044769472e-06, "epoch": 0.5444126074498568, "percentage": 54.44, "elapsed_time": "4:17:07", "remaining_time": "3:35:10"}
120
+ {"current_steps": 1150, "total_steps": 2094, "loss": 0.6583, "lr": 5.016675089330817e-06, "epoch": 0.5491881566380133, "percentage": 54.92, "elapsed_time": "4:18:56", "remaining_time": "3:32:33"}
121
+ {"current_steps": 1160, "total_steps": 2094, "loss": 0.6456, "lr": 4.933301497330344e-06, "epoch": 0.55396370582617, "percentage": 55.4, "elapsed_time": "4:20:46", "remaining_time": "3:29:58"}
122
+ {"current_steps": 1170, "total_steps": 2094, "loss": 0.673, "lr": 4.849946451061444e-06, "epoch": 0.5587392550143266, "percentage": 55.87, "elapsed_time": "4:22:40", "remaining_time": "3:27:26"}
123
+ {"current_steps": 1180, "total_steps": 2094, "loss": 0.6372, "lr": 4.766633127660805e-06, "epoch": 0.5635148042024832, "percentage": 56.35, "elapsed_time": "4:24:30", "remaining_time": "3:24:52"}
124
+ {"current_steps": 1190, "total_steps": 2094, "loss": 0.6352, "lr": 4.683384692663937e-06, "epoch": 0.5682903533906399, "percentage": 56.83, "elapsed_time": "4:26:21", "remaining_time": "3:22:20"}
125
+ {"current_steps": 1200, "total_steps": 2094, "loss": 0.6143, "lr": 4.600224293563926e-06, "epoch": 0.5730659025787965, "percentage": 57.31, "elapsed_time": "4:28:13", "remaining_time": "3:19:49"}
126
+ {"current_steps": 1200, "total_steps": 2094, "eval_loss": 0.5672308206558228, "epoch": 0.5730659025787965, "percentage": 57.31, "elapsed_time": "4:37:08", "remaining_time": "3:26:27"}
127
+ {"current_steps": 1210, "total_steps": 2094, "loss": 0.6482, "lr": 4.517175053375191e-06, "epoch": 0.5778414517669532, "percentage": 57.78, "elapsed_time": "4:39:20", "remaining_time": "3:24:04"}
128
+ {"current_steps": 1220, "total_steps": 2094, "loss": 0.6244, "lr": 4.434260064204067e-06, "epoch": 0.5826170009551098, "percentage": 58.26, "elapsed_time": "4:41:08", "remaining_time": "3:21:24"}
129
+ {"current_steps": 1230, "total_steps": 2094, "loss": 0.6231, "lr": 4.351502380827959e-06, "epoch": 0.5873925501432665, "percentage": 58.74, "elapsed_time": "4:42:56", "remaining_time": "3:18:44"}
130
+ {"current_steps": 1240, "total_steps": 2094, "loss": 0.6515, "lr": 4.268925014284898e-06, "epoch": 0.5921680993314231, "percentage": 59.22, "elapsed_time": "4:44:47", "remaining_time": "3:16:08"}
131
+ {"current_steps": 1250, "total_steps": 2094, "loss": 0.6027, "lr": 4.18655092547524e-06, "epoch": 0.5969436485195797, "percentage": 59.69, "elapsed_time": "4:46:36", "remaining_time": "3:13:31"}
132
+ {"current_steps": 1260, "total_steps": 2094, "loss": 0.636, "lr": 4.104403018777323e-06, "epoch": 0.6017191977077364, "percentage": 60.17, "elapsed_time": "4:48:28", "remaining_time": "3:10:56"}
133
+ {"current_steps": 1270, "total_steps": 2094, "loss": 0.6356, "lr": 4.022504135678822e-06, "epoch": 0.606494746895893, "percentage": 60.65, "elapsed_time": "4:50:21", "remaining_time": "3:08:23"}
134
+ {"current_steps": 1280, "total_steps": 2094, "loss": 0.6303, "lr": 3.94087704842561e-06, "epoch": 0.6112702960840497, "percentage": 61.13, "elapsed_time": "4:52:07", "remaining_time": "3:05:46"}
135
+ {"current_steps": 1290, "total_steps": 2094, "loss": 0.6181, "lr": 3.859544453689853e-06, "epoch": 0.6160458452722063, "percentage": 61.6, "elapsed_time": "4:54:06", "remaining_time": "3:03:18"}
136
+ {"current_steps": 1300, "total_steps": 2094, "loss": 0.6075, "lr": 3.778528966259137e-06, "epoch": 0.620821394460363, "percentage": 62.08, "elapsed_time": "4:55:54", "remaining_time": "3:00:44"}
137
+ {"current_steps": 1310, "total_steps": 2094, "loss": 0.6106, "lr": 3.697853112748345e-06, "epoch": 0.6255969436485196, "percentage": 62.56, "elapsed_time": "4:57:50", "remaining_time": "2:58:15"}
138
+ {"current_steps": 1320, "total_steps": 2094, "loss": 0.599, "lr": 3.6175393253360704e-06, "epoch": 0.6303724928366762, "percentage": 63.04, "elapsed_time": "4:59:44", "remaining_time": "2:55:45"}
139
+ {"current_steps": 1330, "total_steps": 2094, "loss": 0.5996, "lr": 3.537609935527264e-06, "epoch": 0.6351480420248329, "percentage": 63.51, "elapsed_time": "5:01:34", "remaining_time": "2:53:13"}
140
+ {"current_steps": 1340, "total_steps": 2094, "loss": 0.5867, "lr": 3.458087167943905e-06, "epoch": 0.6399235912129895, "percentage": 63.99, "elapsed_time": "5:03:30", "remaining_time": "2:50:46"}
141
+ {"current_steps": 1350, "total_steps": 2094, "loss": 0.614, "lr": 3.3789931341453564e-06, "epoch": 0.6446991404011462, "percentage": 64.47, "elapsed_time": "5:05:24", "remaining_time": "2:48:18"}
142
+ {"current_steps": 1360, "total_steps": 2094, "loss": 0.5858, "lr": 3.3003498264801915e-06, "epoch": 0.6494746895893028, "percentage": 64.95, "elapsed_time": "5:07:09", "remaining_time": "2:45:46"}
143
+ {"current_steps": 1370, "total_steps": 2094, "loss": 0.6073, "lr": 3.2221791119711372e-06, "epoch": 0.6542502387774594, "percentage": 65.43, "elapsed_time": "5:08:56", "remaining_time": "2:43:16"}
144
+ {"current_steps": 1380, "total_steps": 2094, "loss": 0.598, "lr": 3.144502726234889e-06, "epoch": 0.6590257879656161, "percentage": 65.9, "elapsed_time": "5:10:49", "remaining_time": "2:40:49"}
145
+ {"current_steps": 1390, "total_steps": 2094, "loss": 0.5864, "lr": 3.067342267438446e-06, "epoch": 0.6638013371537727, "percentage": 66.38, "elapsed_time": "5:12:44", "remaining_time": "2:38:23"}
146
+ {"current_steps": 1400, "total_steps": 2094, "loss": 0.5726, "lr": 2.9907191902936773e-06, "epoch": 0.6685768863419294, "percentage": 66.86, "elapsed_time": "5:14:38", "remaining_time": "2:35:58"}
147
+ {"current_steps": 1400, "total_steps": 2094, "eval_loss": 0.5096372961997986, "epoch": 0.6685768863419294, "percentage": 66.86, "elapsed_time": "5:23:32", "remaining_time": "2:40:23"}
148
+ {"current_steps": 1410, "total_steps": 2094, "loss": 0.5678, "lr": 2.914654800091768e-06, "epoch": 0.673352435530086, "percentage": 67.34, "elapsed_time": "5:25:50", "remaining_time": "2:38:04"}
149
+ {"current_steps": 1420, "total_steps": 2094, "loss": 0.5875, "lr": 2.8391702467792137e-06, "epoch": 0.6781279847182426, "percentage": 67.81, "elapsed_time": "5:27:46", "remaining_time": "2:35:34"}
150
+ {"current_steps": 1430, "total_steps": 2094, "loss": 0.5745, "lr": 2.764286519077014e-06, "epoch": 0.6829035339063992, "percentage": 68.29, "elapsed_time": "5:29:35", "remaining_time": "2:33:02"}
151
+ {"current_steps": 1440, "total_steps": 2094, "loss": 0.5748, "lr": 2.6900244386446903e-06, "epoch": 0.6876790830945558, "percentage": 68.77, "elapsed_time": "5:31:24", "remaining_time": "2:30:30"}
152
+ {"current_steps": 1450, "total_steps": 2094, "loss": 0.582, "lr": 2.616404654290752e-06, "epoch": 0.6924546322827125, "percentage": 69.25, "elapsed_time": "5:33:17", "remaining_time": "2:28:01"}
153
+ {"current_steps": 1460, "total_steps": 2094, "loss": 0.5859, "lr": 2.5434476362312375e-06, "epoch": 0.6972301814708691, "percentage": 69.72, "elapsed_time": "5:35:11", "remaining_time": "2:25:33"}
154
+ {"current_steps": 1470, "total_steps": 2094, "loss": 0.5778, "lr": 2.4711736703979015e-06, "epoch": 0.7020057306590258, "percentage": 70.2, "elapsed_time": "5:37:12", "remaining_time": "2:23:08"}
155
+ {"current_steps": 1480, "total_steps": 2094, "loss": 0.5833, "lr": 2.399602852797647e-06, "epoch": 0.7067812798471824, "percentage": 70.68, "elapsed_time": "5:39:03", "remaining_time": "2:20:39"}
156
+ {"current_steps": 1490, "total_steps": 2094, "loss": 0.5677, "lr": 2.3287550839247625e-06, "epoch": 0.711556829035339, "percentage": 71.16, "elapsed_time": "5:40:56", "remaining_time": "2:18:12"}
157
+ {"current_steps": 1500, "total_steps": 2094, "loss": 0.5501, "lr": 2.2586500632275333e-06, "epoch": 0.7163323782234957, "percentage": 71.63, "elapsed_time": "5:42:50", "remaining_time": "2:15:46"}
158
+ {"current_steps": 1510, "total_steps": 2094, "loss": 0.5432, "lr": 2.1893072836307433e-06, "epoch": 0.7211079274116523, "percentage": 72.11, "elapsed_time": "5:44:44", "remaining_time": "2:13:19"}
159
+ {"current_steps": 1520, "total_steps": 2094, "loss": 0.6017, "lr": 2.1207460261156066e-06, "epoch": 0.725883476599809, "percentage": 72.59, "elapsed_time": "5:46:35", "remaining_time": "2:10:53"}
160
+ {"current_steps": 1530, "total_steps": 2094, "loss": 0.5361, "lr": 2.052985354358622e-06, "epoch": 0.7306590257879656, "percentage": 73.07, "elapsed_time": "5:48:26", "remaining_time": "2:08:26"}
161
+ {"current_steps": 1540, "total_steps": 2094, "loss": 0.544, "lr": 1.986044109430869e-06, "epoch": 0.7354345749761223, "percentage": 73.54, "elapsed_time": "5:50:15", "remaining_time": "2:05:59"}
162
+ {"current_steps": 1550, "total_steps": 2094, "loss": 0.5544, "lr": 1.91994090455918e-06, "epoch": 0.7402101241642789, "percentage": 74.02, "elapsed_time": "5:52:04", "remaining_time": "2:03:34"}
163
+ {"current_steps": 1560, "total_steps": 2094, "loss": 0.5743, "lr": 1.8546941199506752e-06, "epoch": 0.7449856733524355, "percentage": 74.5, "elapsed_time": "5:53:57", "remaining_time": "2:01:09"}
164
+ {"current_steps": 1570, "total_steps": 2094, "loss": 0.5516, "lr": 1.790321897682083e-06, "epoch": 0.7497612225405922, "percentage": 74.98, "elapsed_time": "5:55:45", "remaining_time": "1:58:44"}
165
+ {"current_steps": 1580, "total_steps": 2094, "loss": 0.5598, "lr": 1.7268421366552851e-06, "epoch": 0.7545367717287488, "percentage": 75.45, "elapsed_time": "5:57:38", "remaining_time": "1:56:20"}
166
+ {"current_steps": 1590, "total_steps": 2094, "loss": 0.5457, "lr": 1.6642724876204658e-06, "epoch": 0.7593123209169055, "percentage": 75.93, "elapsed_time": "5:59:27", "remaining_time": "1:53:56"}
167
+ {"current_steps": 1600, "total_steps": 2094, "loss": 0.5623, "lr": 1.602630348268267e-06, "epoch": 0.7640878701050621, "percentage": 76.41, "elapsed_time": "6:01:14", "remaining_time": "1:51:32"}
168
+ {"current_steps": 1600, "total_steps": 2094, "eval_loss": 0.46827441453933716, "epoch": 0.7640878701050621, "percentage": 76.41, "elapsed_time": "6:10:08", "remaining_time": "1:54:16"}
169
+ {"current_steps": 1610, "total_steps": 2094, "loss": 0.5522, "lr": 1.541932858392296e-06, "epoch": 0.7688634192932188, "percentage": 76.89, "elapsed_time": "6:12:23", "remaining_time": "1:51:57"}
170
+ {"current_steps": 1620, "total_steps": 2094, "loss": 0.5321, "lr": 1.482196895123364e-06, "epoch": 0.7736389684813754, "percentage": 77.36, "elapsed_time": "6:14:19", "remaining_time": "1:49:31"}
171
+ {"current_steps": 1630, "total_steps": 2094, "loss": 0.5789, "lr": 1.423439068236736e-06, "epoch": 0.778414517669532, "percentage": 77.84, "elapsed_time": "6:16:08", "remaining_time": "1:47:04"}
172
+ {"current_steps": 1640, "total_steps": 2094, "loss": 0.5628, "lr": 1.3656757155337413e-06, "epoch": 0.7831900668576887, "percentage": 78.32, "elapsed_time": "6:17:59", "remaining_time": "1:44:38"}
173
+ {"current_steps": 1650, "total_steps": 2094, "loss": 0.5139, "lr": 1.3089228982989771e-06, "epoch": 0.7879656160458453, "percentage": 78.8, "elapsed_time": "6:19:46", "remaining_time": "1:42:11"}
174
+ {"current_steps": 1660, "total_steps": 2094, "loss": 0.5229, "lr": 1.2531963968344346e-06, "epoch": 0.792741165234002, "percentage": 79.27, "elapsed_time": "6:21:36", "remaining_time": "1:39:46"}
175
+ {"current_steps": 1670, "total_steps": 2094, "loss": 0.5184, "lr": 1.1985117060717278e-06, "epoch": 0.7975167144221585, "percentage": 79.75, "elapsed_time": "6:23:33", "remaining_time": "1:37:22"}
176
+ {"current_steps": 1680, "total_steps": 2094, "loss": 0.5248, "lr": 1.1448840312636812e-06, "epoch": 0.8022922636103151, "percentage": 80.23, "elapsed_time": "6:25:21", "remaining_time": "1:34:57"}
177
+ {"current_steps": 1690, "total_steps": 2094, "loss": 0.5451, "lr": 1.0923282837564537e-06, "epoch": 0.8070678127984718, "percentage": 80.71, "elapsed_time": "6:27:18", "remaining_time": "1:32:35"}
178
+ {"current_steps": 1700, "total_steps": 2094, "loss": 0.522, "lr": 1.0408590768434018e-06, "epoch": 0.8118433619866284, "percentage": 81.18, "elapsed_time": "6:29:11", "remaining_time": "1:30:12"}
179
+ {"current_steps": 1710, "total_steps": 2094, "loss": 0.5143, "lr": 9.904907217018e-07, "epoch": 0.8166189111747851, "percentage": 81.66, "elapsed_time": "6:31:01", "remaining_time": "1:27:48"}
180
+ {"current_steps": 1720, "total_steps": 2094, "loss": 0.5339, "lr": 9.412372234135753e-07, "epoch": 0.8213944603629417, "percentage": 82.14, "elapsed_time": "6:32:50", "remaining_time": "1:25:25"}
181
+ {"current_steps": 1730, "total_steps": 2094, "loss": 0.5326, "lr": 8.931122770711425e-07, "epoch": 0.8261700095510984, "percentage": 82.62, "elapsed_time": "6:34:43", "remaining_time": "1:23:03"}
182
+ {"current_steps": 1740, "total_steps": 2094, "loss": 0.5308, "lr": 8.461292639694519e-07, "epoch": 0.830945558739255, "percentage": 83.09, "elapsed_time": "6:36:40", "remaining_time": "1:20:42"}
183
+ {"current_steps": 1750, "total_steps": 2094, "loss": 0.4943, "lr": 8.003012478852679e-07, "epoch": 0.8357211079274116, "percentage": 83.57, "elapsed_time": "6:38:26", "remaining_time": "1:18:19"}
184
+ {"current_steps": 1760, "total_steps": 2094, "loss": 0.5474, "lr": 7.556409714447488e-07, "epoch": 0.8404966571155683, "percentage": 84.05, "elapsed_time": "6:40:19", "remaining_time": "1:15:58"}
185
+ {"current_steps": 1770, "total_steps": 2094, "loss": 0.5301, "lr": 7.121608525803142e-07, "epoch": 0.8452722063037249, "percentage": 84.53, "elapsed_time": "6:42:09", "remaining_time": "1:13:36"}
186
+ {"current_steps": 1780, "total_steps": 2094, "loss": 0.5302, "lr": 6.698729810778065e-07, "epoch": 0.8500477554918816, "percentage": 85.0, "elapsed_time": "6:44:05", "remaining_time": "1:11:17"}
187
+ {"current_steps": 1790, "total_steps": 2094, "loss": 0.5075, "lr": 6.287891152148823e-07, "epoch": 0.8548233046800382, "percentage": 85.48, "elapsed_time": "6:45:50", "remaining_time": "1:08:55"}
188
+ {"current_steps": 1800, "total_steps": 2094, "loss": 0.5206, "lr": 5.889206784915863e-07, "epoch": 0.8595988538681948, "percentage": 85.96, "elapsed_time": "6:47:43", "remaining_time": "1:06:35"}
189
+ {"current_steps": 1800, "total_steps": 2094, "eval_loss": 0.44646286964416504, "epoch": 0.8595988538681948, "percentage": 85.96, "elapsed_time": "6:56:37", "remaining_time": "1:08:02"}
190
+ {"current_steps": 1810, "total_steps": 2094, "loss": 0.5305, "lr": 5.502787564540102e-07, "epoch": 0.8643744030563515, "percentage": 86.44, "elapsed_time": "6:58:49", "remaining_time": "1:05:42"}
191
+ {"current_steps": 1820, "total_steps": 2094, "loss": 0.5115, "lr": 5.128740936119242e-07, "epoch": 0.8691499522445081, "percentage": 86.91, "elapsed_time": "7:00:41", "remaining_time": "1:03:20"}
192
+ {"current_steps": 1830, "total_steps": 2094, "loss": 0.501, "lr": 4.7671709045122914e-07, "epoch": 0.8739255014326648, "percentage": 87.39, "elapsed_time": "7:02:29", "remaining_time": "1:00:56"}
193
+ {"current_steps": 1840, "total_steps": 2094, "loss": 0.5316, "lr": 4.4181780054206925e-07, "epoch": 0.8787010506208214, "percentage": 87.87, "elapsed_time": "7:04:25", "remaining_time": "0:58:35"}
194
+ {"current_steps": 1850, "total_steps": 2094, "loss": 0.5084, "lr": 4.081859277434025e-07, "epoch": 0.8834765998089781, "percentage": 88.35, "elapsed_time": "7:06:14", "remaining_time": "0:56:13"}
195
+ {"current_steps": 1860, "total_steps": 2094, "loss": 0.4988, "lr": 3.758308235048158e-07, "epoch": 0.8882521489971347, "percentage": 88.83, "elapsed_time": "7:07:58", "remaining_time": "0:53:50"}
196
+ {"current_steps": 1870, "total_steps": 2094, "loss": 0.5248, "lr": 3.4476148426632215e-07, "epoch": 0.8930276981852913, "percentage": 89.3, "elapsed_time": "7:09:54", "remaining_time": "0:51:29"}
197
+ {"current_steps": 1880, "total_steps": 2094, "loss": 0.5263, "lr": 3.1498654895687095e-07, "epoch": 0.897803247373448, "percentage": 89.78, "elapsed_time": "7:11:47", "remaining_time": "0:49:09"}
198
+ {"current_steps": 1890, "total_steps": 2094, "loss": 0.5129, "lr": 2.8651429659226906e-07, "epoch": 0.9025787965616046, "percentage": 90.26, "elapsed_time": "7:13:39", "remaining_time": "0:46:48"}
199
+ {"current_steps": 1900, "total_steps": 2094, "loss": 0.5033, "lr": 2.593526439731697e-07, "epoch": 0.9073543457497613, "percentage": 90.74, "elapsed_time": "7:15:34", "remaining_time": "0:44:28"}
200
+ {"current_steps": 1910, "total_steps": 2094, "loss": 0.5157, "lr": 2.3350914348378606e-07, "epoch": 0.9121298949379179, "percentage": 91.21, "elapsed_time": "7:17:21", "remaining_time": "0:42:07"}
201
+ {"current_steps": 1920, "total_steps": 2094, "loss": 0.5158, "lr": 2.0899098099192273e-07, "epoch": 0.9169054441260746, "percentage": 91.69, "elapsed_time": "7:19:05", "remaining_time": "0:39:47"}
202
+ {"current_steps": 1930, "total_steps": 2094, "loss": 0.5145, "lr": 1.8580497385092376e-07, "epoch": 0.9216809933142311, "percentage": 92.17, "elapsed_time": "7:20:54", "remaining_time": "0:37:27"}
203
+ {"current_steps": 1940, "total_steps": 2094, "loss": 0.5321, "lr": 1.6395756900408454e-07, "epoch": 0.9264565425023877, "percentage": 92.65, "elapsed_time": "7:22:44", "remaining_time": "0:35:08"}
204
+ {"current_steps": 1950, "total_steps": 2094, "loss": 0.5065, "lr": 1.4345484119206222e-07, "epoch": 0.9312320916905444, "percentage": 93.12, "elapsed_time": "7:24:35", "remaining_time": "0:32:49"}
205
+ {"current_steps": 1960, "total_steps": 2094, "loss": 0.54, "lr": 1.2430249126376913e-07, "epoch": 0.936007640878701, "percentage": 93.6, "elapsed_time": "7:26:22", "remaining_time": "0:30:31"}
206
+ {"current_steps": 1970, "total_steps": 2094, "loss": 0.5084, "lr": 1.065058445912398e-07, "epoch": 0.9407831900668577, "percentage": 94.08, "elapsed_time": "7:28:15", "remaining_time": "0:28:12"}
207
+ {"current_steps": 1980, "total_steps": 2094, "loss": 0.527, "lr": 9.006984958888742e-08, "epoch": 0.9455587392550143, "percentage": 94.56, "elapsed_time": "7:30:05", "remaining_time": "0:25:54"}
208
+ {"current_steps": 1990, "total_steps": 2094, "loss": 0.4929, "lr": 7.499907633758797e-08, "epoch": 0.9503342884431709, "percentage": 95.03, "elapsed_time": "7:31:56", "remaining_time": "0:23:37"}
209
+ {"current_steps": 2000, "total_steps": 2094, "loss": 0.5054, "lr": 6.129771531395045e-08, "epoch": 0.9551098376313276, "percentage": 95.51, "elapsed_time": "7:33:45", "remaining_time": "0:21:19"}
210
+ {"current_steps": 2000, "total_steps": 2094, "eval_loss": 0.43961772322654724, "epoch": 0.9551098376313276, "percentage": 95.51, "elapsed_time": "7:42:39", "remaining_time": "0:21:44"}
211
+ {"current_steps": 2010, "total_steps": 2094, "loss": 0.4983, "lr": 4.896957622514298e-08, "epoch": 0.9598853868194842, "percentage": 95.99, "elapsed_time": "7:44:52", "remaining_time": "0:19:25"}
212
+ {"current_steps": 2020, "total_steps": 2094, "loss": 0.5219, "lr": 3.801808694959053e-08, "epoch": 0.9646609360076409, "percentage": 96.47, "elapsed_time": "7:46:45", "remaining_time": "0:17:05"}
213
+ {"current_steps": 2030, "total_steps": 2094, "loss": 0.5397, "lr": 2.8446292583844126e-08, "epoch": 0.9694364851957975, "percentage": 96.94, "elapsed_time": "7:48:34", "remaining_time": "0:14:46"}
214
+ {"current_steps": 2040, "total_steps": 2094, "loss": 0.525, "lr": 2.025685459588145e-08, "epoch": 0.9742120343839542, "percentage": 97.42, "elapsed_time": "7:50:29", "remaining_time": "0:12:27"}
215
+ {"current_steps": 2050, "total_steps": 2094, "loss": 0.5086, "lr": 1.3452050085075441e-08, "epoch": 0.9789875835721108, "percentage": 97.9, "elapsed_time": "7:52:19", "remaining_time": "0:10:08"}
216
+ {"current_steps": 2060, "total_steps": 2094, "loss": 0.5122, "lr": 8.033771149041913e-09, "epoch": 0.9837631327602674, "percentage": 98.38, "elapsed_time": "7:54:07", "remaining_time": "0:07:49"}
217
+ {"current_steps": 2070, "total_steps": 2094, "loss": 0.5168, "lr": 4.003524357534261e-09, "epoch": 0.9885386819484241, "percentage": 98.85, "elapsed_time": "7:55:59", "remaining_time": "0:05:31"}
218
+ {"current_steps": 2080, "total_steps": 2094, "loss": 0.5155, "lr": 1.3624303335380006e-09, "epoch": 0.9933142311365807, "percentage": 99.33, "elapsed_time": "7:57:50", "remaining_time": "0:03:12"}
219
+ {"current_steps": 2090, "total_steps": 2094, "loss": 0.5051, "lr": 1.1122344167613374e-10, "epoch": 0.9980897803247374, "percentage": 99.81, "elapsed_time": "7:59:41", "remaining_time": "0:00:55"}
220
+ {"current_steps": 2094, "total_steps": 2094, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "8:00:49", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,1586 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 200,
7
+ "global_step": 2094,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.004775549188156638,
14
+ "grad_norm": 1.5947464157895972,
15
+ "learning_rate": 4.7619047619047623e-07,
16
+ "loss": 1.1507,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.009551098376313277,
21
+ "grad_norm": 1.0799275908070096,
22
+ "learning_rate": 9.523809523809525e-07,
23
+ "loss": 1.1084,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.014326647564469915,
28
+ "grad_norm": 0.8982107389870104,
29
+ "learning_rate": 1.4285714285714286e-06,
30
+ "loss": 1.0874,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.019102196752626553,
35
+ "grad_norm": 0.7755204372056955,
36
+ "learning_rate": 1.904761904761905e-06,
37
+ "loss": 1.1068,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.02387774594078319,
42
+ "grad_norm": 0.7764375006527714,
43
+ "learning_rate": 2.380952380952381e-06,
44
+ "loss": 1.1152,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.02865329512893983,
49
+ "grad_norm": 0.8635581450994269,
50
+ "learning_rate": 2.8571428571428573e-06,
51
+ "loss": 1.0964,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.033428844317096466,
56
+ "grad_norm": 0.7474505965610809,
57
+ "learning_rate": 3.3333333333333333e-06,
58
+ "loss": 1.1005,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.038204393505253106,
63
+ "grad_norm": 0.7625379001449059,
64
+ "learning_rate": 3.80952380952381e-06,
65
+ "loss": 1.0933,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.04297994269340974,
70
+ "grad_norm": 0.7457628875786683,
71
+ "learning_rate": 4.2857142857142855e-06,
72
+ "loss": 1.0397,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.04775549188156638,
77
+ "grad_norm": 0.7668973425259701,
78
+ "learning_rate": 4.761904761904762e-06,
79
+ "loss": 1.0723,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.05253104106972302,
84
+ "grad_norm": 0.7418999154350163,
85
+ "learning_rate": 5.2380952380952384e-06,
86
+ "loss": 1.0753,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.05730659025787966,
91
+ "grad_norm": 0.7536046873082832,
92
+ "learning_rate": 5.7142857142857145e-06,
93
+ "loss": 1.0604,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.06208213944603629,
98
+ "grad_norm": 0.7126742899879043,
99
+ "learning_rate": 6.1904761904761914e-06,
100
+ "loss": 1.0844,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.06685768863419293,
105
+ "grad_norm": 0.7388477971520834,
106
+ "learning_rate": 6.666666666666667e-06,
107
+ "loss": 1.0738,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.07163323782234957,
112
+ "grad_norm": 0.7063263241327802,
113
+ "learning_rate": 7.1428571428571436e-06,
114
+ "loss": 1.0638,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.07640878701050621,
119
+ "grad_norm": 0.7435364320190926,
120
+ "learning_rate": 7.61904761904762e-06,
121
+ "loss": 1.089,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.08118433619866285,
126
+ "grad_norm": 0.7196795649504337,
127
+ "learning_rate": 8.095238095238097e-06,
128
+ "loss": 1.0215,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.08595988538681948,
133
+ "grad_norm": 0.792285178872838,
134
+ "learning_rate": 8.571428571428571e-06,
135
+ "loss": 1.0489,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.09073543457497613,
140
+ "grad_norm": 0.8765129377706619,
141
+ "learning_rate": 9.047619047619049e-06,
142
+ "loss": 1.047,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.09551098376313276,
147
+ "grad_norm": 0.6926043469847205,
148
+ "learning_rate": 9.523809523809525e-06,
149
+ "loss": 1.0484,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.09551098376313276,
154
+ "eval_loss": 1.0037423372268677,
155
+ "eval_runtime": 534.236,
156
+ "eval_samples_per_second": 27.872,
157
+ "eval_steps_per_second": 3.485,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 0.10028653295128939,
162
+ "grad_norm": 0.7189100769671237,
163
+ "learning_rate": 1e-05,
164
+ "loss": 1.0786,
165
+ "step": 210
166
+ },
167
+ {
168
+ "epoch": 0.10506208213944604,
169
+ "grad_norm": 0.6530269733207105,
170
+ "learning_rate": 9.99930486701988e-06,
171
+ "loss": 1.0568,
172
+ "step": 220
173
+ },
174
+ {
175
+ "epoch": 0.10983763132760267,
176
+ "grad_norm": 0.7951921984643434,
177
+ "learning_rate": 9.99721966136347e-06,
178
+ "loss": 1.0264,
179
+ "step": 230
180
+ },
181
+ {
182
+ "epoch": 0.11461318051575932,
183
+ "grad_norm": 0.7425556549789157,
184
+ "learning_rate": 9.99374496282885e-06,
185
+ "loss": 1.0093,
186
+ "step": 240
187
+ },
188
+ {
189
+ "epoch": 0.11938872970391595,
190
+ "grad_norm": 0.7965678052101759,
191
+ "learning_rate": 9.988881737567046e-06,
192
+ "loss": 1.0172,
193
+ "step": 250
194
+ },
195
+ {
196
+ "epoch": 0.12416427889207259,
197
+ "grad_norm": 0.6836841577113884,
198
+ "learning_rate": 9.982631337813363e-06,
199
+ "loss": 1.0369,
200
+ "step": 260
201
+ },
202
+ {
203
+ "epoch": 0.12893982808022922,
204
+ "grad_norm": 0.7403849280275662,
205
+ "learning_rate": 9.974995501511404e-06,
206
+ "loss": 0.998,
207
+ "step": 270
208
+ },
209
+ {
210
+ "epoch": 0.13371537726838587,
211
+ "grad_norm": 0.8090595134087124,
212
+ "learning_rate": 9.965976351829827e-06,
213
+ "loss": 1.0232,
214
+ "step": 280
215
+ },
216
+ {
217
+ "epoch": 0.1384909264565425,
218
+ "grad_norm": 0.7595821597209347,
219
+ "learning_rate": 9.95557639657199e-06,
220
+ "loss": 1.0093,
221
+ "step": 290
222
+ },
223
+ {
224
+ "epoch": 0.14326647564469913,
225
+ "grad_norm": 0.6967946094456987,
226
+ "learning_rate": 9.943798527478652e-06,
227
+ "loss": 1.0048,
228
+ "step": 300
229
+ },
230
+ {
231
+ "epoch": 0.14804202483285578,
232
+ "grad_norm": 0.8063573400514253,
233
+ "learning_rate": 9.930646019423909e-06,
234
+ "loss": 0.9838,
235
+ "step": 310
236
+ },
237
+ {
238
+ "epoch": 0.15281757402101243,
239
+ "grad_norm": 0.8442835891928957,
240
+ "learning_rate": 9.916122529504605e-06,
241
+ "loss": 0.9935,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 0.15759312320916904,
246
+ "grad_norm": 0.781371439165008,
247
+ "learning_rate": 9.900232096023478e-06,
248
+ "loss": 0.9834,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 0.1623686723973257,
253
+ "grad_norm": 0.8183448490469395,
254
+ "learning_rate": 9.882979137366275e-06,
255
+ "loss": 0.9638,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 0.16714422158548234,
260
+ "grad_norm": 0.7604900553397272,
261
+ "learning_rate": 9.864368450773227e-06,
262
+ "loss": 0.983,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 0.17191977077363896,
267
+ "grad_norm": 0.8396013068852565,
268
+ "learning_rate": 9.844405211005145e-06,
269
+ "loss": 0.9759,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 0.1766953199617956,
274
+ "grad_norm": 0.8365439774930438,
275
+ "learning_rate": 9.823094968904572e-06,
276
+ "loss": 0.9927,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 0.18147086914995225,
281
+ "grad_norm": 0.7930078729758488,
282
+ "learning_rate": 9.800443649852347e-06,
283
+ "loss": 0.985,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 0.18624641833810887,
288
+ "grad_norm": 0.6951067649253269,
289
+ "learning_rate": 9.776457552120034e-06,
290
+ "loss": 0.9578,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 0.19102196752626552,
295
+ "grad_norm": 0.7758966278638689,
296
+ "learning_rate": 9.751143345118675e-06,
297
+ "loss": 0.9828,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 0.19102196752626552,
302
+ "eval_loss": 0.911033034324646,
303
+ "eval_runtime": 534.1402,
304
+ "eval_samples_per_second": 27.877,
305
+ "eval_steps_per_second": 3.486,
306
+ "step": 400
307
+ },
308
+ {
309
+ "epoch": 0.19579751671442217,
310
+ "grad_norm": 0.8178388178466306,
311
+ "learning_rate": 9.724508067544328e-06,
312
+ "loss": 0.9593,
313
+ "step": 410
314
+ },
315
+ {
316
+ "epoch": 0.20057306590257878,
317
+ "grad_norm": 0.7948626800066163,
318
+ "learning_rate": 9.696559125420949e-06,
319
+ "loss": 0.9342,
320
+ "step": 420
321
+ },
322
+ {
323
+ "epoch": 0.20534861509073543,
324
+ "grad_norm": 0.7690821022431013,
325
+ "learning_rate": 9.667304290041102e-06,
326
+ "loss": 0.9182,
327
+ "step": 430
328
+ },
329
+ {
330
+ "epoch": 0.21012416427889208,
331
+ "grad_norm": 0.8011672592566321,
332
+ "learning_rate": 9.636751695805154e-06,
333
+ "loss": 0.9399,
334
+ "step": 440
335
+ },
336
+ {
337
+ "epoch": 0.2148997134670487,
338
+ "grad_norm": 0.7896064611214817,
339
+ "learning_rate": 9.604909837959456e-06,
340
+ "loss": 0.9546,
341
+ "step": 450
342
+ },
343
+ {
344
+ "epoch": 0.21967526265520534,
345
+ "grad_norm": 0.8683971543565645,
346
+ "learning_rate": 9.57178757023422e-06,
347
+ "loss": 0.9493,
348
+ "step": 460
349
+ },
350
+ {
351
+ "epoch": 0.224450811843362,
352
+ "grad_norm": 0.7840423345877752,
353
+ "learning_rate": 9.537394102381719e-06,
354
+ "loss": 0.951,
355
+ "step": 470
356
+ },
357
+ {
358
+ "epoch": 0.22922636103151864,
359
+ "grad_norm": 0.8816713401350009,
360
+ "learning_rate": 9.501738997615471e-06,
361
+ "loss": 0.902,
362
+ "step": 480
363
+ },
364
+ {
365
+ "epoch": 0.23400191021967526,
366
+ "grad_norm": 0.7272663417426294,
367
+ "learning_rate": 9.464832169951171e-06,
368
+ "loss": 0.9121,
369
+ "step": 490
370
+ },
371
+ {
372
+ "epoch": 0.2387774594078319,
373
+ "grad_norm": 0.8166914666826738,
374
+ "learning_rate": 9.426683881450058e-06,
375
+ "loss": 0.9149,
376
+ "step": 500
377
+ },
378
+ {
379
+ "epoch": 0.24355300859598855,
380
+ "grad_norm": 0.806333386634663,
381
+ "learning_rate": 9.387304739365524e-06,
382
+ "loss": 0.9141,
383
+ "step": 510
384
+ },
385
+ {
386
+ "epoch": 0.24832855778414517,
387
+ "grad_norm": 0.8500670140915615,
388
+ "learning_rate": 9.346705693193722e-06,
389
+ "loss": 0.9046,
390
+ "step": 520
391
+ },
392
+ {
393
+ "epoch": 0.2531041069723018,
394
+ "grad_norm": 0.8536991575306397,
395
+ "learning_rate": 9.304898031629038e-06,
396
+ "loss": 0.907,
397
+ "step": 530
398
+ },
399
+ {
400
+ "epoch": 0.25787965616045844,
401
+ "grad_norm": 0.7805889795178423,
402
+ "learning_rate": 9.261893379425218e-06,
403
+ "loss": 0.9095,
404
+ "step": 540
405
+ },
406
+ {
407
+ "epoch": 0.2626552053486151,
408
+ "grad_norm": 0.983470886698574,
409
+ "learning_rate": 9.217703694163083e-06,
410
+ "loss": 0.8811,
411
+ "step": 550
412
+ },
413
+ {
414
+ "epoch": 0.26743075453677173,
415
+ "grad_norm": 0.8247344852185301,
416
+ "learning_rate": 9.172341262925675e-06,
417
+ "loss": 0.8743,
418
+ "step": 560
419
+ },
420
+ {
421
+ "epoch": 0.2722063037249284,
422
+ "grad_norm": 0.7947362543336306,
423
+ "learning_rate": 9.125818698881798e-06,
424
+ "loss": 0.8659,
425
+ "step": 570
426
+ },
427
+ {
428
+ "epoch": 0.276981852913085,
429
+ "grad_norm": 0.7251555329092241,
430
+ "learning_rate": 9.078148937778889e-06,
431
+ "loss": 0.906,
432
+ "step": 580
433
+ },
434
+ {
435
+ "epoch": 0.2817574021012416,
436
+ "grad_norm": 0.789421032185164,
437
+ "learning_rate": 9.029345234346183e-06,
438
+ "loss": 0.8859,
439
+ "step": 590
440
+ },
441
+ {
442
+ "epoch": 0.28653295128939826,
443
+ "grad_norm": 0.8701223156155673,
444
+ "learning_rate": 8.979421158609206e-06,
445
+ "loss": 0.8785,
446
+ "step": 600
447
+ },
448
+ {
449
+ "epoch": 0.28653295128939826,
450
+ "eval_loss": 0.8167237639427185,
451
+ "eval_runtime": 534.2082,
452
+ "eval_samples_per_second": 27.873,
453
+ "eval_steps_per_second": 3.486,
454
+ "step": 600
455
+ },
456
+ {
457
+ "epoch": 0.2913085004775549,
458
+ "grad_norm": 0.8629003851548017,
459
+ "learning_rate": 8.928390592116576e-06,
460
+ "loss": 0.8539,
461
+ "step": 610
462
+ },
463
+ {
464
+ "epoch": 0.29608404966571156,
465
+ "grad_norm": 0.8486389448069024,
466
+ "learning_rate": 8.876267724080197e-06,
467
+ "loss": 0.8527,
468
+ "step": 620
469
+ },
470
+ {
471
+ "epoch": 0.3008595988538682,
472
+ "grad_norm": 0.762838712746454,
473
+ "learning_rate": 8.823067047429908e-06,
474
+ "loss": 0.8683,
475
+ "step": 630
476
+ },
477
+ {
478
+ "epoch": 0.30563514804202485,
479
+ "grad_norm": 0.8625392396534273,
480
+ "learning_rate": 8.768803354783668e-06,
481
+ "loss": 0.8649,
482
+ "step": 640
483
+ },
484
+ {
485
+ "epoch": 0.3104106972301815,
486
+ "grad_norm": 0.8333540473972623,
487
+ "learning_rate": 8.71349173433443e-06,
488
+ "loss": 0.8622,
489
+ "step": 650
490
+ },
491
+ {
492
+ "epoch": 0.3151862464183381,
493
+ "grad_norm": 0.7664644528325345,
494
+ "learning_rate": 8.65714756565482e-06,
495
+ "loss": 0.8359,
496
+ "step": 660
497
+ },
498
+ {
499
+ "epoch": 0.31996179560649474,
500
+ "grad_norm": 0.8563484088758611,
501
+ "learning_rate": 8.599786515420789e-06,
502
+ "loss": 0.8569,
503
+ "step": 670
504
+ },
505
+ {
506
+ "epoch": 0.3247373447946514,
507
+ "grad_norm": 0.8053932610453219,
508
+ "learning_rate": 8.541424533055455e-06,
509
+ "loss": 0.8458,
510
+ "step": 680
511
+ },
512
+ {
513
+ "epoch": 0.32951289398280803,
514
+ "grad_norm": 0.8666024535787029,
515
+ "learning_rate": 8.48207784629431e-06,
516
+ "loss": 0.8447,
517
+ "step": 690
518
+ },
519
+ {
520
+ "epoch": 0.3342884431709647,
521
+ "grad_norm": 0.8868169973573014,
522
+ "learning_rate": 8.421762956673043e-06,
523
+ "loss": 0.8365,
524
+ "step": 700
525
+ },
526
+ {
527
+ "epoch": 0.3390639923591213,
528
+ "grad_norm": 0.8954608340327667,
529
+ "learning_rate": 8.360496634939243e-06,
530
+ "loss": 0.8335,
531
+ "step": 710
532
+ },
533
+ {
534
+ "epoch": 0.3438395415472779,
535
+ "grad_norm": 0.7784372366116787,
536
+ "learning_rate": 8.298295916389234e-06,
537
+ "loss": 0.8458,
538
+ "step": 720
539
+ },
540
+ {
541
+ "epoch": 0.34861509073543456,
542
+ "grad_norm": 0.8749905429657314,
543
+ "learning_rate": 8.235178096131355e-06,
544
+ "loss": 0.8185,
545
+ "step": 730
546
+ },
547
+ {
548
+ "epoch": 0.3533906399235912,
549
+ "grad_norm": 0.8021513738690056,
550
+ "learning_rate": 8.171160724277005e-06,
551
+ "loss": 0.8009,
552
+ "step": 740
553
+ },
554
+ {
555
+ "epoch": 0.35816618911174786,
556
+ "grad_norm": 0.8463663258199511,
557
+ "learning_rate": 8.106261601060773e-06,
558
+ "loss": 0.8277,
559
+ "step": 750
560
+ },
561
+ {
562
+ "epoch": 0.3629417382999045,
563
+ "grad_norm": 0.8443067155200391,
564
+ "learning_rate": 8.040498771891031e-06,
565
+ "loss": 0.8432,
566
+ "step": 760
567
+ },
568
+ {
569
+ "epoch": 0.36771728748806115,
570
+ "grad_norm": 0.8612360765810276,
571
+ "learning_rate": 7.973890522332348e-06,
572
+ "loss": 0.7933,
573
+ "step": 770
574
+ },
575
+ {
576
+ "epoch": 0.37249283667621774,
577
+ "grad_norm": 0.7566639910663036,
578
+ "learning_rate": 7.90645537302113e-06,
579
+ "loss": 0.8122,
580
+ "step": 780
581
+ },
582
+ {
583
+ "epoch": 0.3772683858643744,
584
+ "grad_norm": 0.8598522471784897,
585
+ "learning_rate": 7.838212074515899e-06,
586
+ "loss": 0.7713,
587
+ "step": 790
588
+ },
589
+ {
590
+ "epoch": 0.38204393505253104,
591
+ "grad_norm": 0.8264097384771215,
592
+ "learning_rate": 7.769179602083642e-06,
593
+ "loss": 0.7863,
594
+ "step": 800
595
+ },
596
+ {
597
+ "epoch": 0.38204393505253104,
598
+ "eval_loss": 0.7238086462020874,
599
+ "eval_runtime": 534.3413,
600
+ "eval_samples_per_second": 27.866,
601
+ "eval_steps_per_second": 3.485,
602
+ "step": 800
603
+ },
604
+ {
605
+ "epoch": 0.3868194842406877,
606
+ "grad_norm": 0.8302298544450823,
607
+ "learning_rate": 7.699377150423673e-06,
608
+ "loss": 0.7703,
609
+ "step": 810
610
+ },
611
+ {
612
+ "epoch": 0.39159503342884433,
613
+ "grad_norm": 0.9074490770748711,
614
+ "learning_rate": 7.628824128330485e-06,
615
+ "loss": 0.7651,
616
+ "step": 820
617
+ },
618
+ {
619
+ "epoch": 0.396370582617001,
620
+ "grad_norm": 0.9084786994105255,
621
+ "learning_rate": 7.557540153297086e-06,
622
+ "loss": 0.777,
623
+ "step": 830
624
+ },
625
+ {
626
+ "epoch": 0.40114613180515757,
627
+ "grad_norm": 0.7941270341446054,
628
+ "learning_rate": 7.485545046060272e-06,
629
+ "loss": 0.7659,
630
+ "step": 840
631
+ },
632
+ {
633
+ "epoch": 0.4059216809933142,
634
+ "grad_norm": 0.8418275807952966,
635
+ "learning_rate": 7.412858825089423e-06,
636
+ "loss": 0.7422,
637
+ "step": 850
638
+ },
639
+ {
640
+ "epoch": 0.41069723018147086,
641
+ "grad_norm": 0.8272295917894095,
642
+ "learning_rate": 7.3395017010202965e-06,
643
+ "loss": 0.7812,
644
+ "step": 860
645
+ },
646
+ {
647
+ "epoch": 0.4154727793696275,
648
+ "grad_norm": 0.8505418063106487,
649
+ "learning_rate": 7.265494071035401e-06,
650
+ "loss": 0.7461,
651
+ "step": 870
652
+ },
653
+ {
654
+ "epoch": 0.42024832855778416,
655
+ "grad_norm": 0.8254281894571944,
656
+ "learning_rate": 7.19085651319249e-06,
657
+ "loss": 0.7475,
658
+ "step": 880
659
+ },
660
+ {
661
+ "epoch": 0.4250238777459408,
662
+ "grad_norm": 0.7405023384966558,
663
+ "learning_rate": 7.115609780702767e-06,
664
+ "loss": 0.7485,
665
+ "step": 890
666
+ },
667
+ {
668
+ "epoch": 0.4297994269340974,
669
+ "grad_norm": 0.8069701422330396,
670
+ "learning_rate": 7.039774796160391e-06,
671
+ "loss": 0.7502,
672
+ "step": 900
673
+ },
674
+ {
675
+ "epoch": 0.43457497612225404,
676
+ "grad_norm": 0.7893053819873398,
677
+ "learning_rate": 6.9633726457248864e-06,
678
+ "loss": 0.7307,
679
+ "step": 910
680
+ },
681
+ {
682
+ "epoch": 0.4393505253104107,
683
+ "grad_norm": 0.8684426658728022,
684
+ "learning_rate": 6.886424573258057e-06,
685
+ "loss": 0.7407,
686
+ "step": 920
687
+ },
688
+ {
689
+ "epoch": 0.44412607449856734,
690
+ "grad_norm": 0.8806215484013901,
691
+ "learning_rate": 6.808951974417077e-06,
692
+ "loss": 0.7232,
693
+ "step": 930
694
+ },
695
+ {
696
+ "epoch": 0.448901623686724,
697
+ "grad_norm": 1.0078825768218607,
698
+ "learning_rate": 6.73097639070535e-06,
699
+ "loss": 0.7217,
700
+ "step": 940
701
+ },
702
+ {
703
+ "epoch": 0.45367717287488063,
704
+ "grad_norm": 0.820169166397048,
705
+ "learning_rate": 6.652519503482829e-06,
706
+ "loss": 0.7275,
707
+ "step": 950
708
+ },
709
+ {
710
+ "epoch": 0.4584527220630373,
711
+ "grad_norm": 0.9615211451349941,
712
+ "learning_rate": 6.573603127937443e-06,
713
+ "loss": 0.7244,
714
+ "step": 960
715
+ },
716
+ {
717
+ "epoch": 0.46322827125119387,
718
+ "grad_norm": 0.9105145919711279,
719
+ "learning_rate": 6.494249207019317e-06,
720
+ "loss": 0.7184,
721
+ "step": 970
722
+ },
723
+ {
724
+ "epoch": 0.4680038204393505,
725
+ "grad_norm": 0.8720579325571185,
726
+ "learning_rate": 6.414479805339465e-06,
727
+ "loss": 0.6887,
728
+ "step": 980
729
+ },
730
+ {
731
+ "epoch": 0.47277936962750716,
732
+ "grad_norm": 0.8869722170599964,
733
+ "learning_rate": 6.3343171030346525e-06,
734
+ "loss": 0.6858,
735
+ "step": 990
736
+ },
737
+ {
738
+ "epoch": 0.4775549188156638,
739
+ "grad_norm": 0.8820354385866871,
740
+ "learning_rate": 6.253783389600136e-06,
741
+ "loss": 0.7073,
742
+ "step": 1000
743
+ },
744
+ {
745
+ "epoch": 0.4775549188156638,
746
+ "eval_loss": 0.6407110095024109,
747
+ "eval_runtime": 534.1565,
748
+ "eval_samples_per_second": 27.876,
749
+ "eval_steps_per_second": 3.486,
750
+ "step": 1000
751
+ },
752
+ {
753
+ "epoch": 0.48233046800382046,
754
+ "grad_norm": 0.9876137273298787,
755
+ "learning_rate": 6.172901057692007e-06,
756
+ "loss": 0.7207,
757
+ "step": 1010
758
+ },
759
+ {
760
+ "epoch": 0.4871060171919771,
761
+ "grad_norm": 0.8646444637817178,
762
+ "learning_rate": 6.0916925969008275e-06,
763
+ "loss": 0.7363,
764
+ "step": 1020
765
+ },
766
+ {
767
+ "epoch": 0.4918815663801337,
768
+ "grad_norm": 0.8847812706441835,
769
+ "learning_rate": 6.010180587498347e-06,
770
+ "loss": 0.6729,
771
+ "step": 1030
772
+ },
773
+ {
774
+ "epoch": 0.49665711556829034,
775
+ "grad_norm": 0.7915082131571504,
776
+ "learning_rate": 5.928387694158968e-06,
777
+ "loss": 0.6956,
778
+ "step": 1040
779
+ },
780
+ {
781
+ "epoch": 0.501432664756447,
782
+ "grad_norm": 1.046517690383018,
783
+ "learning_rate": 5.8463366596577706e-06,
784
+ "loss": 0.6896,
785
+ "step": 1050
786
+ },
787
+ {
788
+ "epoch": 0.5062082139446036,
789
+ "grad_norm": 0.8847177405317377,
790
+ "learning_rate": 5.764050298546808e-06,
791
+ "loss": 0.6861,
792
+ "step": 1060
793
+ },
794
+ {
795
+ "epoch": 0.5109837631327603,
796
+ "grad_norm": 0.8672793826369924,
797
+ "learning_rate": 5.68155149081145e-06,
798
+ "loss": 0.6762,
799
+ "step": 1070
800
+ },
801
+ {
802
+ "epoch": 0.5157593123209169,
803
+ "grad_norm": 0.9340453072759527,
804
+ "learning_rate": 5.598863175508526e-06,
805
+ "loss": 0.6717,
806
+ "step": 1080
807
+ },
808
+ {
809
+ "epoch": 0.5205348615090736,
810
+ "grad_norm": 0.8259949274241196,
811
+ "learning_rate": 5.516008344388053e-06,
812
+ "loss": 0.6825,
813
+ "step": 1090
814
+ },
815
+ {
816
+ "epoch": 0.5253104106972302,
817
+ "grad_norm": 0.9029867272763249,
818
+ "learning_rate": 5.433010035500299e-06,
819
+ "loss": 0.6771,
820
+ "step": 1100
821
+ },
822
+ {
823
+ "epoch": 0.5300859598853869,
824
+ "grad_norm": 0.800953127657272,
825
+ "learning_rate": 5.3498913267899864e-06,
826
+ "loss": 0.674,
827
+ "step": 1110
828
+ },
829
+ {
830
+ "epoch": 0.5348615090735435,
831
+ "grad_norm": 0.9245770325043161,
832
+ "learning_rate": 5.2666753296793895e-06,
833
+ "loss": 0.6662,
834
+ "step": 1120
835
+ },
836
+ {
837
+ "epoch": 0.5396370582617,
838
+ "grad_norm": 0.8502150053204679,
839
+ "learning_rate": 5.183385182642136e-06,
840
+ "loss": 0.6765,
841
+ "step": 1130
842
+ },
843
+ {
844
+ "epoch": 0.5444126074498568,
845
+ "grad_norm": 1.5468980444160696,
846
+ "learning_rate": 5.100044044769472e-06,
847
+ "loss": 0.6682,
848
+ "step": 1140
849
+ },
850
+ {
851
+ "epoch": 0.5491881566380133,
852
+ "grad_norm": 1.0568604331225828,
853
+ "learning_rate": 5.016675089330817e-06,
854
+ "loss": 0.6583,
855
+ "step": 1150
856
+ },
857
+ {
858
+ "epoch": 0.55396370582617,
859
+ "grad_norm": 0.9277335589404493,
860
+ "learning_rate": 4.933301497330344e-06,
861
+ "loss": 0.6456,
862
+ "step": 1160
863
+ },
864
+ {
865
+ "epoch": 0.5587392550143266,
866
+ "grad_norm": 0.9015066761493195,
867
+ "learning_rate": 4.849946451061444e-06,
868
+ "loss": 0.673,
869
+ "step": 1170
870
+ },
871
+ {
872
+ "epoch": 0.5635148042024832,
873
+ "grad_norm": 0.916248819386327,
874
+ "learning_rate": 4.766633127660805e-06,
875
+ "loss": 0.6372,
876
+ "step": 1180
877
+ },
878
+ {
879
+ "epoch": 0.5682903533906399,
880
+ "grad_norm": 0.9944122764504796,
881
+ "learning_rate": 4.683384692663937e-06,
882
+ "loss": 0.6352,
883
+ "step": 1190
884
+ },
885
+ {
886
+ "epoch": 0.5730659025787965,
887
+ "grad_norm": 0.8995871939871956,
888
+ "learning_rate": 4.600224293563926e-06,
889
+ "loss": 0.6143,
890
+ "step": 1200
891
+ },
892
+ {
893
+ "epoch": 0.5730659025787965,
894
+ "eval_loss": 0.5672308206558228,
895
+ "eval_runtime": 534.5729,
896
+ "eval_samples_per_second": 27.854,
897
+ "eval_steps_per_second": 3.483,
898
+ "step": 1200
899
+ },
900
+ {
901
+ "epoch": 0.5778414517669532,
902
+ "grad_norm": 0.9029828803944917,
903
+ "learning_rate": 4.517175053375191e-06,
904
+ "loss": 0.6482,
905
+ "step": 1210
906
+ },
907
+ {
908
+ "epoch": 0.5826170009551098,
909
+ "grad_norm": 0.9997552015425194,
910
+ "learning_rate": 4.434260064204067e-06,
911
+ "loss": 0.6244,
912
+ "step": 1220
913
+ },
914
+ {
915
+ "epoch": 0.5873925501432665,
916
+ "grad_norm": 0.8505066146958208,
917
+ "learning_rate": 4.351502380827959e-06,
918
+ "loss": 0.6231,
919
+ "step": 1230
920
+ },
921
+ {
922
+ "epoch": 0.5921680993314231,
923
+ "grad_norm": 0.9601951382006098,
924
+ "learning_rate": 4.268925014284898e-06,
925
+ "loss": 0.6515,
926
+ "step": 1240
927
+ },
928
+ {
929
+ "epoch": 0.5969436485195797,
930
+ "grad_norm": 0.858109652878112,
931
+ "learning_rate": 4.18655092547524e-06,
932
+ "loss": 0.6027,
933
+ "step": 1250
934
+ },
935
+ {
936
+ "epoch": 0.6017191977077364,
937
+ "grad_norm": 1.0279381548988882,
938
+ "learning_rate": 4.104403018777323e-06,
939
+ "loss": 0.636,
940
+ "step": 1260
941
+ },
942
+ {
943
+ "epoch": 0.606494746895893,
944
+ "grad_norm": 0.8684044204496176,
945
+ "learning_rate": 4.022504135678822e-06,
946
+ "loss": 0.6356,
947
+ "step": 1270
948
+ },
949
+ {
950
+ "epoch": 0.6112702960840497,
951
+ "grad_norm": 1.2002839065266542,
952
+ "learning_rate": 3.94087704842561e-06,
953
+ "loss": 0.6303,
954
+ "step": 1280
955
+ },
956
+ {
957
+ "epoch": 0.6160458452722063,
958
+ "grad_norm": 1.0212819754078601,
959
+ "learning_rate": 3.859544453689853e-06,
960
+ "loss": 0.6181,
961
+ "step": 1290
962
+ },
963
+ {
964
+ "epoch": 0.620821394460363,
965
+ "grad_norm": 1.1643909557826269,
966
+ "learning_rate": 3.778528966259137e-06,
967
+ "loss": 0.6075,
968
+ "step": 1300
969
+ },
970
+ {
971
+ "epoch": 0.6255969436485196,
972
+ "grad_norm": 0.8318901215082086,
973
+ "learning_rate": 3.697853112748345e-06,
974
+ "loss": 0.6106,
975
+ "step": 1310
976
+ },
977
+ {
978
+ "epoch": 0.6303724928366762,
979
+ "grad_norm": 0.9063102495466279,
980
+ "learning_rate": 3.6175393253360704e-06,
981
+ "loss": 0.599,
982
+ "step": 1320
983
+ },
984
+ {
985
+ "epoch": 0.6351480420248329,
986
+ "grad_norm": 0.9567097209608001,
987
+ "learning_rate": 3.537609935527264e-06,
988
+ "loss": 0.5996,
989
+ "step": 1330
990
+ },
991
+ {
992
+ "epoch": 0.6399235912129895,
993
+ "grad_norm": 0.939453389364599,
994
+ "learning_rate": 3.458087167943905e-06,
995
+ "loss": 0.5867,
996
+ "step": 1340
997
+ },
998
+ {
999
+ "epoch": 0.6446991404011462,
1000
+ "grad_norm": 0.9944415765925527,
1001
+ "learning_rate": 3.3789931341453564e-06,
1002
+ "loss": 0.614,
1003
+ "step": 1350
1004
+ },
1005
+ {
1006
+ "epoch": 0.6494746895893028,
1007
+ "grad_norm": 0.8911567397377756,
1008
+ "learning_rate": 3.3003498264801915e-06,
1009
+ "loss": 0.5858,
1010
+ "step": 1360
1011
+ },
1012
+ {
1013
+ "epoch": 0.6542502387774594,
1014
+ "grad_norm": 0.9190740572643366,
1015
+ "learning_rate": 3.2221791119711372e-06,
1016
+ "loss": 0.6073,
1017
+ "step": 1370
1018
+ },
1019
+ {
1020
+ "epoch": 0.6590257879656161,
1021
+ "grad_norm": 0.8722067899669511,
1022
+ "learning_rate": 3.144502726234889e-06,
1023
+ "loss": 0.598,
1024
+ "step": 1380
1025
+ },
1026
+ {
1027
+ "epoch": 0.6638013371537727,
1028
+ "grad_norm": 0.8704883954915125,
1029
+ "learning_rate": 3.067342267438446e-06,
1030
+ "loss": 0.5864,
1031
+ "step": 1390
1032
+ },
1033
+ {
1034
+ "epoch": 0.6685768863419294,
1035
+ "grad_norm": 0.9586746506286237,
1036
+ "learning_rate": 2.9907191902936773e-06,
1037
+ "loss": 0.5726,
1038
+ "step": 1400
1039
+ },
1040
+ {
1041
+ "epoch": 0.6685768863419294,
1042
+ "eval_loss": 0.5096372961997986,
1043
+ "eval_runtime": 534.0593,
1044
+ "eval_samples_per_second": 27.881,
1045
+ "eval_steps_per_second": 3.487,
1046
+ "step": 1400
1047
+ },
1048
+ {
1049
+ "epoch": 0.673352435530086,
1050
+ "grad_norm": 0.9771151805675299,
1051
+ "learning_rate": 2.914654800091768e-06,
1052
+ "loss": 0.5678,
1053
+ "step": 1410
1054
+ },
1055
+ {
1056
+ "epoch": 0.6781279847182426,
1057
+ "grad_norm": 0.9844163415808749,
1058
+ "learning_rate": 2.8391702467792137e-06,
1059
+ "loss": 0.5875,
1060
+ "step": 1420
1061
+ },
1062
+ {
1063
+ "epoch": 0.6829035339063992,
1064
+ "grad_norm": 0.936929121667794,
1065
+ "learning_rate": 2.764286519077014e-06,
1066
+ "loss": 0.5745,
1067
+ "step": 1430
1068
+ },
1069
+ {
1070
+ "epoch": 0.6876790830945558,
1071
+ "grad_norm": 0.9581940513551886,
1072
+ "learning_rate": 2.6900244386446903e-06,
1073
+ "loss": 0.5748,
1074
+ "step": 1440
1075
+ },
1076
+ {
1077
+ "epoch": 0.6924546322827125,
1078
+ "grad_norm": 0.9382097505155865,
1079
+ "learning_rate": 2.616404654290752e-06,
1080
+ "loss": 0.582,
1081
+ "step": 1450
1082
+ },
1083
+ {
1084
+ "epoch": 0.6972301814708691,
1085
+ "grad_norm": 0.9458807920061071,
1086
+ "learning_rate": 2.5434476362312375e-06,
1087
+ "loss": 0.5859,
1088
+ "step": 1460
1089
+ },
1090
+ {
1091
+ "epoch": 0.7020057306590258,
1092
+ "grad_norm": 0.8536247554325601,
1093
+ "learning_rate": 2.4711736703979015e-06,
1094
+ "loss": 0.5778,
1095
+ "step": 1470
1096
+ },
1097
+ {
1098
+ "epoch": 0.7067812798471824,
1099
+ "grad_norm": 0.8896142850001317,
1100
+ "learning_rate": 2.399602852797647e-06,
1101
+ "loss": 0.5833,
1102
+ "step": 1480
1103
+ },
1104
+ {
1105
+ "epoch": 0.711556829035339,
1106
+ "grad_norm": 0.9369088545555486,
1107
+ "learning_rate": 2.3287550839247625e-06,
1108
+ "loss": 0.5677,
1109
+ "step": 1490
1110
+ },
1111
+ {
1112
+ "epoch": 0.7163323782234957,
1113
+ "grad_norm": 0.9352682466004876,
1114
+ "learning_rate": 2.2586500632275333e-06,
1115
+ "loss": 0.5501,
1116
+ "step": 1500
1117
+ },
1118
+ {
1119
+ "epoch": 0.7211079274116523,
1120
+ "grad_norm": 0.9291292330708577,
1121
+ "learning_rate": 2.1893072836307433e-06,
1122
+ "loss": 0.5432,
1123
+ "step": 1510
1124
+ },
1125
+ {
1126
+ "epoch": 0.725883476599809,
1127
+ "grad_norm": 1.1278542414631672,
1128
+ "learning_rate": 2.1207460261156066e-06,
1129
+ "loss": 0.6017,
1130
+ "step": 1520
1131
+ },
1132
+ {
1133
+ "epoch": 0.7306590257879656,
1134
+ "grad_norm": 0.8496342199267922,
1135
+ "learning_rate": 2.052985354358622e-06,
1136
+ "loss": 0.5361,
1137
+ "step": 1530
1138
+ },
1139
+ {
1140
+ "epoch": 0.7354345749761223,
1141
+ "grad_norm": 0.8448590719351696,
1142
+ "learning_rate": 1.986044109430869e-06,
1143
+ "loss": 0.544,
1144
+ "step": 1540
1145
+ },
1146
+ {
1147
+ "epoch": 0.7402101241642789,
1148
+ "grad_norm": 1.0014560087074114,
1149
+ "learning_rate": 1.91994090455918e-06,
1150
+ "loss": 0.5544,
1151
+ "step": 1550
1152
+ },
1153
+ {
1154
+ "epoch": 0.7449856733524355,
1155
+ "grad_norm": 0.9943362148840331,
1156
+ "learning_rate": 1.8546941199506752e-06,
1157
+ "loss": 0.5743,
1158
+ "step": 1560
1159
+ },
1160
+ {
1161
+ "epoch": 0.7497612225405922,
1162
+ "grad_norm": 0.9488632116893986,
1163
+ "learning_rate": 1.790321897682083e-06,
1164
+ "loss": 0.5516,
1165
+ "step": 1570
1166
+ },
1167
+ {
1168
+ "epoch": 0.7545367717287488,
1169
+ "grad_norm": 0.9282545781122443,
1170
+ "learning_rate": 1.7268421366552851e-06,
1171
+ "loss": 0.5598,
1172
+ "step": 1580
1173
+ },
1174
+ {
1175
+ "epoch": 0.7593123209169055,
1176
+ "grad_norm": 0.893009147729329,
1177
+ "learning_rate": 1.6642724876204658e-06,
1178
+ "loss": 0.5457,
1179
+ "step": 1590
1180
+ },
1181
+ {
1182
+ "epoch": 0.7640878701050621,
1183
+ "grad_norm": 0.8952401614954113,
1184
+ "learning_rate": 1.602630348268267e-06,
1185
+ "loss": 0.5623,
1186
+ "step": 1600
1187
+ },
1188
+ {
1189
+ "epoch": 0.7640878701050621,
1190
+ "eval_loss": 0.46827441453933716,
1191
+ "eval_runtime": 534.0896,
1192
+ "eval_samples_per_second": 27.879,
1193
+ "eval_steps_per_second": 3.486,
1194
+ "step": 1600
1195
+ },
1196
+ {
1197
+ "epoch": 0.7688634192932188,
1198
+ "grad_norm": 0.9378147502660392,
1199
+ "learning_rate": 1.541932858392296e-06,
1200
+ "loss": 0.5522,
1201
+ "step": 1610
1202
+ },
1203
+ {
1204
+ "epoch": 0.7736389684813754,
1205
+ "grad_norm": 0.8793552837932004,
1206
+ "learning_rate": 1.482196895123364e-06,
1207
+ "loss": 0.5321,
1208
+ "step": 1620
1209
+ },
1210
+ {
1211
+ "epoch": 0.778414517669532,
1212
+ "grad_norm": 0.8650313644836122,
1213
+ "learning_rate": 1.423439068236736e-06,
1214
+ "loss": 0.5789,
1215
+ "step": 1630
1216
+ },
1217
+ {
1218
+ "epoch": 0.7831900668576887,
1219
+ "grad_norm": 0.9837452596609937,
1220
+ "learning_rate": 1.3656757155337413e-06,
1221
+ "loss": 0.5628,
1222
+ "step": 1640
1223
+ },
1224
+ {
1225
+ "epoch": 0.7879656160458453,
1226
+ "grad_norm": 1.0526446379669654,
1227
+ "learning_rate": 1.3089228982989771e-06,
1228
+ "loss": 0.5139,
1229
+ "step": 1650
1230
+ },
1231
+ {
1232
+ "epoch": 0.792741165234002,
1233
+ "grad_norm": 0.9071660119485095,
1234
+ "learning_rate": 1.2531963968344346e-06,
1235
+ "loss": 0.5229,
1236
+ "step": 1660
1237
+ },
1238
+ {
1239
+ "epoch": 0.7975167144221585,
1240
+ "grad_norm": 0.8509970739173898,
1241
+ "learning_rate": 1.1985117060717278e-06,
1242
+ "loss": 0.5184,
1243
+ "step": 1670
1244
+ },
1245
+ {
1246
+ "epoch": 0.8022922636103151,
1247
+ "grad_norm": 0.8689645530199235,
1248
+ "learning_rate": 1.1448840312636812e-06,
1249
+ "loss": 0.5248,
1250
+ "step": 1680
1251
+ },
1252
+ {
1253
+ "epoch": 0.8070678127984718,
1254
+ "grad_norm": 0.8888334350846111,
1255
+ "learning_rate": 1.0923282837564537e-06,
1256
+ "loss": 0.5451,
1257
+ "step": 1690
1258
+ },
1259
+ {
1260
+ "epoch": 0.8118433619866284,
1261
+ "grad_norm": 1.0944895277253541,
1262
+ "learning_rate": 1.0408590768434018e-06,
1263
+ "loss": 0.522,
1264
+ "step": 1700
1265
+ },
1266
+ {
1267
+ "epoch": 0.8166189111747851,
1268
+ "grad_norm": 1.0504131852344616,
1269
+ "learning_rate": 9.904907217018e-07,
1270
+ "loss": 0.5143,
1271
+ "step": 1710
1272
+ },
1273
+ {
1274
+ "epoch": 0.8213944603629417,
1275
+ "grad_norm": 0.9813714332571194,
1276
+ "learning_rate": 9.412372234135753e-07,
1277
+ "loss": 0.5339,
1278
+ "step": 1720
1279
+ },
1280
+ {
1281
+ "epoch": 0.8261700095510984,
1282
+ "grad_norm": 0.8392110216415885,
1283
+ "learning_rate": 8.931122770711425e-07,
1284
+ "loss": 0.5326,
1285
+ "step": 1730
1286
+ },
1287
+ {
1288
+ "epoch": 0.830945558739255,
1289
+ "grad_norm": 0.883891691537776,
1290
+ "learning_rate": 8.461292639694519e-07,
1291
+ "loss": 0.5308,
1292
+ "step": 1740
1293
+ },
1294
+ {
1295
+ "epoch": 0.8357211079274116,
1296
+ "grad_norm": 0.9330631339104432,
1297
+ "learning_rate": 8.003012478852679e-07,
1298
+ "loss": 0.4943,
1299
+ "step": 1750
1300
+ },
1301
+ {
1302
+ "epoch": 0.8404966571155683,
1303
+ "grad_norm": 0.9077272187489582,
1304
+ "learning_rate": 7.556409714447488e-07,
1305
+ "loss": 0.5474,
1306
+ "step": 1760
1307
+ },
1308
+ {
1309
+ "epoch": 0.8452722063037249,
1310
+ "grad_norm": 0.8412019707689536,
1311
+ "learning_rate": 7.121608525803142e-07,
1312
+ "loss": 0.5301,
1313
+ "step": 1770
1314
+ },
1315
+ {
1316
+ "epoch": 0.8500477554918816,
1317
+ "grad_norm": 1.0343479774517594,
1318
+ "learning_rate": 6.698729810778065e-07,
1319
+ "loss": 0.5302,
1320
+ "step": 1780
1321
+ },
1322
+ {
1323
+ "epoch": 0.8548233046800382,
1324
+ "grad_norm": 0.9838478459223126,
1325
+ "learning_rate": 6.287891152148823e-07,
1326
+ "loss": 0.5075,
1327
+ "step": 1790
1328
+ },
1329
+ {
1330
+ "epoch": 0.8595988538681948,
1331
+ "grad_norm": 1.009194175526722,
1332
+ "learning_rate": 5.889206784915863e-07,
1333
+ "loss": 0.5206,
1334
+ "step": 1800
1335
+ },
1336
+ {
1337
+ "epoch": 0.8595988538681948,
1338
+ "eval_loss": 0.44646286964416504,
1339
+ "eval_runtime": 534.2652,
1340
+ "eval_samples_per_second": 27.87,
1341
+ "eval_steps_per_second": 3.485,
1342
+ "step": 1800
1343
+ },
1344
+ {
1345
+ "epoch": 0.8643744030563515,
1346
+ "grad_norm": 0.8387985123574973,
1347
+ "learning_rate": 5.502787564540102e-07,
1348
+ "loss": 0.5305,
1349
+ "step": 1810
1350
+ },
1351
+ {
1352
+ "epoch": 0.8691499522445081,
1353
+ "grad_norm": 0.9238833740564283,
1354
+ "learning_rate": 5.128740936119242e-07,
1355
+ "loss": 0.5115,
1356
+ "step": 1820
1357
+ },
1358
+ {
1359
+ "epoch": 0.8739255014326648,
1360
+ "grad_norm": 0.8643626924619122,
1361
+ "learning_rate": 4.7671709045122914e-07,
1362
+ "loss": 0.501,
1363
+ "step": 1830
1364
+ },
1365
+ {
1366
+ "epoch": 0.8787010506208214,
1367
+ "grad_norm": 0.8843512593283425,
1368
+ "learning_rate": 4.4181780054206925e-07,
1369
+ "loss": 0.5316,
1370
+ "step": 1840
1371
+ },
1372
+ {
1373
+ "epoch": 0.8834765998089781,
1374
+ "grad_norm": 1.0280116743208123,
1375
+ "learning_rate": 4.081859277434025e-07,
1376
+ "loss": 0.5084,
1377
+ "step": 1850
1378
+ },
1379
+ {
1380
+ "epoch": 0.8882521489971347,
1381
+ "grad_norm": 0.9217334180362886,
1382
+ "learning_rate": 3.758308235048158e-07,
1383
+ "loss": 0.4988,
1384
+ "step": 1860
1385
+ },
1386
+ {
1387
+ "epoch": 0.8930276981852913,
1388
+ "grad_norm": 0.9278902432782374,
1389
+ "learning_rate": 3.4476148426632215e-07,
1390
+ "loss": 0.5248,
1391
+ "step": 1870
1392
+ },
1393
+ {
1394
+ "epoch": 0.897803247373448,
1395
+ "grad_norm": 0.8498974666627348,
1396
+ "learning_rate": 3.1498654895687095e-07,
1397
+ "loss": 0.5263,
1398
+ "step": 1880
1399
+ },
1400
+ {
1401
+ "epoch": 0.9025787965616046,
1402
+ "grad_norm": 0.914856710615246,
1403
+ "learning_rate": 2.8651429659226906e-07,
1404
+ "loss": 0.5129,
1405
+ "step": 1890
1406
+ },
1407
+ {
1408
+ "epoch": 0.9073543457497613,
1409
+ "grad_norm": 0.9485264410476115,
1410
+ "learning_rate": 2.593526439731697e-07,
1411
+ "loss": 0.5033,
1412
+ "step": 1900
1413
+ },
1414
+ {
1415
+ "epoch": 0.9121298949379179,
1416
+ "grad_norm": 0.922810539225268,
1417
+ "learning_rate": 2.3350914348378606e-07,
1418
+ "loss": 0.5157,
1419
+ "step": 1910
1420
+ },
1421
+ {
1422
+ "epoch": 0.9169054441260746,
1423
+ "grad_norm": 0.956199326320254,
1424
+ "learning_rate": 2.0899098099192273e-07,
1425
+ "loss": 0.5158,
1426
+ "step": 1920
1427
+ },
1428
+ {
1429
+ "epoch": 0.9216809933142311,
1430
+ "grad_norm": 0.8741702574957524,
1431
+ "learning_rate": 1.8580497385092376e-07,
1432
+ "loss": 0.5145,
1433
+ "step": 1930
1434
+ },
1435
+ {
1436
+ "epoch": 0.9264565425023877,
1437
+ "grad_norm": 0.9941012086649309,
1438
+ "learning_rate": 1.6395756900408454e-07,
1439
+ "loss": 0.5321,
1440
+ "step": 1940
1441
+ },
1442
+ {
1443
+ "epoch": 0.9312320916905444,
1444
+ "grad_norm": 0.8582163500365767,
1445
+ "learning_rate": 1.4345484119206222e-07,
1446
+ "loss": 0.5065,
1447
+ "step": 1950
1448
+ },
1449
+ {
1450
+ "epoch": 0.936007640878701,
1451
+ "grad_norm": 0.9879634600102223,
1452
+ "learning_rate": 1.2430249126376913e-07,
1453
+ "loss": 0.54,
1454
+ "step": 1960
1455
+ },
1456
+ {
1457
+ "epoch": 0.9407831900668577,
1458
+ "grad_norm": 0.9616760638465843,
1459
+ "learning_rate": 1.065058445912398e-07,
1460
+ "loss": 0.5084,
1461
+ "step": 1970
1462
+ },
1463
+ {
1464
+ "epoch": 0.9455587392550143,
1465
+ "grad_norm": 1.0543325130204897,
1466
+ "learning_rate": 9.006984958888742e-08,
1467
+ "loss": 0.527,
1468
+ "step": 1980
1469
+ },
1470
+ {
1471
+ "epoch": 0.9503342884431709,
1472
+ "grad_norm": 0.8916780720938148,
1473
+ "learning_rate": 7.499907633758797e-08,
1474
+ "loss": 0.4929,
1475
+ "step": 1990
1476
+ },
1477
+ {
1478
+ "epoch": 0.9551098376313276,
1479
+ "grad_norm": 0.8743144782771384,
1480
+ "learning_rate": 6.129771531395045e-08,
1481
+ "loss": 0.5054,
1482
+ "step": 2000
1483
+ },
1484
+ {
1485
+ "epoch": 0.9551098376313276,
1486
+ "eval_loss": 0.43961772322654724,
1487
+ "eval_runtime": 534.3343,
1488
+ "eval_samples_per_second": 27.866,
1489
+ "eval_steps_per_second": 3.485,
1490
+ "step": 2000
1491
+ },
1492
+ {
1493
+ "epoch": 0.9598853868194842,
1494
+ "grad_norm": 0.9030091469256598,
1495
+ "learning_rate": 4.896957622514298e-08,
1496
+ "loss": 0.4983,
1497
+ "step": 2010
1498
+ },
1499
+ {
1500
+ "epoch": 0.9646609360076409,
1501
+ "grad_norm": 0.9300966338414236,
1502
+ "learning_rate": 3.801808694959053e-08,
1503
+ "loss": 0.5219,
1504
+ "step": 2020
1505
+ },
1506
+ {
1507
+ "epoch": 0.9694364851957975,
1508
+ "grad_norm": 0.9956560344691349,
1509
+ "learning_rate": 2.8446292583844126e-08,
1510
+ "loss": 0.5397,
1511
+ "step": 2030
1512
+ },
1513
+ {
1514
+ "epoch": 0.9742120343839542,
1515
+ "grad_norm": 0.944281751372382,
1516
+ "learning_rate": 2.025685459588145e-08,
1517
+ "loss": 0.525,
1518
+ "step": 2040
1519
+ },
1520
+ {
1521
+ "epoch": 0.9789875835721108,
1522
+ "grad_norm": 0.8939170547532904,
1523
+ "learning_rate": 1.3452050085075441e-08,
1524
+ "loss": 0.5086,
1525
+ "step": 2050
1526
+ },
1527
+ {
1528
+ "epoch": 0.9837631327602674,
1529
+ "grad_norm": 0.8895914526936296,
1530
+ "learning_rate": 8.033771149041913e-09,
1531
+ "loss": 0.5122,
1532
+ "step": 2060
1533
+ },
1534
+ {
1535
+ "epoch": 0.9885386819484241,
1536
+ "grad_norm": 0.8484510879585117,
1537
+ "learning_rate": 4.003524357534261e-09,
1538
+ "loss": 0.5168,
1539
+ "step": 2070
1540
+ },
1541
+ {
1542
+ "epoch": 0.9933142311365807,
1543
+ "grad_norm": 0.9676882963530836,
1544
+ "learning_rate": 1.3624303335380006e-09,
1545
+ "loss": 0.5155,
1546
+ "step": 2080
1547
+ },
1548
+ {
1549
+ "epoch": 0.9980897803247374,
1550
+ "grad_norm": 0.8272201128188289,
1551
+ "learning_rate": 1.1122344167613374e-10,
1552
+ "loss": 0.5051,
1553
+ "step": 2090
1554
+ },
1555
+ {
1556
+ "epoch": 1.0,
1557
+ "step": 2094,
1558
+ "total_flos": 985274787299328.0,
1559
+ "train_loss": 0.7358792713970487,
1560
+ "train_runtime": 28849.9475,
1561
+ "train_samples_per_second": 4.645,
1562
+ "train_steps_per_second": 0.073
1563
+ }
1564
+ ],
1565
+ "logging_steps": 10,
1566
+ "max_steps": 2094,
1567
+ "num_input_tokens_seen": 0,
1568
+ "num_train_epochs": 1,
1569
+ "save_steps": 200,
1570
+ "stateful_callbacks": {
1571
+ "TrainerControl": {
1572
+ "args": {
1573
+ "should_epoch_stop": false,
1574
+ "should_evaluate": false,
1575
+ "should_log": false,
1576
+ "should_save": true,
1577
+ "should_training_stop": true
1578
+ },
1579
+ "attributes": {}
1580
+ }
1581
+ },
1582
+ "total_flos": 985274787299328.0,
1583
+ "train_batch_size": 2,
1584
+ "trial_name": null,
1585
+ "trial_params": null
1586
+ }
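The learning_rate values logged above trace a cosine decay after a linear warmup. Below is a minimal sketch that reproduces them, assuming the standard cosine-with-warmup rule and 210 warmup steps (about 10% of the 2094 max_steps recorded above); both the rule and the warmup count are inferred from the logged values, not stated in this file.

```python
import math

# Assumed constants, inferred from this trainer_state.json:
# "max_steps" is logged as 2094, and the logged learning_rate values
# match a cosine-with-warmup schedule with ceil(0.1 * 2094) = 210
# warmup steps decaying from an assumed 1e-5 peak.
BASE_LR = 1e-5        # assumed peak learning rate
MAX_STEPS = 2094      # from "max_steps" above
WARMUP_STEPS = 210    # assumed: ceil(0.1 * MAX_STEPS)

def lr_at(step: int) -> float:
    """Linear warmup followed by cosine decay to zero."""
    if step < WARMUP_STEPS:
        return BASE_LR * step / max(1, WARMUP_STEPS)
    progress = (step - WARMUP_STEPS) / max(1, MAX_STEPS - WARMUP_STEPS)
    return BASE_LR * 0.5 * (1.0 + math.cos(math.pi * progress))

# Spot-check against the log: step 1400 -> ~2.99e-06, step 2090 -> ~1.11e-10.
print(lr_at(1400), lr_at(2090))
```

The spot-check values agree with the logged entries at steps 1400 and 2090 to several significant digits, which supports the assumed schedule.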
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad93b5c94e2609eb9bf91665fab08b3b0f0685612e426e17e0895898cf6681e0
+ size 7480
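The three lines added for training_args.bin are a Git LFS pointer rather than the binary itself; the real file is resolved from the sha256 oid at checkout. A minimal sketch for reading such a pointer (parse_lfs_pointer is a hypothetical helper name):

```python
def parse_lfs_pointer(text: str) -> dict:
    """Split a Git LFS pointer file ('key value' per line) into a dict."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:ad93b5c94e2609eb9bf91665fab08b3b0f0685612e426e17e0895898cf6681e0
size 7480"""
info = parse_lfs_pointer(pointer)
print(info["oid"], info["size"])  # sha256:ad93b5... 7480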
training_eval_loss.png ADDED
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff
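training_loss.png and training_eval_loss.png are binary images, so their content cannot be diffed, but equivalent curves can be regenerated from the trainer state. A minimal sketch, assuming a local copy of the trainer_state.json added in this commit and its standard layout, where the entries shown above live under a log_history key (training entries carry "loss", evaluation entries carry "eval_loss"):

```python
import json
import matplotlib.pyplot as plt

# Assumes trainer_state.json from this commit is in the working directory.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training-step entries log "loss"; evaluation entries log "eval_loss".
train = [(e["epoch"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["epoch"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), marker="o", label="eval loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curves.png")
```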