Allen Poston committed on
Commit
eb2a98c
·
verified ·
1 Parent(s): c1d84f8

Adaptive Senko AI - 30,000 examples, gpt2 base

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. README.md +202 -0
  2. adapter_config.json +32 -0
  3. adapter_model.safetensors +3 -0
  4. checkpoint-11600/README.md +202 -0
  5. checkpoint-11600/adapter_config.json +32 -0
  6. checkpoint-11600/adapter_model.safetensors +3 -0
  7. checkpoint-11600/merges.txt +0 -0
  8. checkpoint-11600/optimizer.pt +3 -0
  9. checkpoint-11600/rng_state.pth +3 -0
  10. checkpoint-11600/scaler.pt +3 -0
  11. checkpoint-11600/scheduler.pt +3 -0
  12. checkpoint-11600/special_tokens_map.json +6 -0
  13. checkpoint-11600/tokenizer.json +0 -0
  14. checkpoint-11600/tokenizer_config.json +21 -0
  15. checkpoint-11600/trainer_state.json +2122 -0
  16. checkpoint-11600/training_args.bin +3 -0
  17. checkpoint-11600/vocab.json +0 -0
  18. checkpoint-11800/README.md +202 -0
  19. checkpoint-11800/adapter_config.json +32 -0
  20. checkpoint-11800/adapter_model.safetensors +3 -0
  21. checkpoint-11800/merges.txt +0 -0
  22. checkpoint-11800/optimizer.pt +3 -0
  23. checkpoint-11800/rng_state.pth +3 -0
  24. checkpoint-11800/scaler.pt +3 -0
  25. checkpoint-11800/scheduler.pt +3 -0
  26. checkpoint-11800/special_tokens_map.json +6 -0
  27. checkpoint-11800/tokenizer.json +0 -0
  28. checkpoint-11800/tokenizer_config.json +21 -0
  29. checkpoint-11800/trainer_state.json +2158 -0
  30. checkpoint-11800/training_args.bin +3 -0
  31. checkpoint-11800/vocab.json +0 -0
  32. checkpoint-11847/README.md +202 -0
  33. checkpoint-11847/adapter_config.json +32 -0
  34. checkpoint-11847/adapter_model.safetensors +3 -0
  35. checkpoint-11847/merges.txt +0 -0
  36. checkpoint-11847/optimizer.pt +3 -0
  37. checkpoint-11847/rng_state.pth +3 -0
  38. checkpoint-11847/scaler.pt +3 -0
  39. checkpoint-11847/scheduler.pt +3 -0
  40. checkpoint-11847/special_tokens_map.json +6 -0
  41. checkpoint-11847/tokenizer.json +0 -0
  42. checkpoint-11847/tokenizer_config.json +21 -0
  43. checkpoint-11847/trainer_state.json +2158 -0
  44. checkpoint-11847/training_args.bin +3 -0
  45. checkpoint-11847/vocab.json +0 -0
  46. merges.txt +0 -0
  47. special_tokens_map.json +6 -0
  48. tokenizer.json +0 -0
  49. tokenizer_config.json +21 -0
  50. training_args.bin +3 -0
README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: gpt2
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "gpt2",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": true,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 8,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "c_proj",
27
+ "c_attn"
28
+ ],
29
+ "task_type": "CAUSAL_LM",
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cbeee3dfc268cbdbc22f727c37945ebd4275a3ea49512bd0ee049c583c3112b
3
+ size 3253104
checkpoint-11600/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: gpt2
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-11600/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "gpt2",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": true,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 8,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "c_proj",
27
+ "c_attn"
28
+ ],
29
+ "task_type": "CAUSAL_LM",
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
checkpoint-11600/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8137c1eda8ebedf087f23f1f39ad57fa24e9b35b3fc584acc19896ff81984421
3
+ size 3253104
checkpoint-11600/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6698b23ae4afc12ca05cd227dc927d2c778fd5abc3ce6f3c2c02a8a7d3b3794
3
+ size 6548858
checkpoint-11600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:752ae17d06428ff7db5e14fdee45efee8a27d6a0dc1859545aba6773ec56480e
3
+ size 14244
checkpoint-11600/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684a2d80447af9cc76005199cfec8e7dbd71d967ea01cae183e7fcace028e157
3
+ size 988
checkpoint-11600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6a219de3badb6cb2e9c9c7b282785dd34d720128250f541f986799be86e14e5
3
+ size 1064
checkpoint-11600/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-11600/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11600/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
checkpoint-11600/trainer_state.json ADDED
@@ -0,0 +1,2122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 11600,
3
+ "best_metric": 2.4329476356506348,
4
+ "best_model_checkpoint": "/kaggle/working/senko_adaptive/checkpoint-11600",
5
+ "epoch": 2.936886750648857,
6
+ "eval_steps": 200,
7
+ "global_step": 11600,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0126606317655251,
14
+ "grad_norm": 0.4274106025695801,
15
+ "learning_rate": 2.067510548523207e-06,
16
+ "loss": 3.4405,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.0253212635310502,
21
+ "grad_norm": 0.5292551517486572,
22
+ "learning_rate": 4.177215189873418e-06,
23
+ "loss": 3.4567,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.0379818952965753,
28
+ "grad_norm": 0.7541739344596863,
29
+ "learning_rate": 6.28691983122363e-06,
30
+ "loss": 3.4683,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.0506425270621004,
35
+ "grad_norm": 0.8833445906639099,
36
+ "learning_rate": 8.39662447257384e-06,
37
+ "loss": 3.5084,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.0506425270621004,
42
+ "eval_loss": 3.5248923301696777,
43
+ "eval_runtime": 39.9384,
44
+ "eval_samples_per_second": 43.968,
45
+ "eval_steps_per_second": 43.968,
46
+ "step": 200
47
+ },
48
+ {
49
+ "epoch": 0.0633031588276255,
50
+ "grad_norm": 0.9998921155929565,
51
+ "learning_rate": 1.0506329113924052e-05,
52
+ "loss": 3.359,
53
+ "step": 250
54
+ },
55
+ {
56
+ "epoch": 0.0759637905931506,
57
+ "grad_norm": 0.8041885495185852,
58
+ "learning_rate": 1.2616033755274262e-05,
59
+ "loss": 3.351,
60
+ "step": 300
61
+ },
62
+ {
63
+ "epoch": 0.0886244223586757,
64
+ "grad_norm": 0.9213416576385498,
65
+ "learning_rate": 1.4725738396624473e-05,
66
+ "loss": 3.2244,
67
+ "step": 350
68
+ },
69
+ {
70
+ "epoch": 0.1012850541242008,
71
+ "grad_norm": 1.0922213792800903,
72
+ "learning_rate": 1.6835443037974685e-05,
73
+ "loss": 3.1565,
74
+ "step": 400
75
+ },
76
+ {
77
+ "epoch": 0.1012850541242008,
78
+ "eval_loss": 3.151216983795166,
79
+ "eval_runtime": 39.7515,
80
+ "eval_samples_per_second": 44.174,
81
+ "eval_steps_per_second": 44.174,
82
+ "step": 400
83
+ },
84
+ {
85
+ "epoch": 0.1139456858897259,
86
+ "grad_norm": 1.4199283123016357,
87
+ "learning_rate": 1.8945147679324897e-05,
88
+ "loss": 3.0154,
89
+ "step": 450
90
+ },
91
+ {
92
+ "epoch": 0.126606317655251,
93
+ "grad_norm": 1.077143907546997,
94
+ "learning_rate": 2.1054852320675106e-05,
95
+ "loss": 3.0456,
96
+ "step": 500
97
+ },
98
+ {
99
+ "epoch": 0.1392669494207761,
100
+ "grad_norm": 1.5466052293777466,
101
+ "learning_rate": 2.3164556962025318e-05,
102
+ "loss": 2.9099,
103
+ "step": 550
104
+ },
105
+ {
106
+ "epoch": 0.1519275811863012,
107
+ "grad_norm": 1.2139467000961304,
108
+ "learning_rate": 2.5274261603375527e-05,
109
+ "loss": 2.8839,
110
+ "step": 600
111
+ },
112
+ {
113
+ "epoch": 0.1519275811863012,
114
+ "eval_loss": 2.793567657470703,
115
+ "eval_runtime": 40.2573,
116
+ "eval_samples_per_second": 43.619,
117
+ "eval_steps_per_second": 43.619,
118
+ "step": 600
119
+ },
120
+ {
121
+ "epoch": 0.1645882129518263,
122
+ "grad_norm": 1.0270315408706665,
123
+ "learning_rate": 2.738396624472574e-05,
124
+ "loss": 2.8389,
125
+ "step": 650
126
+ },
127
+ {
128
+ "epoch": 0.1772488447173514,
129
+ "grad_norm": 1.5865377187728882,
130
+ "learning_rate": 2.949367088607595e-05,
131
+ "loss": 2.8228,
132
+ "step": 700
133
+ },
134
+ {
135
+ "epoch": 0.18990947648287648,
136
+ "grad_norm": 1.076073408126831,
137
+ "learning_rate": 3.160337552742616e-05,
138
+ "loss": 2.9255,
139
+ "step": 750
140
+ },
141
+ {
142
+ "epoch": 0.2025701082484016,
143
+ "grad_norm": 1.4510694742202759,
144
+ "learning_rate": 3.3713080168776376e-05,
145
+ "loss": 2.8165,
146
+ "step": 800
147
+ },
148
+ {
149
+ "epoch": 0.2025701082484016,
150
+ "eval_loss": 2.667301893234253,
151
+ "eval_runtime": 40.0906,
152
+ "eval_samples_per_second": 43.801,
153
+ "eval_steps_per_second": 43.801,
154
+ "step": 800
155
+ },
156
+ {
157
+ "epoch": 0.2152307400139267,
158
+ "grad_norm": 1.5206592082977295,
159
+ "learning_rate": 3.5822784810126585e-05,
160
+ "loss": 2.8022,
161
+ "step": 850
162
+ },
163
+ {
164
+ "epoch": 0.2278913717794518,
165
+ "grad_norm": 1.173909068107605,
166
+ "learning_rate": 3.7932489451476794e-05,
167
+ "loss": 2.8034,
168
+ "step": 900
169
+ },
170
+ {
171
+ "epoch": 0.24055200354497688,
172
+ "grad_norm": 1.4551103115081787,
173
+ "learning_rate": 4.004219409282701e-05,
174
+ "loss": 2.774,
175
+ "step": 950
176
+ },
177
+ {
178
+ "epoch": 0.253212635310502,
179
+ "grad_norm": 1.509749412536621,
180
+ "learning_rate": 4.215189873417722e-05,
181
+ "loss": 2.7978,
182
+ "step": 1000
183
+ },
184
+ {
185
+ "epoch": 0.253212635310502,
186
+ "eval_loss": 2.603116273880005,
187
+ "eval_runtime": 39.9617,
188
+ "eval_samples_per_second": 43.942,
189
+ "eval_steps_per_second": 43.942,
190
+ "step": 1000
191
+ },
192
+ {
193
+ "epoch": 0.2658732670760271,
194
+ "grad_norm": 1.745764136314392,
195
+ "learning_rate": 4.426160337552743e-05,
196
+ "loss": 2.7249,
197
+ "step": 1050
198
+ },
199
+ {
200
+ "epoch": 0.2785338988415522,
201
+ "grad_norm": 1.6712589263916016,
202
+ "learning_rate": 4.637130801687764e-05,
203
+ "loss": 2.7618,
204
+ "step": 1100
205
+ },
206
+ {
207
+ "epoch": 0.2911945306070773,
208
+ "grad_norm": 2.256267786026001,
209
+ "learning_rate": 4.8481012658227845e-05,
210
+ "loss": 2.7431,
211
+ "step": 1150
212
+ },
213
+ {
214
+ "epoch": 0.3038551623726024,
215
+ "grad_norm": 1.5181586742401123,
216
+ "learning_rate": 4.993434627649597e-05,
217
+ "loss": 2.778,
218
+ "step": 1200
219
+ },
220
+ {
221
+ "epoch": 0.3038551623726024,
222
+ "eval_loss": 2.5704379081726074,
223
+ "eval_runtime": 40.1791,
224
+ "eval_samples_per_second": 43.704,
225
+ "eval_steps_per_second": 43.704,
226
+ "step": 1200
227
+ },
228
+ {
229
+ "epoch": 0.31651579413812747,
230
+ "grad_norm": 1.1885608434677124,
231
+ "learning_rate": 4.969986869255299e-05,
232
+ "loss": 2.7224,
233
+ "step": 1250
234
+ },
235
+ {
236
+ "epoch": 0.3291764259036526,
237
+ "grad_norm": 1.2136404514312744,
238
+ "learning_rate": 4.946539110861002e-05,
239
+ "loss": 2.6823,
240
+ "step": 1300
241
+ },
242
+ {
243
+ "epoch": 0.3418370576691777,
244
+ "grad_norm": 0.8780732750892639,
245
+ "learning_rate": 4.9230913524667046e-05,
246
+ "loss": 2.6961,
247
+ "step": 1350
248
+ },
249
+ {
250
+ "epoch": 0.3544976894347028,
251
+ "grad_norm": 1.0844959020614624,
252
+ "learning_rate": 4.899643594072407e-05,
253
+ "loss": 2.7014,
254
+ "step": 1400
255
+ },
256
+ {
257
+ "epoch": 0.3544976894347028,
258
+ "eval_loss": 2.54587721824646,
259
+ "eval_runtime": 39.8803,
260
+ "eval_samples_per_second": 44.032,
261
+ "eval_steps_per_second": 44.032,
262
+ "step": 1400
263
+ },
264
+ {
265
+ "epoch": 0.3671583212002279,
266
+ "grad_norm": 1.3518335819244385,
267
+ "learning_rate": 4.8761958356781096e-05,
268
+ "loss": 2.6393,
269
+ "step": 1450
270
+ },
271
+ {
272
+ "epoch": 0.37981895296575297,
273
+ "grad_norm": 1.1389687061309814,
274
+ "learning_rate": 4.852748077283812e-05,
275
+ "loss": 2.7002,
276
+ "step": 1500
277
+ },
278
+ {
279
+ "epoch": 0.3924795847312781,
280
+ "grad_norm": 1.6295430660247803,
281
+ "learning_rate": 4.829300318889514e-05,
282
+ "loss": 2.6754,
283
+ "step": 1550
284
+ },
285
+ {
286
+ "epoch": 0.4051402164968032,
287
+ "grad_norm": 1.387499451637268,
288
+ "learning_rate": 4.8058525604952173e-05,
289
+ "loss": 2.6853,
290
+ "step": 1600
291
+ },
292
+ {
293
+ "epoch": 0.4051402164968032,
294
+ "eval_loss": 2.5297553539276123,
295
+ "eval_runtime": 39.722,
296
+ "eval_samples_per_second": 44.207,
297
+ "eval_steps_per_second": 44.207,
298
+ "step": 1600
299
+ },
300
+ {
301
+ "epoch": 0.4178008482623283,
302
+ "grad_norm": 1.014020323753357,
303
+ "learning_rate": 4.7824048021009195e-05,
304
+ "loss": 2.7275,
305
+ "step": 1650
306
+ },
307
+ {
308
+ "epoch": 0.4304614800278534,
309
+ "grad_norm": 1.1505990028381348,
310
+ "learning_rate": 4.7589570437066216e-05,
311
+ "loss": 2.6651,
312
+ "step": 1700
313
+ },
314
+ {
315
+ "epoch": 0.4431221117933785,
316
+ "grad_norm": 1.1389458179473877,
317
+ "learning_rate": 4.7355092853123244e-05,
318
+ "loss": 2.6993,
319
+ "step": 1750
320
+ },
321
+ {
322
+ "epoch": 0.4557827435589036,
323
+ "grad_norm": 1.2159587144851685,
324
+ "learning_rate": 4.7120615269180266e-05,
325
+ "loss": 2.7239,
326
+ "step": 1800
327
+ },
328
+ {
329
+ "epoch": 0.4557827435589036,
330
+ "eval_loss": 2.5201079845428467,
331
+ "eval_runtime": 39.8313,
332
+ "eval_samples_per_second": 44.086,
333
+ "eval_steps_per_second": 44.086,
334
+ "step": 1800
335
+ },
336
+ {
337
+ "epoch": 0.4684433753244287,
338
+ "grad_norm": 1.1873971223831177,
339
+ "learning_rate": 4.6886137685237294e-05,
340
+ "loss": 2.6368,
341
+ "step": 1850
342
+ },
343
+ {
344
+ "epoch": 0.48110400708995377,
345
+ "grad_norm": 1.5109103918075562,
346
+ "learning_rate": 4.665166010129432e-05,
347
+ "loss": 2.6827,
348
+ "step": 1900
349
+ },
350
+ {
351
+ "epoch": 0.4937646388554789,
352
+ "grad_norm": 1.9981125593185425,
353
+ "learning_rate": 4.641718251735134e-05,
354
+ "loss": 2.6513,
355
+ "step": 1950
356
+ },
357
+ {
358
+ "epoch": 0.506425270621004,
359
+ "grad_norm": 1.4879294633865356,
360
+ "learning_rate": 4.6182704933408365e-05,
361
+ "loss": 2.6433,
362
+ "step": 2000
363
+ },
364
+ {
365
+ "epoch": 0.506425270621004,
366
+ "eval_loss": 2.5107176303863525,
367
+ "eval_runtime": 40.0643,
368
+ "eval_samples_per_second": 43.83,
369
+ "eval_steps_per_second": 43.83,
370
+ "step": 2000
371
+ },
372
+ {
373
+ "epoch": 0.5190859023865291,
374
+ "grad_norm": 1.2832767963409424,
375
+ "learning_rate": 4.5952916901144253e-05,
376
+ "loss": 2.6225,
377
+ "step": 2050
378
+ },
379
+ {
380
+ "epoch": 0.5317465341520542,
381
+ "grad_norm": 1.2915899753570557,
382
+ "learning_rate": 4.5718439317201275e-05,
383
+ "loss": 2.6592,
384
+ "step": 2100
385
+ },
386
+ {
387
+ "epoch": 0.5444071659175793,
388
+ "grad_norm": 1.229929804801941,
389
+ "learning_rate": 4.54839617332583e-05,
390
+ "loss": 2.6411,
391
+ "step": 2150
392
+ },
393
+ {
394
+ "epoch": 0.5570677976831044,
395
+ "grad_norm": 1.2569608688354492,
396
+ "learning_rate": 4.524948414931533e-05,
397
+ "loss": 2.6436,
398
+ "step": 2200
399
+ },
400
+ {
401
+ "epoch": 0.5570677976831044,
402
+ "eval_loss": 2.504101514816284,
403
+ "eval_runtime": 39.8694,
404
+ "eval_samples_per_second": 44.044,
405
+ "eval_steps_per_second": 44.044,
406
+ "step": 2200
407
+ },
408
+ {
409
+ "epoch": 0.5697284294486294,
410
+ "grad_norm": 1.3688510656356812,
411
+ "learning_rate": 4.501500656537235e-05,
412
+ "loss": 2.6819,
413
+ "step": 2250
414
+ },
415
+ {
416
+ "epoch": 0.5823890612141546,
417
+ "grad_norm": 1.1405905485153198,
418
+ "learning_rate": 4.4780528981429374e-05,
419
+ "loss": 2.6116,
420
+ "step": 2300
421
+ },
422
+ {
423
+ "epoch": 0.5950496929796797,
424
+ "grad_norm": 1.453338861465454,
425
+ "learning_rate": 4.45460513974864e-05,
426
+ "loss": 2.6154,
427
+ "step": 2350
428
+ },
429
+ {
430
+ "epoch": 0.6077103247452048,
431
+ "grad_norm": 1.0401395559310913,
432
+ "learning_rate": 4.431157381354343e-05,
433
+ "loss": 2.6018,
434
+ "step": 2400
435
+ },
436
+ {
437
+ "epoch": 0.6077103247452048,
438
+ "eval_loss": 2.498344898223877,
439
+ "eval_runtime": 39.9496,
440
+ "eval_samples_per_second": 43.955,
441
+ "eval_steps_per_second": 43.955,
442
+ "step": 2400
443
+ },
444
+ {
445
+ "epoch": 0.6203709565107299,
446
+ "grad_norm": 1.4646718502044678,
447
+ "learning_rate": 4.407709622960045e-05,
448
+ "loss": 2.5734,
449
+ "step": 2450
450
+ },
451
+ {
452
+ "epoch": 0.6330315882762549,
453
+ "grad_norm": 1.3828164339065552,
454
+ "learning_rate": 4.384261864565748e-05,
455
+ "loss": 2.6445,
456
+ "step": 2500
457
+ },
458
+ {
459
+ "epoch": 0.6456922200417801,
460
+ "grad_norm": 2.1768596172332764,
461
+ "learning_rate": 4.36081410617145e-05,
462
+ "loss": 2.6618,
463
+ "step": 2550
464
+ },
465
+ {
466
+ "epoch": 0.6583528518073052,
467
+ "grad_norm": 1.6110296249389648,
468
+ "learning_rate": 4.337366347777152e-05,
469
+ "loss": 2.6509,
470
+ "step": 2600
471
+ },
472
+ {
473
+ "epoch": 0.6583528518073052,
474
+ "eval_loss": 2.4937028884887695,
475
+ "eval_runtime": 39.8698,
476
+ "eval_samples_per_second": 44.043,
477
+ "eval_steps_per_second": 44.043,
478
+ "step": 2600
479
+ },
480
+ {
481
+ "epoch": 0.6710134835728303,
482
+ "grad_norm": 1.2363536357879639,
483
+ "learning_rate": 4.313918589382856e-05,
484
+ "loss": 2.6274,
485
+ "step": 2650
486
+ },
487
+ {
488
+ "epoch": 0.6836741153383554,
489
+ "grad_norm": 2.192110538482666,
490
+ "learning_rate": 4.290470830988558e-05,
491
+ "loss": 2.6932,
492
+ "step": 2700
493
+ },
494
+ {
495
+ "epoch": 0.6963347471038804,
496
+ "grad_norm": 1.2024074792861938,
497
+ "learning_rate": 4.26702307259426e-05,
498
+ "loss": 2.6221,
499
+ "step": 2750
500
+ },
501
+ {
502
+ "epoch": 0.7089953788694056,
503
+ "grad_norm": 1.8665797710418701,
504
+ "learning_rate": 4.243575314199963e-05,
505
+ "loss": 2.6313,
506
+ "step": 2800
507
+ },
508
+ {
509
+ "epoch": 0.7089953788694056,
510
+ "eval_loss": 2.4876773357391357,
511
+ "eval_runtime": 40.026,
512
+ "eval_samples_per_second": 43.871,
513
+ "eval_steps_per_second": 43.871,
514
+ "step": 2800
515
+ },
516
+ {
517
+ "epoch": 0.7216560106349307,
518
+ "grad_norm": 1.4088993072509766,
519
+ "learning_rate": 4.220127555805665e-05,
520
+ "loss": 2.5675,
521
+ "step": 2850
522
+ },
523
+ {
524
+ "epoch": 0.7343166424004558,
525
+ "grad_norm": 1.3225140571594238,
526
+ "learning_rate": 4.196679797411368e-05,
527
+ "loss": 2.56,
528
+ "step": 2900
529
+ },
530
+ {
531
+ "epoch": 0.7469772741659809,
532
+ "grad_norm": 1.3416539430618286,
533
+ "learning_rate": 4.1732320390170706e-05,
534
+ "loss": 2.6517,
535
+ "step": 2950
536
+ },
537
+ {
538
+ "epoch": 0.7596379059315059,
539
+ "grad_norm": 1.079567790031433,
540
+ "learning_rate": 4.149784280622773e-05,
541
+ "loss": 2.698,
542
+ "step": 3000
543
+ },
544
+ {
545
+ "epoch": 0.7596379059315059,
546
+ "eval_loss": 2.4842560291290283,
547
+ "eval_runtime": 39.7988,
548
+ "eval_samples_per_second": 44.122,
549
+ "eval_steps_per_second": 44.122,
550
+ "step": 3000
551
+ },
552
+ {
553
+ "epoch": 0.772298537697031,
554
+ "grad_norm": 1.4532116651535034,
555
+ "learning_rate": 4.126336522228475e-05,
556
+ "loss": 2.6232,
557
+ "step": 3050
558
+ },
559
+ {
560
+ "epoch": 0.7849591694625562,
561
+ "grad_norm": 1.5380038022994995,
562
+ "learning_rate": 4.102888763834178e-05,
563
+ "loss": 2.6212,
564
+ "step": 3100
565
+ },
566
+ {
567
+ "epoch": 0.7976198012280813,
568
+ "grad_norm": 1.3965916633605957,
569
+ "learning_rate": 4.0794410054398805e-05,
570
+ "loss": 2.5804,
571
+ "step": 3150
572
+ },
573
+ {
574
+ "epoch": 0.8102804329936064,
575
+ "grad_norm": 1.4798463582992554,
576
+ "learning_rate": 4.0559932470455826e-05,
577
+ "loss": 2.6724,
578
+ "step": 3200
579
+ },
580
+ {
581
+ "epoch": 0.8102804329936064,
582
+ "eval_loss": 2.480894088745117,
583
+ "eval_runtime": 39.9604,
584
+ "eval_samples_per_second": 43.943,
585
+ "eval_steps_per_second": 43.943,
586
+ "step": 3200
587
+ },
588
+ {
589
+ "epoch": 0.8229410647591315,
590
+ "grad_norm": 1.2598360776901245,
591
+ "learning_rate": 4.0325454886512854e-05,
592
+ "loss": 2.6993,
593
+ "step": 3250
594
+ },
595
+ {
596
+ "epoch": 0.8356016965246565,
597
+ "grad_norm": 1.366295576095581,
598
+ "learning_rate": 4.0090977302569876e-05,
599
+ "loss": 2.551,
600
+ "step": 3300
601
+ },
602
+ {
603
+ "epoch": 0.8482623282901817,
604
+ "grad_norm": 1.1827855110168457,
605
+ "learning_rate": 3.98564997186269e-05,
606
+ "loss": 2.6131,
607
+ "step": 3350
608
+ },
609
+ {
610
+ "epoch": 0.8609229600557068,
611
+ "grad_norm": 1.2728627920150757,
612
+ "learning_rate": 3.9622022134683925e-05,
613
+ "loss": 2.6178,
614
+ "step": 3400
615
+ },
616
+ {
617
+ "epoch": 0.8609229600557068,
618
+ "eval_loss": 2.477010726928711,
619
+ "eval_runtime": 40.2504,
620
+ "eval_samples_per_second": 43.627,
621
+ "eval_steps_per_second": 43.627,
622
+ "step": 3400
623
+ },
624
+ {
625
+ "epoch": 0.8735835918212319,
626
+ "grad_norm": 1.341917634010315,
627
+ "learning_rate": 3.938754455074095e-05,
628
+ "loss": 2.5748,
629
+ "step": 3450
630
+ },
631
+ {
632
+ "epoch": 0.886244223586757,
633
+ "grad_norm": 1.4114609956741333,
634
+ "learning_rate": 3.9153066966797975e-05,
635
+ "loss": 2.667,
636
+ "step": 3500
637
+ },
638
+ {
639
+ "epoch": 0.898904855352282,
640
+ "grad_norm": 1.1211490631103516,
641
+ "learning_rate": 3.8918589382855e-05,
642
+ "loss": 2.5671,
643
+ "step": 3550
644
+ },
645
+ {
646
+ "epoch": 0.9115654871178072,
647
+ "grad_norm": 1.4166322946548462,
648
+ "learning_rate": 3.8684111798912024e-05,
649
+ "loss": 2.5945,
650
+ "step": 3600
651
+ },
652
+ {
653
+ "epoch": 0.9115654871178072,
654
+ "eval_loss": 2.47322940826416,
655
+ "eval_runtime": 40.2079,
656
+ "eval_samples_per_second": 43.673,
657
+ "eval_steps_per_second": 43.673,
658
+ "step": 3600
659
+ },
660
+ {
661
+ "epoch": 0.9242261188833323,
662
+ "grad_norm": 0.9144394993782043,
663
+ "learning_rate": 3.844963421496905e-05,
664
+ "loss": 2.6148,
665
+ "step": 3650
666
+ },
667
+ {
668
+ "epoch": 0.9368867506488574,
669
+ "grad_norm": 1.4106061458587646,
670
+ "learning_rate": 3.821515663102608e-05,
671
+ "loss": 2.6586,
672
+ "step": 3700
673
+ },
674
+ {
675
+ "epoch": 0.9495473824143825,
676
+ "grad_norm": 1.414415717124939,
677
+ "learning_rate": 3.79806790470831e-05,
678
+ "loss": 2.5874,
679
+ "step": 3750
680
+ },
681
+ {
682
+ "epoch": 0.9622080141799075,
683
+ "grad_norm": 1.5448992252349854,
684
+ "learning_rate": 3.774620146314012e-05,
685
+ "loss": 2.6422,
686
+ "step": 3800
687
+ },
688
+ {
689
+ "epoch": 0.9622080141799075,
690
+ "eval_loss": 2.4701173305511475,
691
+ "eval_runtime": 40.1267,
692
+ "eval_samples_per_second": 43.761,
693
+ "eval_steps_per_second": 43.761,
694
+ "step": 3800
695
+ },
696
+ {
697
+ "epoch": 0.9748686459454327,
698
+ "grad_norm": 1.1959314346313477,
699
+ "learning_rate": 3.751172387919715e-05,
700
+ "loss": 2.6975,
701
+ "step": 3850
702
+ },
703
+ {
704
+ "epoch": 0.9875292777109578,
705
+ "grad_norm": 0.9525274038314819,
706
+ "learning_rate": 3.727724629525417e-05,
707
+ "loss": 2.6675,
708
+ "step": 3900
709
+ },
710
+ {
711
+ "epoch": 1.0,
712
+ "grad_norm": 4.733253479003906,
713
+ "learning_rate": 3.70427687113112e-05,
714
+ "loss": 2.566,
715
+ "step": 3950
716
+ },
717
+ {
718
+ "epoch": 1.0126606317655251,
719
+ "grad_norm": 1.2803192138671875,
720
+ "learning_rate": 3.680829112736823e-05,
721
+ "loss": 2.5659,
722
+ "step": 4000
723
+ },
724
+ {
725
+ "epoch": 1.0126606317655251,
726
+ "eval_loss": 2.4702188968658447,
727
+ "eval_runtime": 40.1387,
728
+ "eval_samples_per_second": 43.748,
729
+ "eval_steps_per_second": 43.748,
730
+ "step": 4000
731
+ },
732
+ {
733
+ "epoch": 1.0253212635310502,
734
+ "grad_norm": 1.446990966796875,
735
+ "learning_rate": 3.657381354342525e-05,
736
+ "loss": 2.627,
737
+ "step": 4050
738
+ },
739
+ {
740
+ "epoch": 1.0379818952965754,
741
+ "grad_norm": 1.3563008308410645,
742
+ "learning_rate": 3.633933595948227e-05,
743
+ "loss": 2.6252,
744
+ "step": 4100
745
+ },
746
+ {
747
+ "epoch": 1.0506425270621005,
748
+ "grad_norm": 1.5763463973999023,
749
+ "learning_rate": 3.61048583755393e-05,
750
+ "loss": 2.6593,
751
+ "step": 4150
752
+ },
753
+ {
754
+ "epoch": 1.0633031588276256,
755
+ "grad_norm": 1.0055335760116577,
756
+ "learning_rate": 3.587038079159633e-05,
757
+ "loss": 2.5955,
758
+ "step": 4200
759
+ },
760
+ {
761
+ "epoch": 1.0633031588276256,
762
+ "eval_loss": 2.4676930904388428,
763
+ "eval_runtime": 40.0342,
764
+ "eval_samples_per_second": 43.863,
765
+ "eval_steps_per_second": 43.863,
766
+ "step": 4200
767
+ },
768
+ {
769
+ "epoch": 1.0759637905931505,
770
+ "grad_norm": 1.7013343572616577,
771
+ "learning_rate": 3.563590320765335e-05,
772
+ "loss": 2.59,
773
+ "step": 4250
774
+ },
775
+ {
776
+ "epoch": 1.0886244223586756,
777
+ "grad_norm": 1.541069507598877,
778
+ "learning_rate": 3.540142562371038e-05,
779
+ "loss": 2.6192,
780
+ "step": 4300
781
+ },
782
+ {
783
+ "epoch": 1.1012850541242007,
784
+ "grad_norm": 1.2536805868148804,
785
+ "learning_rate": 3.51669480397674e-05,
786
+ "loss": 2.6225,
787
+ "step": 4350
788
+ },
789
+ {
790
+ "epoch": 1.1139456858897259,
791
+ "grad_norm": 1.8328826427459717,
792
+ "learning_rate": 3.493247045582442e-05,
793
+ "loss": 2.6022,
794
+ "step": 4400
795
+ },
796
+ {
797
+ "epoch": 1.1139456858897259,
798
+ "eval_loss": 2.465629816055298,
799
+ "eval_runtime": 39.8532,
800
+ "eval_samples_per_second": 44.062,
801
+ "eval_steps_per_second": 44.062,
802
+ "step": 4400
803
+ },
804
+ {
805
+ "epoch": 1.126606317655251,
806
+ "grad_norm": 1.8557270765304565,
807
+ "learning_rate": 3.469799287188145e-05,
808
+ "loss": 2.6496,
809
+ "step": 4450
810
+ },
811
+ {
812
+ "epoch": 1.139266949420776,
813
+ "grad_norm": 1.3255618810653687,
814
+ "learning_rate": 3.446820483961734e-05,
815
+ "loss": 2.5315,
816
+ "step": 4500
817
+ },
818
+ {
819
+ "epoch": 1.1519275811863012,
820
+ "grad_norm": 1.2192399501800537,
821
+ "learning_rate": 3.423372725567436e-05,
822
+ "loss": 2.5409,
823
+ "step": 4550
824
+ },
825
+ {
826
+ "epoch": 1.1645882129518264,
827
+ "grad_norm": 1.2533234357833862,
828
+ "learning_rate": 3.399924967173139e-05,
829
+ "loss": 2.6457,
830
+ "step": 4600
831
+ },
832
+ {
833
+ "epoch": 1.1645882129518264,
834
+ "eval_loss": 2.462162733078003,
835
+ "eval_runtime": 40.0542,
836
+ "eval_samples_per_second": 43.841,
837
+ "eval_steps_per_second": 43.841,
838
+ "step": 4600
839
+ },
840
+ {
841
+ "epoch": 1.1772488447173515,
842
+ "grad_norm": 1.8414678573608398,
843
+ "learning_rate": 3.376477208778841e-05,
844
+ "loss": 2.5658,
845
+ "step": 4650
846
+ },
847
+ {
848
+ "epoch": 1.1899094764828764,
849
+ "grad_norm": 1.568259596824646,
850
+ "learning_rate": 3.3530294503845436e-05,
851
+ "loss": 2.5771,
852
+ "step": 4700
853
+ },
854
+ {
855
+ "epoch": 1.2025701082484015,
856
+ "grad_norm": 1.3547483682632446,
857
+ "learning_rate": 3.3295816919902464e-05,
858
+ "loss": 2.6525,
859
+ "step": 4750
860
+ },
861
+ {
862
+ "epoch": 1.2152307400139266,
863
+ "grad_norm": 1.1655386686325073,
864
+ "learning_rate": 3.3061339335959486e-05,
865
+ "loss": 2.6421,
866
+ "step": 4800
867
+ },
868
+ {
869
+ "epoch": 1.2152307400139266,
870
+ "eval_loss": 2.461489200592041,
871
+ "eval_runtime": 39.9962,
872
+ "eval_samples_per_second": 43.904,
873
+ "eval_steps_per_second": 43.904,
874
+ "step": 4800
875
+ },
876
+ {
877
+ "epoch": 1.2278913717794517,
878
+ "grad_norm": 1.798033595085144,
879
+ "learning_rate": 3.282686175201651e-05,
880
+ "loss": 2.6091,
881
+ "step": 4850
882
+ },
883
+ {
884
+ "epoch": 1.2405520035449769,
885
+ "grad_norm": 3.2964117527008057,
886
+ "learning_rate": 3.2592384168073535e-05,
887
+ "loss": 2.5997,
888
+ "step": 4900
889
+ },
890
+ {
891
+ "epoch": 1.253212635310502,
892
+ "grad_norm": 1.0457675457000732,
893
+ "learning_rate": 3.2357906584130557e-05,
894
+ "loss": 2.6144,
895
+ "step": 4950
896
+ },
897
+ {
898
+ "epoch": 1.265873267076027,
899
+ "grad_norm": 0.9728056192398071,
900
+ "learning_rate": 3.2123429000187585e-05,
901
+ "loss": 2.5712,
902
+ "step": 5000
903
+ },
904
+ {
905
+ "epoch": 1.265873267076027,
906
+ "eval_loss": 2.460186719894409,
907
+ "eval_runtime": 39.8386,
908
+ "eval_samples_per_second": 44.078,
909
+ "eval_steps_per_second": 44.078,
910
+ "step": 5000
911
+ },
912
+ {
913
+ "epoch": 1.2785338988415522,
914
+ "grad_norm": 1.2350194454193115,
915
+ "learning_rate": 3.188895141624461e-05,
916
+ "loss": 2.5448,
917
+ "step": 5050
918
+ },
919
+ {
920
+ "epoch": 1.2911945306070773,
921
+ "grad_norm": 1.4210622310638428,
922
+ "learning_rate": 3.1654473832301634e-05,
923
+ "loss": 2.6031,
924
+ "step": 5100
925
+ },
926
+ {
927
+ "epoch": 1.3038551623726025,
928
+ "grad_norm": 2.226473093032837,
929
+ "learning_rate": 3.1419996248358656e-05,
930
+ "loss": 2.6597,
931
+ "step": 5150
932
+ },
933
+ {
934
+ "epoch": 1.3165157941381276,
935
+ "grad_norm": 2.4525105953216553,
936
+ "learning_rate": 3.1185518664415684e-05,
937
+ "loss": 2.596,
938
+ "step": 5200
939
+ },
940
+ {
941
+ "epoch": 1.3165157941381276,
942
+ "eval_loss": 2.454537868499756,
943
+ "eval_runtime": 39.805,
944
+ "eval_samples_per_second": 44.115,
945
+ "eval_steps_per_second": 44.115,
946
+ "step": 5200
947
+ },
948
+ {
949
+ "epoch": 1.3291764259036527,
950
+ "grad_norm": 1.265309453010559,
951
+ "learning_rate": 3.095104108047271e-05,
952
+ "loss": 2.5559,
953
+ "step": 5250
954
+ },
955
+ {
956
+ "epoch": 1.3418370576691778,
957
+ "grad_norm": 2.1364307403564453,
958
+ "learning_rate": 3.071656349652973e-05,
959
+ "loss": 2.5859,
960
+ "step": 5300
961
+ },
962
+ {
963
+ "epoch": 1.3544976894347027,
964
+ "grad_norm": 1.5945920944213867,
965
+ "learning_rate": 3.048208591258676e-05,
966
+ "loss": 2.5778,
967
+ "step": 5350
968
+ },
969
+ {
970
+ "epoch": 1.3671583212002278,
971
+ "grad_norm": 1.2479759454727173,
972
+ "learning_rate": 3.0247608328643783e-05,
973
+ "loss": 2.6846,
974
+ "step": 5400
975
+ },
976
+ {
977
+ "epoch": 1.3671583212002278,
978
+ "eval_loss": 2.4547293186187744,
979
+ "eval_runtime": 39.7806,
980
+ "eval_samples_per_second": 44.142,
981
+ "eval_steps_per_second": 44.142,
982
+ "step": 5400
983
+ },
984
+ {
985
+ "epoch": 1.379818952965753,
986
+ "grad_norm": 1.4845050573349,
987
+ "learning_rate": 3.0013130744700808e-05,
988
+ "loss": 2.5661,
989
+ "step": 5450
990
+ },
991
+ {
992
+ "epoch": 1.392479584731278,
993
+ "grad_norm": 1.5581985712051392,
994
+ "learning_rate": 2.9778653160757836e-05,
995
+ "loss": 2.5441,
996
+ "step": 5500
997
+ },
998
+ {
999
+ "epoch": 1.4051402164968032,
1000
+ "grad_norm": 3.1663737297058105,
1001
+ "learning_rate": 2.9544175576814857e-05,
1002
+ "loss": 2.5044,
1003
+ "step": 5550
1004
+ },
1005
+ {
1006
+ "epoch": 1.4178008482623283,
1007
+ "grad_norm": 1.2454484701156616,
1008
+ "learning_rate": 2.9309697992871882e-05,
1009
+ "loss": 2.5747,
1010
+ "step": 5600
1011
+ },
1012
+ {
1013
+ "epoch": 1.4178008482623283,
1014
+ "eval_loss": 2.4544529914855957,
1015
+ "eval_runtime": 39.9287,
1016
+ "eval_samples_per_second": 43.978,
1017
+ "eval_steps_per_second": 43.978,
1018
+ "step": 5600
1019
+ },
1020
+ {
1021
+ "epoch": 1.4304614800278534,
1022
+ "grad_norm": 1.662784457206726,
1023
+ "learning_rate": 2.907522040892891e-05,
1024
+ "loss": 2.6064,
1025
+ "step": 5650
1026
+ },
1027
+ {
1028
+ "epoch": 1.4431221117933786,
1029
+ "grad_norm": 1.618458867073059,
1030
+ "learning_rate": 2.8840742824985935e-05,
1031
+ "loss": 2.5191,
1032
+ "step": 5700
1033
+ },
1034
+ {
1035
+ "epoch": 1.4557827435589035,
1036
+ "grad_norm": 1.3003348112106323,
1037
+ "learning_rate": 2.8606265241042956e-05,
1038
+ "loss": 2.5339,
1039
+ "step": 5750
1040
+ },
1041
+ {
1042
+ "epoch": 1.4684433753244286,
1043
+ "grad_norm": 1.1443992853164673,
1044
+ "learning_rate": 2.8371787657099984e-05,
1045
+ "loss": 2.5914,
1046
+ "step": 5800
1047
+ },
1048
+ {
1049
+ "epoch": 1.4684433753244286,
1050
+ "eval_loss": 2.453752279281616,
1051
+ "eval_runtime": 39.8234,
1052
+ "eval_samples_per_second": 44.095,
1053
+ "eval_steps_per_second": 44.095,
1054
+ "step": 5800
1055
+ },
1056
+ {
1057
+ "epoch": 1.4811040070899537,
1058
+ "grad_norm": 1.2574009895324707,
1059
+ "learning_rate": 2.813731007315701e-05,
1060
+ "loss": 2.6109,
1061
+ "step": 5850
1062
+ },
1063
+ {
1064
+ "epoch": 1.4937646388554788,
1065
+ "grad_norm": 1.002815842628479,
1066
+ "learning_rate": 2.790283248921403e-05,
1067
+ "loss": 2.6075,
1068
+ "step": 5900
1069
+ },
1070
+ {
1071
+ "epoch": 1.506425270621004,
1072
+ "grad_norm": 1.306024432182312,
1073
+ "learning_rate": 2.766835490527106e-05,
1074
+ "loss": 2.5733,
1075
+ "step": 5950
1076
+ },
1077
+ {
1078
+ "epoch": 1.519085902386529,
1079
+ "grad_norm": 2.5023701190948486,
1080
+ "learning_rate": 2.7433877321328083e-05,
1081
+ "loss": 2.6274,
1082
+ "step": 6000
1083
+ },
1084
+ {
1085
+ "epoch": 1.519085902386529,
1086
+ "eval_loss": 2.449617862701416,
1087
+ "eval_runtime": 39.9401,
1088
+ "eval_samples_per_second": 43.966,
1089
+ "eval_steps_per_second": 43.966,
1090
+ "step": 6000
1091
+ },
1092
+ {
1093
+ "epoch": 1.5317465341520542,
1094
+ "grad_norm": 1.9410326480865479,
1095
+ "learning_rate": 2.7199399737385105e-05,
1096
+ "loss": 2.5312,
1097
+ "step": 6050
1098
+ },
1099
+ {
1100
+ "epoch": 1.5444071659175793,
1101
+ "grad_norm": 1.9793561697006226,
1102
+ "learning_rate": 2.6964922153442136e-05,
1103
+ "loss": 2.5759,
1104
+ "step": 6100
1105
+ },
1106
+ {
1107
+ "epoch": 1.5570677976831044,
1108
+ "grad_norm": 1.290531873703003,
1109
+ "learning_rate": 2.6730444569499157e-05,
1110
+ "loss": 2.5817,
1111
+ "step": 6150
1112
+ },
1113
+ {
1114
+ "epoch": 1.5697284294486296,
1115
+ "grad_norm": 2.11389422416687,
1116
+ "learning_rate": 2.6495966985556182e-05,
1117
+ "loss": 2.6287,
1118
+ "step": 6200
1119
+ },
1120
+ {
1121
+ "epoch": 1.5697284294486296,
1122
+ "eval_loss": 2.4490554332733154,
1123
+ "eval_runtime": 39.7474,
1124
+ "eval_samples_per_second": 44.179,
1125
+ "eval_steps_per_second": 44.179,
1126
+ "step": 6200
1127
+ },
1128
+ {
1129
+ "epoch": 1.5823890612141547,
1130
+ "grad_norm": 1.6492938995361328,
1131
+ "learning_rate": 2.626148940161321e-05,
1132
+ "loss": 2.631,
1133
+ "step": 6250
1134
+ },
1135
+ {
1136
+ "epoch": 1.5950496929796798,
1137
+ "grad_norm": 1.3233673572540283,
1138
+ "learning_rate": 2.6027011817670232e-05,
1139
+ "loss": 2.5654,
1140
+ "step": 6300
1141
+ },
1142
+ {
1143
+ "epoch": 1.607710324745205,
1144
+ "grad_norm": 1.688264012336731,
1145
+ "learning_rate": 2.5792534233727257e-05,
1146
+ "loss": 2.6096,
1147
+ "step": 6350
1148
+ },
1149
+ {
1150
+ "epoch": 1.62037095651073,
1151
+ "grad_norm": 2.064823865890503,
1152
+ "learning_rate": 2.5558056649784285e-05,
1153
+ "loss": 2.6275,
1154
+ "step": 6400
1155
+ },
1156
+ {
1157
+ "epoch": 1.62037095651073,
1158
+ "eval_loss": 2.4488983154296875,
1159
+ "eval_runtime": 39.6847,
1160
+ "eval_samples_per_second": 44.249,
1161
+ "eval_steps_per_second": 44.249,
1162
+ "step": 6400
1163
+ },
1164
+ {
1165
+ "epoch": 1.633031588276255,
1166
+ "grad_norm": 1.5599696636199951,
1167
+ "learning_rate": 2.5323579065841306e-05,
1168
+ "loss": 2.6334,
1169
+ "step": 6450
1170
+ },
1171
+ {
1172
+ "epoch": 1.64569222004178,
1173
+ "grad_norm": 1.3142633438110352,
1174
+ "learning_rate": 2.508910148189833e-05,
1175
+ "loss": 2.5496,
1176
+ "step": 6500
1177
+ },
1178
+ {
1179
+ "epoch": 1.6583528518073052,
1180
+ "grad_norm": 1.474135160446167,
1181
+ "learning_rate": 2.4854623897955356e-05,
1182
+ "loss": 2.5628,
1183
+ "step": 6550
1184
+ },
1185
+ {
1186
+ "epoch": 1.6710134835728303,
1187
+ "grad_norm": 1.3737610578536987,
1188
+ "learning_rate": 2.4620146314012384e-05,
1189
+ "loss": 2.5345,
1190
+ "step": 6600
1191
+ },
1192
+ {
1193
+ "epoch": 1.6710134835728303,
1194
+ "eval_loss": 2.447559356689453,
1195
+ "eval_runtime": 39.7021,
1196
+ "eval_samples_per_second": 44.229,
1197
+ "eval_steps_per_second": 44.229,
1198
+ "step": 6600
1199
+ },
1200
+ {
1201
+ "epoch": 1.6836741153383554,
1202
+ "grad_norm": 1.2432060241699219,
1203
+ "learning_rate": 2.4385668730069405e-05,
1204
+ "loss": 2.5977,
1205
+ "step": 6650
1206
+ },
1207
+ {
1208
+ "epoch": 1.6963347471038803,
1209
+ "grad_norm": 1.465063452720642,
1210
+ "learning_rate": 2.415119114612643e-05,
1211
+ "loss": 2.6118,
1212
+ "step": 6700
1213
+ },
1214
+ {
1215
+ "epoch": 1.7089953788694054,
1216
+ "grad_norm": 1.5186200141906738,
1217
+ "learning_rate": 2.3916713562183458e-05,
1218
+ "loss": 2.6126,
1219
+ "step": 6750
1220
+ },
1221
+ {
1222
+ "epoch": 1.7216560106349306,
1223
+ "grad_norm": 1.6869078874588013,
1224
+ "learning_rate": 2.368223597824048e-05,
1225
+ "loss": 2.576,
1226
+ "step": 6800
1227
+ },
1228
+ {
1229
+ "epoch": 1.7216560106349306,
1230
+ "eval_loss": 2.4459502696990967,
1231
+ "eval_runtime": 39.7653,
1232
+ "eval_samples_per_second": 44.159,
1233
+ "eval_steps_per_second": 44.159,
1234
+ "step": 6800
1235
+ },
1236
+ {
1237
+ "epoch": 1.7343166424004557,
1238
+ "grad_norm": 1.2578104734420776,
1239
+ "learning_rate": 2.3447758394297507e-05,
1240
+ "loss": 2.6178,
1241
+ "step": 6850
1242
+ },
1243
+ {
1244
+ "epoch": 1.7469772741659808,
1245
+ "grad_norm": 1.7597213983535767,
1246
+ "learning_rate": 2.3213280810354532e-05,
1247
+ "loss": 2.6358,
1248
+ "step": 6900
1249
+ },
1250
+ {
1251
+ "epoch": 1.759637905931506,
1252
+ "grad_norm": 2.144465923309326,
1253
+ "learning_rate": 2.2978803226411554e-05,
1254
+ "loss": 2.5597,
1255
+ "step": 6950
1256
+ },
1257
+ {
1258
+ "epoch": 1.772298537697031,
1259
+ "grad_norm": 1.1808464527130127,
1260
+ "learning_rate": 2.2744325642468582e-05,
1261
+ "loss": 2.6269,
1262
+ "step": 7000
1263
+ },
1264
+ {
1265
+ "epoch": 1.772298537697031,
1266
+ "eval_loss": 2.4444611072540283,
1267
+ "eval_runtime": 40.0709,
1268
+ "eval_samples_per_second": 43.822,
1269
+ "eval_steps_per_second": 43.822,
1270
+ "step": 7000
1271
+ },
1272
+ {
1273
+ "epoch": 1.7849591694625562,
1274
+ "grad_norm": 1.4550806283950806,
1275
+ "learning_rate": 2.2509848058525606e-05,
1276
+ "loss": 2.6206,
1277
+ "step": 7050
1278
+ },
1279
+ {
1280
+ "epoch": 1.7976198012280813,
1281
+ "grad_norm": 1.2635902166366577,
1282
+ "learning_rate": 2.227537047458263e-05,
1283
+ "loss": 2.5722,
1284
+ "step": 7100
1285
+ },
1286
+ {
1287
+ "epoch": 1.8102804329936064,
1288
+ "grad_norm": 1.3835856914520264,
1289
+ "learning_rate": 2.2040892890639656e-05,
1290
+ "loss": 2.535,
1291
+ "step": 7150
1292
+ },
1293
+ {
1294
+ "epoch": 1.8229410647591315,
1295
+ "grad_norm": 1.735004186630249,
1296
+ "learning_rate": 2.180641530669668e-05,
1297
+ "loss": 2.6086,
1298
+ "step": 7200
1299
+ },
1300
+ {
1301
+ "epoch": 1.8229410647591315,
1302
+ "eval_loss": 2.443899154663086,
1303
+ "eval_runtime": 40.91,
1304
+ "eval_samples_per_second": 42.924,
1305
+ "eval_steps_per_second": 42.924,
1306
+ "step": 7200
1307
+ },
1308
+ {
1309
+ "epoch": 1.8356016965246567,
1310
+ "grad_norm": 1.263051986694336,
1311
+ "learning_rate": 2.1571937722753706e-05,
1312
+ "loss": 2.5544,
1313
+ "step": 7250
1314
+ },
1315
+ {
1316
+ "epoch": 1.8482623282901818,
1317
+ "grad_norm": 1.0899442434310913,
1318
+ "learning_rate": 2.133746013881073e-05,
1319
+ "loss": 2.5603,
1320
+ "step": 7300
1321
+ },
1322
+ {
1323
+ "epoch": 1.860922960055707,
1324
+ "grad_norm": 3.038811206817627,
1325
+ "learning_rate": 2.1102982554867755e-05,
1326
+ "loss": 2.5688,
1327
+ "step": 7350
1328
+ },
1329
+ {
1330
+ "epoch": 1.873583591821232,
1331
+ "grad_norm": 1.6385984420776367,
1332
+ "learning_rate": 2.086850497092478e-05,
1333
+ "loss": 2.6006,
1334
+ "step": 7400
1335
+ },
1336
+ {
1337
+ "epoch": 1.873583591821232,
1338
+ "eval_loss": 2.443300724029541,
1339
+ "eval_runtime": 40.6649,
1340
+ "eval_samples_per_second": 43.182,
1341
+ "eval_steps_per_second": 43.182,
1342
+ "step": 7400
1343
+ },
1344
+ {
1345
+ "epoch": 1.8862442235867571,
1346
+ "grad_norm": 1.2857129573822021,
1347
+ "learning_rate": 2.0634027386981805e-05,
1348
+ "loss": 2.5563,
1349
+ "step": 7450
1350
+ },
1351
+ {
1352
+ "epoch": 1.898904855352282,
1353
+ "grad_norm": 1.0289497375488281,
1354
+ "learning_rate": 2.0399549803038833e-05,
1355
+ "loss": 2.5671,
1356
+ "step": 7500
1357
+ },
1358
+ {
1359
+ "epoch": 1.9115654871178072,
1360
+ "grad_norm": 1.5041025876998901,
1361
+ "learning_rate": 2.0165072219095854e-05,
1362
+ "loss": 2.5689,
1363
+ "step": 7550
1364
+ },
1365
+ {
1366
+ "epoch": 1.9242261188833323,
1367
+ "grad_norm": 1.6611964702606201,
1368
+ "learning_rate": 1.993528418683174e-05,
1369
+ "loss": 2.5801,
1370
+ "step": 7600
1371
+ },
1372
+ {
1373
+ "epoch": 1.9242261188833323,
1374
+ "eval_loss": 2.443532943725586,
1375
+ "eval_runtime": 39.931,
1376
+ "eval_samples_per_second": 43.976,
1377
+ "eval_steps_per_second": 43.976,
1378
+ "step": 7600
1379
+ },
1380
+ {
1381
+ "epoch": 1.9368867506488574,
1382
+ "grad_norm": 1.521170735359192,
1383
+ "learning_rate": 1.9700806602888767e-05,
1384
+ "loss": 2.5969,
1385
+ "step": 7650
1386
+ },
1387
+ {
1388
+ "epoch": 1.9495473824143825,
1389
+ "grad_norm": 1.3700034618377686,
1390
+ "learning_rate": 1.946632901894579e-05,
1391
+ "loss": 2.6306,
1392
+ "step": 7700
1393
+ },
1394
+ {
1395
+ "epoch": 1.9622080141799074,
1396
+ "grad_norm": 2.311443328857422,
1397
+ "learning_rate": 1.9231851435002814e-05,
1398
+ "loss": 2.5608,
1399
+ "step": 7750
1400
+ },
1401
+ {
1402
+ "epoch": 1.9748686459454325,
1403
+ "grad_norm": 1.6699820756912231,
1404
+ "learning_rate": 1.8997373851059842e-05,
1405
+ "loss": 2.5113,
1406
+ "step": 7800
1407
+ },
1408
+ {
1409
+ "epoch": 1.9748686459454325,
1410
+ "eval_loss": 2.4421675205230713,
1411
+ "eval_runtime": 40.1783,
1412
+ "eval_samples_per_second": 43.705,
1413
+ "eval_steps_per_second": 43.705,
1414
+ "step": 7800
1415
+ },
1416
+ {
1417
+ "epoch": 1.9875292777109577,
1418
+ "grad_norm": 1.2560683488845825,
1419
+ "learning_rate": 1.8762896267116863e-05,
1420
+ "loss": 2.545,
1421
+ "step": 7850
1422
+ },
1423
+ {
1424
+ "epoch": 2.0,
1425
+ "grad_norm": 2.176563262939453,
1426
+ "learning_rate": 1.852841868317389e-05,
1427
+ "loss": 2.5752,
1428
+ "step": 7900
1429
+ },
1430
+ {
1431
+ "epoch": 2.012660631765525,
1432
+ "grad_norm": 1.2551178932189941,
1433
+ "learning_rate": 1.8293941099230916e-05,
1434
+ "loss": 2.5215,
1435
+ "step": 7950
1436
+ },
1437
+ {
1438
+ "epoch": 2.0253212635310502,
1439
+ "grad_norm": 1.5646872520446777,
1440
+ "learning_rate": 1.8059463515287937e-05,
1441
+ "loss": 2.5838,
1442
+ "step": 8000
1443
+ },
1444
+ {
1445
+ "epoch": 2.0253212635310502,
1446
+ "eval_loss": 2.441195249557495,
1447
+ "eval_runtime": 39.701,
1448
+ "eval_samples_per_second": 44.231,
1449
+ "eval_steps_per_second": 44.231,
1450
+ "step": 8000
1451
+ },
1452
+ {
1453
+ "epoch": 2.0379818952965754,
1454
+ "grad_norm": 1.4227900505065918,
1455
+ "learning_rate": 1.7824985931344966e-05,
1456
+ "loss": 2.5597,
1457
+ "step": 8050
1458
+ },
1459
+ {
1460
+ "epoch": 2.0506425270621005,
1461
+ "grad_norm": 1.3013832569122314,
1462
+ "learning_rate": 1.759050834740199e-05,
1463
+ "loss": 2.7641,
1464
+ "step": 8100
1465
+ },
1466
+ {
1467
+ "epoch": 2.0633031588276256,
1468
+ "grad_norm": 1.1282143592834473,
1469
+ "learning_rate": 1.7356030763459015e-05,
1470
+ "loss": 2.5875,
1471
+ "step": 8150
1472
+ },
1473
+ {
1474
+ "epoch": 2.0759637905931507,
1475
+ "grad_norm": 2.079760789871216,
1476
+ "learning_rate": 1.712155317951604e-05,
1477
+ "loss": 2.4861,
1478
+ "step": 8200
1479
+ },
1480
+ {
1481
+ "epoch": 2.0759637905931507,
1482
+ "eval_loss": 2.440812826156616,
1483
+ "eval_runtime": 40.0764,
1484
+ "eval_samples_per_second": 43.816,
1485
+ "eval_steps_per_second": 43.816,
1486
+ "step": 8200
1487
+ },
1488
+ {
1489
+ "epoch": 2.088624422358676,
1490
+ "grad_norm": 1.0884991884231567,
1491
+ "learning_rate": 1.6887075595573065e-05,
1492
+ "loss": 2.5941,
1493
+ "step": 8250
1494
+ },
1495
+ {
1496
+ "epoch": 2.101285054124201,
1497
+ "grad_norm": 1.9202015399932861,
1498
+ "learning_rate": 1.665259801163009e-05,
1499
+ "loss": 2.5929,
1500
+ "step": 8300
1501
+ },
1502
+ {
1503
+ "epoch": 2.113945685889726,
1504
+ "grad_norm": 1.5925830602645874,
1505
+ "learning_rate": 1.6418120427687114e-05,
1506
+ "loss": 2.5046,
1507
+ "step": 8350
1508
+ },
1509
+ {
1510
+ "epoch": 2.126606317655251,
1511
+ "grad_norm": 1.5219184160232544,
1512
+ "learning_rate": 1.618364284374414e-05,
1513
+ "loss": 2.5628,
1514
+ "step": 8400
1515
+ },
1516
+ {
1517
+ "epoch": 2.126606317655251,
1518
+ "eval_loss": 2.4396440982818604,
1519
+ "eval_runtime": 40.0053,
1520
+ "eval_samples_per_second": 43.894,
1521
+ "eval_steps_per_second": 43.894,
1522
+ "step": 8400
1523
+ },
1524
+ {
1525
+ "epoch": 2.139266949420776,
1526
+ "grad_norm": 1.4882445335388184,
1527
+ "learning_rate": 1.5949165259801164e-05,
1528
+ "loss": 2.6268,
1529
+ "step": 8450
1530
+ },
1531
+ {
1532
+ "epoch": 2.151927581186301,
1533
+ "grad_norm": 1.3513301610946655,
1534
+ "learning_rate": 1.571468767585819e-05,
1535
+ "loss": 2.5277,
1536
+ "step": 8500
1537
+ },
1538
+ {
1539
+ "epoch": 2.164588212951826,
1540
+ "grad_norm": 1.690974473953247,
1541
+ "learning_rate": 1.5480210091915216e-05,
1542
+ "loss": 2.5631,
1543
+ "step": 8550
1544
+ },
1545
+ {
1546
+ "epoch": 2.1772488447173513,
1547
+ "grad_norm": 1.5311528444290161,
1548
+ "learning_rate": 1.5245732507972238e-05,
1549
+ "loss": 2.5454,
1550
+ "step": 8600
1551
+ },
1552
+ {
1553
+ "epoch": 2.1772488447173513,
1554
+ "eval_loss": 2.4388718605041504,
1555
+ "eval_runtime": 40.0289,
1556
+ "eval_samples_per_second": 43.868,
1557
+ "eval_steps_per_second": 43.868,
1558
+ "step": 8600
1559
+ },
1560
+ {
1561
+ "epoch": 2.1899094764828764,
1562
+ "grad_norm": 2.1171281337738037,
1563
+ "learning_rate": 1.5011254924029264e-05,
1564
+ "loss": 2.6112,
1565
+ "step": 8650
1566
+ },
1567
+ {
1568
+ "epoch": 2.2025701082484015,
1569
+ "grad_norm": 1.9706814289093018,
1570
+ "learning_rate": 1.4776777340086289e-05,
1571
+ "loss": 2.588,
1572
+ "step": 8700
1573
+ },
1574
+ {
1575
+ "epoch": 2.2152307400139266,
1576
+ "grad_norm": 1.8991297483444214,
1577
+ "learning_rate": 1.4542299756143312e-05,
1578
+ "loss": 2.5655,
1579
+ "step": 8750
1580
+ },
1581
+ {
1582
+ "epoch": 2.2278913717794517,
1583
+ "grad_norm": 1.5568820238113403,
1584
+ "learning_rate": 1.4307822172200339e-05,
1585
+ "loss": 2.5312,
1586
+ "step": 8800
1587
+ },
1588
+ {
1589
+ "epoch": 2.2278913717794517,
1590
+ "eval_loss": 2.438715696334839,
1591
+ "eval_runtime": 40.1748,
1592
+ "eval_samples_per_second": 43.709,
1593
+ "eval_steps_per_second": 43.709,
1594
+ "step": 8800
1595
+ },
1596
+ {
1597
+ "epoch": 2.240552003544977,
1598
+ "grad_norm": 1.277051329612732,
1599
+ "learning_rate": 1.4073344588257365e-05,
1600
+ "loss": 2.5818,
1601
+ "step": 8850
1602
+ },
1603
+ {
1604
+ "epoch": 2.253212635310502,
1605
+ "grad_norm": 1.8890128135681152,
1606
+ "learning_rate": 1.3838867004314388e-05,
1607
+ "loss": 2.5211,
1608
+ "step": 8900
1609
+ },
1610
+ {
1611
+ "epoch": 2.265873267076027,
1612
+ "grad_norm": 1.8824830055236816,
1613
+ "learning_rate": 1.3604389420371413e-05,
1614
+ "loss": 2.53,
1615
+ "step": 8950
1616
+ },
1617
+ {
1618
+ "epoch": 2.278533898841552,
1619
+ "grad_norm": 1.239490032196045,
1620
+ "learning_rate": 1.336991183642844e-05,
1621
+ "loss": 2.5889,
1622
+ "step": 9000
1623
+ },
1624
+ {
1625
+ "epoch": 2.278533898841552,
1626
+ "eval_loss": 2.437577962875366,
1627
+ "eval_runtime": 40.1654,
1628
+ "eval_samples_per_second": 43.719,
1629
+ "eval_steps_per_second": 43.719,
1630
+ "step": 9000
1631
+ },
1632
+ {
1633
+ "epoch": 2.2911945306070773,
1634
+ "grad_norm": 1.7253328561782837,
1635
+ "learning_rate": 1.3135434252485462e-05,
1636
+ "loss": 2.5426,
1637
+ "step": 9050
1638
+ },
1639
+ {
1640
+ "epoch": 2.3038551623726025,
1641
+ "grad_norm": 1.6971838474273682,
1642
+ "learning_rate": 1.2900956668542489e-05,
1643
+ "loss": 2.4953,
1644
+ "step": 9100
1645
+ },
1646
+ {
1647
+ "epoch": 2.3165157941381276,
1648
+ "grad_norm": 1.4906270503997803,
1649
+ "learning_rate": 1.2666479084599514e-05,
1650
+ "loss": 2.606,
1651
+ "step": 9150
1652
+ },
1653
+ {
1654
+ "epoch": 2.3291764259036527,
1655
+ "grad_norm": 1.658526062965393,
1656
+ "learning_rate": 1.2432001500656538e-05,
1657
+ "loss": 2.5483,
1658
+ "step": 9200
1659
+ },
1660
+ {
1661
+ "epoch": 2.3291764259036527,
1662
+ "eval_loss": 2.437896490097046,
1663
+ "eval_runtime": 40.7008,
1664
+ "eval_samples_per_second": 43.144,
1665
+ "eval_steps_per_second": 43.144,
1666
+ "step": 9200
1667
+ },
1668
+ {
1669
+ "epoch": 2.341837057669178,
1670
+ "grad_norm": 1.0781177282333374,
1671
+ "learning_rate": 1.2197523916713563e-05,
1672
+ "loss": 2.5449,
1673
+ "step": 9250
1674
+ },
1675
+ {
1676
+ "epoch": 2.354497689434703,
1677
+ "grad_norm": 2.1414873600006104,
1678
+ "learning_rate": 1.1963046332770588e-05,
1679
+ "loss": 2.5303,
1680
+ "step": 9300
1681
+ },
1682
+ {
1683
+ "epoch": 2.367158321200228,
1684
+ "grad_norm": 2.063297986984253,
1685
+ "learning_rate": 1.1728568748827613e-05,
1686
+ "loss": 2.5837,
1687
+ "step": 9350
1688
+ },
1689
+ {
1690
+ "epoch": 2.3798189529657527,
1691
+ "grad_norm": 1.2153489589691162,
1692
+ "learning_rate": 1.1494091164884637e-05,
1693
+ "loss": 2.6384,
1694
+ "step": 9400
1695
+ },
1696
+ {
1697
+ "epoch": 2.3798189529657527,
1698
+ "eval_loss": 2.4365696907043457,
1699
+ "eval_runtime": 40.2398,
1700
+ "eval_samples_per_second": 43.638,
1701
+ "eval_steps_per_second": 43.638,
1702
+ "step": 9400
1703
+ },
1704
+ {
1705
+ "epoch": 2.3924795847312783,
1706
+ "grad_norm": 1.2976094484329224,
1707
+ "learning_rate": 1.1259613580941662e-05,
1708
+ "loss": 2.572,
1709
+ "step": 9450
1710
+ },
1711
+ {
1712
+ "epoch": 2.405140216496803,
1713
+ "grad_norm": 1.2775920629501343,
1714
+ "learning_rate": 1.1025135996998689e-05,
1715
+ "loss": 2.6083,
1716
+ "step": 9500
1717
+ },
1718
+ {
1719
+ "epoch": 2.417800848262328,
1720
+ "grad_norm": 1.358311653137207,
1721
+ "learning_rate": 1.0790658413055713e-05,
1722
+ "loss": 2.5206,
1723
+ "step": 9550
1724
+ },
1725
+ {
1726
+ "epoch": 2.4304614800278532,
1727
+ "grad_norm": 1.3438369035720825,
1728
+ "learning_rate": 1.0556180829112736e-05,
1729
+ "loss": 2.4967,
1730
+ "step": 9600
1731
+ },
1732
+ {
1733
+ "epoch": 2.4304614800278532,
1734
+ "eval_loss": 2.4359662532806396,
1735
+ "eval_runtime": 40.0802,
1736
+ "eval_samples_per_second": 43.812,
1737
+ "eval_steps_per_second": 43.812,
1738
+ "step": 9600
1739
+ },
1740
+ {
1741
+ "epoch": 2.4431221117933783,
1742
+ "grad_norm": 1.2618831396102905,
1743
+ "learning_rate": 1.0321703245169763e-05,
1744
+ "loss": 2.6169,
1745
+ "step": 9650
1746
+ },
1747
+ {
1748
+ "epoch": 2.4557827435589035,
1749
+ "grad_norm": 1.3764727115631104,
1750
+ "learning_rate": 1.0087225661226788e-05,
1751
+ "loss": 2.5444,
1752
+ "step": 9700
1753
+ },
1754
+ {
1755
+ "epoch": 2.4684433753244286,
1756
+ "grad_norm": 1.604864478111267,
1757
+ "learning_rate": 9.852748077283812e-06,
1758
+ "loss": 2.5343,
1759
+ "step": 9750
1760
+ },
1761
+ {
1762
+ "epoch": 2.4811040070899537,
1763
+ "grad_norm": 1.390496850013733,
1764
+ "learning_rate": 9.618270493340837e-06,
1765
+ "loss": 2.5051,
1766
+ "step": 9800
1767
+ },
1768
+ {
1769
+ "epoch": 2.4811040070899537,
1770
+ "eval_loss": 2.4353232383728027,
1771
+ "eval_runtime": 40.1607,
1772
+ "eval_samples_per_second": 43.724,
1773
+ "eval_steps_per_second": 43.724,
1774
+ "step": 9800
1775
+ },
1776
+ {
1777
+ "epoch": 2.493764638855479,
1778
+ "grad_norm": 2.1982169151306152,
1779
+ "learning_rate": 9.383792909397862e-06,
1780
+ "loss": 2.5036,
1781
+ "step": 9850
1782
+ },
1783
+ {
1784
+ "epoch": 2.506425270621004,
1785
+ "grad_norm": 1.3033822774887085,
1786
+ "learning_rate": 9.149315325454887e-06,
1787
+ "loss": 2.5205,
1788
+ "step": 9900
1789
+ },
1790
+ {
1791
+ "epoch": 2.519085902386529,
1792
+ "grad_norm": 1.682586431503296,
1793
+ "learning_rate": 8.919527293190772e-06,
1794
+ "loss": 2.6083,
1795
+ "step": 9950
1796
+ },
1797
+ {
1798
+ "epoch": 2.531746534152054,
1799
+ "grad_norm": 3.184382200241089,
1800
+ "learning_rate": 8.685049709247797e-06,
1801
+ "loss": 2.5314,
1802
+ "step": 10000
1803
+ },
1804
+ {
1805
+ "epoch": 2.531746534152054,
1806
+ "eval_loss": 2.434755802154541,
1807
+ "eval_runtime": 40.2877,
1808
+ "eval_samples_per_second": 43.587,
1809
+ "eval_steps_per_second": 43.587,
1810
+ "step": 10000
1811
+ },
1812
+ {
1813
+ "epoch": 2.5444071659175793,
1814
+ "grad_norm": 2.0026867389678955,
1815
+ "learning_rate": 8.450572125304821e-06,
1816
+ "loss": 2.5109,
1817
+ "step": 10050
1818
+ },
1819
+ {
1820
+ "epoch": 2.5570677976831044,
1821
+ "grad_norm": 1.3833885192871094,
1822
+ "learning_rate": 8.216094541361846e-06,
1823
+ "loss": 2.5362,
1824
+ "step": 10100
1825
+ },
1826
+ {
1827
+ "epoch": 2.5697284294486296,
1828
+ "grad_norm": 2.157984495162964,
1829
+ "learning_rate": 7.981616957418871e-06,
1830
+ "loss": 2.5423,
1831
+ "step": 10150
1832
+ },
1833
+ {
1834
+ "epoch": 2.5823890612141547,
1835
+ "grad_norm": 1.682053565979004,
1836
+ "learning_rate": 7.747139373475897e-06,
1837
+ "loss": 2.5133,
1838
+ "step": 10200
1839
+ },
1840
+ {
1841
+ "epoch": 2.5823890612141547,
1842
+ "eval_loss": 2.435208559036255,
1843
+ "eval_runtime": 40.4768,
1844
+ "eval_samples_per_second": 43.383,
1845
+ "eval_steps_per_second": 43.383,
1846
+ "step": 10200
1847
+ },
1848
+ {
1849
+ "epoch": 2.59504969297968,
1850
+ "grad_norm": 1.9720139503479004,
1851
+ "learning_rate": 7.512661789532921e-06,
1852
+ "loss": 2.6372,
1853
+ "step": 10250
1854
+ },
1855
+ {
1856
+ "epoch": 2.607710324745205,
1857
+ "grad_norm": 1.6906607151031494,
1858
+ "learning_rate": 7.278184205589945e-06,
1859
+ "loss": 2.5505,
1860
+ "step": 10300
1861
+ },
1862
+ {
1863
+ "epoch": 2.62037095651073,
1864
+ "grad_norm": 1.484045147895813,
1865
+ "learning_rate": 7.043706621646972e-06,
1866
+ "loss": 2.5095,
1867
+ "step": 10350
1868
+ },
1869
+ {
1870
+ "epoch": 2.633031588276255,
1871
+ "grad_norm": 1.6676850318908691,
1872
+ "learning_rate": 6.8092290377039955e-06,
1873
+ "loss": 2.6487,
1874
+ "step": 10400
1875
+ },
1876
+ {
1877
+ "epoch": 2.633031588276255,
1878
+ "eval_loss": 2.4344091415405273,
1879
+ "eval_runtime": 39.869,
1880
+ "eval_samples_per_second": 44.044,
1881
+ "eval_steps_per_second": 44.044,
1882
+ "step": 10400
1883
+ },
1884
+ {
1885
+ "epoch": 2.64569222004178,
1886
+ "grad_norm": 1.5012388229370117,
1887
+ "learning_rate": 6.57475145376102e-06,
1888
+ "loss": 2.5756,
1889
+ "step": 10450
1890
+ },
1891
+ {
1892
+ "epoch": 2.6583528518073054,
1893
+ "grad_norm": 1.043954849243164,
1894
+ "learning_rate": 6.340273869818046e-06,
1895
+ "loss": 2.5843,
1896
+ "step": 10500
1897
+ },
1898
+ {
1899
+ "epoch": 2.67101348357283,
1900
+ "grad_norm": 1.0455141067504883,
1901
+ "learning_rate": 6.105796285875071e-06,
1902
+ "loss": 2.5248,
1903
+ "step": 10550
1904
+ },
1905
+ {
1906
+ "epoch": 2.6836741153383556,
1907
+ "grad_norm": 1.39467453956604,
1908
+ "learning_rate": 5.871318701932095e-06,
1909
+ "loss": 2.5091,
1910
+ "step": 10600
1911
+ },
1912
+ {
1913
+ "epoch": 2.6836741153383556,
1914
+ "eval_loss": 2.4331610202789307,
1915
+ "eval_runtime": 39.923,
1916
+ "eval_samples_per_second": 43.985,
1917
+ "eval_steps_per_second": 43.985,
1918
+ "step": 10600
1919
+ },
1920
+ {
1921
+ "epoch": 2.6963347471038803,
1922
+ "grad_norm": 1.1417715549468994,
1923
+ "learning_rate": 5.63684111798912e-06,
1924
+ "loss": 2.5853,
1925
+ "step": 10650
1926
+ },
1927
+ {
1928
+ "epoch": 2.7089953788694054,
1929
+ "grad_norm": 1.133244514465332,
1930
+ "learning_rate": 5.402363534046146e-06,
1931
+ "loss": 2.5457,
1932
+ "step": 10700
1933
+ },
1934
+ {
1935
+ "epoch": 2.7216560106349306,
1936
+ "grad_norm": 1.2331452369689941,
1937
+ "learning_rate": 5.1678859501031705e-06,
1938
+ "loss": 2.5576,
1939
+ "step": 10750
1940
+ },
1941
+ {
1942
+ "epoch": 2.7343166424004557,
1943
+ "grad_norm": 1.7164263725280762,
1944
+ "learning_rate": 4.933408366160195e-06,
1945
+ "loss": 2.5471,
1946
+ "step": 10800
1947
+ },
1948
+ {
1949
+ "epoch": 2.7343166424004557,
1950
+ "eval_loss": 2.4340403079986572,
1951
+ "eval_runtime": 40.3849,
1952
+ "eval_samples_per_second": 43.482,
1953
+ "eval_steps_per_second": 43.482,
1954
+ "step": 10800
1955
+ },
1956
+ {
1957
+ "epoch": 2.746977274165981,
1958
+ "grad_norm": 1.3680106401443481,
1959
+ "learning_rate": 4.69893078221722e-06,
1960
+ "loss": 2.5562,
1961
+ "step": 10850
1962
+ },
1963
+ {
1964
+ "epoch": 2.759637905931506,
1965
+ "grad_norm": 1.0978279113769531,
1966
+ "learning_rate": 4.464453198274246e-06,
1967
+ "loss": 2.5185,
1968
+ "step": 10900
1969
+ },
1970
+ {
1971
+ "epoch": 2.772298537697031,
1972
+ "grad_norm": 1.2212647199630737,
1973
+ "learning_rate": 4.2299756143312695e-06,
1974
+ "loss": 2.6371,
1975
+ "step": 10950
1976
+ },
1977
+ {
1978
+ "epoch": 2.784959169462556,
1979
+ "grad_norm": 1.6452165842056274,
1980
+ "learning_rate": 3.995498030388295e-06,
1981
+ "loss": 2.681,
1982
+ "step": 11000
1983
+ },
1984
+ {
1985
+ "epoch": 2.784959169462556,
1986
+ "eval_loss": 2.4337143898010254,
1987
+ "eval_runtime": 40.4235,
1988
+ "eval_samples_per_second": 43.44,
1989
+ "eval_steps_per_second": 43.44,
1990
+ "step": 11000
1991
+ },
1992
+ {
1993
+ "epoch": 2.7976198012280813,
1994
+ "grad_norm": 1.7757978439331055,
1995
+ "learning_rate": 3.7610204464453203e-06,
1996
+ "loss": 2.5746,
1997
+ "step": 11050
1998
+ },
1999
+ {
2000
+ "epoch": 2.8102804329936064,
2001
+ "grad_norm": 1.2373579740524292,
2002
+ "learning_rate": 3.5265428625023455e-06,
2003
+ "loss": 2.5412,
2004
+ "step": 11100
2005
+ },
2006
+ {
2007
+ "epoch": 2.8229410647591315,
2008
+ "grad_norm": 1.1407558917999268,
2009
+ "learning_rate": 3.29206527855937e-06,
2010
+ "loss": 2.5973,
2011
+ "step": 11150
2012
+ },
2013
+ {
2014
+ "epoch": 2.8356016965246567,
2015
+ "grad_norm": 2.399686813354492,
2016
+ "learning_rate": 3.057587694616395e-06,
2017
+ "loss": 2.5566,
2018
+ "step": 11200
2019
+ },
2020
+ {
2021
+ "epoch": 2.8356016965246567,
2022
+ "eval_loss": 2.4338231086730957,
2023
+ "eval_runtime": 40.4877,
2024
+ "eval_samples_per_second": 43.371,
2025
+ "eval_steps_per_second": 43.371,
2026
+ "step": 11200
2027
+ },
2028
+ {
2029
+ "epoch": 2.8482623282901818,
2030
+ "grad_norm": 1.7053141593933105,
2031
+ "learning_rate": 2.8231101106734197e-06,
2032
+ "loss": 2.6224,
2033
+ "step": 11250
2034
+ },
2035
+ {
2036
+ "epoch": 2.860922960055707,
2037
+ "grad_norm": 1.8215903043746948,
2038
+ "learning_rate": 2.5886325267304445e-06,
2039
+ "loss": 2.5108,
2040
+ "step": 11300
2041
+ },
2042
+ {
2043
+ "epoch": 2.873583591821232,
2044
+ "grad_norm": 1.1648200750350952,
2045
+ "learning_rate": 2.3541549427874697e-06,
2046
+ "loss": 2.557,
2047
+ "step": 11350
2048
+ },
2049
+ {
2050
+ "epoch": 2.886244223586757,
2051
+ "grad_norm": 1.5225868225097656,
2052
+ "learning_rate": 2.1196773588444944e-06,
2053
+ "loss": 2.6285,
2054
+ "step": 11400
2055
+ },
2056
+ {
2057
+ "epoch": 2.886244223586757,
2058
+ "eval_loss": 2.4334514141082764,
2059
+ "eval_runtime": 40.3985,
2060
+ "eval_samples_per_second": 43.467,
2061
+ "eval_steps_per_second": 43.467,
2062
+ "step": 11400
2063
+ },
2064
+ {
2065
+ "epoch": 2.8989048553522823,
2066
+ "grad_norm": 1.4937622547149658,
2067
+ "learning_rate": 1.8851997749015194e-06,
2068
+ "loss": 2.481,
2069
+ "step": 11450
2070
+ },
2071
+ {
2072
+ "epoch": 2.911565487117807,
2073
+ "grad_norm": 1.9169902801513672,
2074
+ "learning_rate": 1.6507221909585446e-06,
2075
+ "loss": 2.5412,
2076
+ "step": 11500
2077
+ },
2078
+ {
2079
+ "epoch": 2.9242261188833325,
2080
+ "grad_norm": 1.6611114740371704,
2081
+ "learning_rate": 1.4162446070155693e-06,
2082
+ "loss": 2.5086,
2083
+ "step": 11550
2084
+ },
2085
+ {
2086
+ "epoch": 2.936886750648857,
2087
+ "grad_norm": 1.3464007377624512,
2088
+ "learning_rate": 1.1817670230725943e-06,
2089
+ "loss": 2.6063,
2090
+ "step": 11600
2091
+ },
2092
+ {
2093
+ "epoch": 2.936886750648857,
2094
+ "eval_loss": 2.4329476356506348,
2095
+ "eval_runtime": 40.4334,
2096
+ "eval_samples_per_second": 43.429,
2097
+ "eval_steps_per_second": 43.429,
2098
+ "step": 11600
2099
+ }
2100
+ ],
2101
+ "logging_steps": 50,
2102
+ "max_steps": 11847,
2103
+ "num_input_tokens_seen": 0,
2104
+ "num_train_epochs": 3,
2105
+ "save_steps": 200,
2106
+ "stateful_callbacks": {
2107
+ "TrainerControl": {
2108
+ "args": {
2109
+ "should_epoch_stop": false,
2110
+ "should_evaluate": false,
2111
+ "should_log": false,
2112
+ "should_save": true,
2113
+ "should_training_stop": false
2114
+ },
2115
+ "attributes": {}
2116
+ }
2117
+ },
2118
+ "total_flos": 6118984656617472.0,
2119
+ "train_batch_size": 1,
2120
+ "trial_name": null,
2121
+ "trial_params": null
2122
+ }
checkpoint-11600/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d46b7107c3dfd17ff0fb12b8cdcc256a49e8d4594358d4495904a5fcb069382
3
+ size 5304
checkpoint-11600/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11800/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: gpt2
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-11800/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "gpt2",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": true,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 8,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "c_proj",
27
+ "c_attn"
28
+ ],
29
+ "task_type": "CAUSAL_LM",
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
checkpoint-11800/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cbeee3dfc268cbdbc22f727c37945ebd4275a3ea49512bd0ee049c583c3112b
3
+ size 3253104
checkpoint-11800/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2508e6c71eefe4bf3e6d34287d87494c23ed52ffdc88e1f2b4e1870ad89b4ce2
3
+ size 6548858
checkpoint-11800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7627c0686ccb211fe33205e3cd82a7c42fc538760ecf8cec401a983447f69f8d
3
+ size 14244
checkpoint-11800/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:573e7fd5dc49ed59f25d4becc07e4193e7f82ea7e1f2e73baca290d86a26a80a
3
+ size 988
checkpoint-11800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:779c8896e02a376108997a6c5096158c8cea4a1d22071fa3e06e889ac960dfa8
3
+ size 1064
checkpoint-11800/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-11800/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11800/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
checkpoint-11800/trainer_state.json ADDED
@@ -0,0 +1,2158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 11800,
3
+ "best_metric": 2.432849645614624,
4
+ "best_model_checkpoint": "/kaggle/working/senko_adaptive/checkpoint-11800",
5
+ "epoch": 2.9875292777109577,
6
+ "eval_steps": 200,
7
+ "global_step": 11800,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0126606317655251,
14
+ "grad_norm": 0.4274106025695801,
15
+ "learning_rate": 2.067510548523207e-06,
16
+ "loss": 3.4405,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.0253212635310502,
21
+ "grad_norm": 0.5292551517486572,
22
+ "learning_rate": 4.177215189873418e-06,
23
+ "loss": 3.4567,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.0379818952965753,
28
+ "grad_norm": 0.7541739344596863,
29
+ "learning_rate": 6.28691983122363e-06,
30
+ "loss": 3.4683,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.0506425270621004,
35
+ "grad_norm": 0.8833445906639099,
36
+ "learning_rate": 8.39662447257384e-06,
37
+ "loss": 3.5084,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.0506425270621004,
42
+ "eval_loss": 3.5248923301696777,
43
+ "eval_runtime": 39.9384,
44
+ "eval_samples_per_second": 43.968,
45
+ "eval_steps_per_second": 43.968,
46
+ "step": 200
47
+ },
48
+ {
49
+ "epoch": 0.0633031588276255,
50
+ "grad_norm": 0.9998921155929565,
51
+ "learning_rate": 1.0506329113924052e-05,
52
+ "loss": 3.359,
53
+ "step": 250
54
+ },
55
+ {
56
+ "epoch": 0.0759637905931506,
57
+ "grad_norm": 0.8041885495185852,
58
+ "learning_rate": 1.2616033755274262e-05,
59
+ "loss": 3.351,
60
+ "step": 300
61
+ },
62
+ {
63
+ "epoch": 0.0886244223586757,
64
+ "grad_norm": 0.9213416576385498,
65
+ "learning_rate": 1.4725738396624473e-05,
66
+ "loss": 3.2244,
67
+ "step": 350
68
+ },
69
+ {
70
+ "epoch": 0.1012850541242008,
71
+ "grad_norm": 1.0922213792800903,
72
+ "learning_rate": 1.6835443037974685e-05,
73
+ "loss": 3.1565,
74
+ "step": 400
75
+ },
76
+ {
77
+ "epoch": 0.1012850541242008,
78
+ "eval_loss": 3.151216983795166,
79
+ "eval_runtime": 39.7515,
80
+ "eval_samples_per_second": 44.174,
81
+ "eval_steps_per_second": 44.174,
82
+ "step": 400
83
+ },
84
+ {
85
+ "epoch": 0.1139456858897259,
86
+ "grad_norm": 1.4199283123016357,
87
+ "learning_rate": 1.8945147679324897e-05,
88
+ "loss": 3.0154,
89
+ "step": 450
90
+ },
91
+ {
92
+ "epoch": 0.126606317655251,
93
+ "grad_norm": 1.077143907546997,
94
+ "learning_rate": 2.1054852320675106e-05,
95
+ "loss": 3.0456,
96
+ "step": 500
97
+ },
98
+ {
99
+ "epoch": 0.1392669494207761,
100
+ "grad_norm": 1.5466052293777466,
101
+ "learning_rate": 2.3164556962025318e-05,
102
+ "loss": 2.9099,
103
+ "step": 550
104
+ },
105
+ {
106
+ "epoch": 0.1519275811863012,
107
+ "grad_norm": 1.2139467000961304,
108
+ "learning_rate": 2.5274261603375527e-05,
109
+ "loss": 2.8839,
110
+ "step": 600
111
+ },
112
+ {
113
+ "epoch": 0.1519275811863012,
114
+ "eval_loss": 2.793567657470703,
115
+ "eval_runtime": 40.2573,
116
+ "eval_samples_per_second": 43.619,
117
+ "eval_steps_per_second": 43.619,
118
+ "step": 600
119
+ },
120
+ {
121
+ "epoch": 0.1645882129518263,
122
+ "grad_norm": 1.0270315408706665,
123
+ "learning_rate": 2.738396624472574e-05,
124
+ "loss": 2.8389,
125
+ "step": 650
126
+ },
127
+ {
128
+ "epoch": 0.1772488447173514,
129
+ "grad_norm": 1.5865377187728882,
130
+ "learning_rate": 2.949367088607595e-05,
131
+ "loss": 2.8228,
132
+ "step": 700
133
+ },
134
+ {
135
+ "epoch": 0.18990947648287648,
136
+ "grad_norm": 1.076073408126831,
137
+ "learning_rate": 3.160337552742616e-05,
138
+ "loss": 2.9255,
139
+ "step": 750
140
+ },
141
+ {
142
+ "epoch": 0.2025701082484016,
143
+ "grad_norm": 1.4510694742202759,
144
+ "learning_rate": 3.3713080168776376e-05,
145
+ "loss": 2.8165,
146
+ "step": 800
147
+ },
148
+ {
149
+ "epoch": 0.2025701082484016,
150
+ "eval_loss": 2.667301893234253,
151
+ "eval_runtime": 40.0906,
152
+ "eval_samples_per_second": 43.801,
153
+ "eval_steps_per_second": 43.801,
154
+ "step": 800
155
+ },
156
+ {
157
+ "epoch": 0.2152307400139267,
158
+ "grad_norm": 1.5206592082977295,
159
+ "learning_rate": 3.5822784810126585e-05,
160
+ "loss": 2.8022,
161
+ "step": 850
162
+ },
163
+ {
164
+ "epoch": 0.2278913717794518,
165
+ "grad_norm": 1.173909068107605,
166
+ "learning_rate": 3.7932489451476794e-05,
167
+ "loss": 2.8034,
168
+ "step": 900
169
+ },
170
+ {
171
+ "epoch": 0.24055200354497688,
172
+ "grad_norm": 1.4551103115081787,
173
+ "learning_rate": 4.004219409282701e-05,
174
+ "loss": 2.774,
175
+ "step": 950
176
+ },
177
+ {
178
+ "epoch": 0.253212635310502,
179
+ "grad_norm": 1.509749412536621,
180
+ "learning_rate": 4.215189873417722e-05,
181
+ "loss": 2.7978,
182
+ "step": 1000
183
+ },
184
+ {
185
+ "epoch": 0.253212635310502,
186
+ "eval_loss": 2.603116273880005,
187
+ "eval_runtime": 39.9617,
188
+ "eval_samples_per_second": 43.942,
189
+ "eval_steps_per_second": 43.942,
190
+ "step": 1000
191
+ },
192
+ {
193
+ "epoch": 0.2658732670760271,
194
+ "grad_norm": 1.745764136314392,
195
+ "learning_rate": 4.426160337552743e-05,
196
+ "loss": 2.7249,
197
+ "step": 1050
198
+ },
199
+ {
200
+ "epoch": 0.2785338988415522,
201
+ "grad_norm": 1.6712589263916016,
202
+ "learning_rate": 4.637130801687764e-05,
203
+ "loss": 2.7618,
204
+ "step": 1100
205
+ },
206
+ {
207
+ "epoch": 0.2911945306070773,
208
+ "grad_norm": 2.256267786026001,
209
+ "learning_rate": 4.8481012658227845e-05,
210
+ "loss": 2.7431,
211
+ "step": 1150
212
+ },
213
+ {
214
+ "epoch": 0.3038551623726024,
215
+ "grad_norm": 1.5181586742401123,
216
+ "learning_rate": 4.993434627649597e-05,
217
+ "loss": 2.778,
218
+ "step": 1200
219
+ },
220
+ {
221
+ "epoch": 0.3038551623726024,
222
+ "eval_loss": 2.5704379081726074,
223
+ "eval_runtime": 40.1791,
224
+ "eval_samples_per_second": 43.704,
225
+ "eval_steps_per_second": 43.704,
226
+ "step": 1200
227
+ },
228
+ {
229
+ "epoch": 0.31651579413812747,
230
+ "grad_norm": 1.1885608434677124,
231
+ "learning_rate": 4.969986869255299e-05,
232
+ "loss": 2.7224,
233
+ "step": 1250
234
+ },
235
+ {
236
+ "epoch": 0.3291764259036526,
237
+ "grad_norm": 1.2136404514312744,
238
+ "learning_rate": 4.946539110861002e-05,
239
+ "loss": 2.6823,
240
+ "step": 1300
241
+ },
242
+ {
243
+ "epoch": 0.3418370576691777,
244
+ "grad_norm": 0.8780732750892639,
245
+ "learning_rate": 4.9230913524667046e-05,
246
+ "loss": 2.6961,
247
+ "step": 1350
248
+ },
249
+ {
250
+ "epoch": 0.3544976894347028,
251
+ "grad_norm": 1.0844959020614624,
252
+ "learning_rate": 4.899643594072407e-05,
253
+ "loss": 2.7014,
254
+ "step": 1400
255
+ },
256
+ {
257
+ "epoch": 0.3544976894347028,
258
+ "eval_loss": 2.54587721824646,
259
+ "eval_runtime": 39.8803,
260
+ "eval_samples_per_second": 44.032,
261
+ "eval_steps_per_second": 44.032,
262
+ "step": 1400
263
+ },
264
+ {
265
+ "epoch": 0.3671583212002279,
266
+ "grad_norm": 1.3518335819244385,
267
+ "learning_rate": 4.8761958356781096e-05,
268
+ "loss": 2.6393,
269
+ "step": 1450
270
+ },
271
+ {
272
+ "epoch": 0.37981895296575297,
273
+ "grad_norm": 1.1389687061309814,
274
+ "learning_rate": 4.852748077283812e-05,
275
+ "loss": 2.7002,
276
+ "step": 1500
277
+ },
278
+ {
279
+ "epoch": 0.3924795847312781,
280
+ "grad_norm": 1.6295430660247803,
281
+ "learning_rate": 4.829300318889514e-05,
282
+ "loss": 2.6754,
283
+ "step": 1550
284
+ },
285
+ {
286
+ "epoch": 0.4051402164968032,
287
+ "grad_norm": 1.387499451637268,
288
+ "learning_rate": 4.8058525604952173e-05,
289
+ "loss": 2.6853,
290
+ "step": 1600
291
+ },
292
+ {
293
+ "epoch": 0.4051402164968032,
294
+ "eval_loss": 2.5297553539276123,
295
+ "eval_runtime": 39.722,
296
+ "eval_samples_per_second": 44.207,
297
+ "eval_steps_per_second": 44.207,
298
+ "step": 1600
299
+ },
300
+ {
301
+ "epoch": 0.4178008482623283,
302
+ "grad_norm": 1.014020323753357,
303
+ "learning_rate": 4.7824048021009195e-05,
304
+ "loss": 2.7275,
305
+ "step": 1650
306
+ },
307
+ {
308
+ "epoch": 0.4304614800278534,
309
+ "grad_norm": 1.1505990028381348,
310
+ "learning_rate": 4.7589570437066216e-05,
311
+ "loss": 2.6651,
312
+ "step": 1700
313
+ },
314
+ {
315
+ "epoch": 0.4431221117933785,
316
+ "grad_norm": 1.1389458179473877,
317
+ "learning_rate": 4.7355092853123244e-05,
318
+ "loss": 2.6993,
319
+ "step": 1750
320
+ },
321
+ {
322
+ "epoch": 0.4557827435589036,
323
+ "grad_norm": 1.2159587144851685,
324
+ "learning_rate": 4.7120615269180266e-05,
325
+ "loss": 2.7239,
326
+ "step": 1800
327
+ },
328
+ {
329
+ "epoch": 0.4557827435589036,
330
+ "eval_loss": 2.5201079845428467,
331
+ "eval_runtime": 39.8313,
332
+ "eval_samples_per_second": 44.086,
333
+ "eval_steps_per_second": 44.086,
334
+ "step": 1800
335
+ },
336
+ {
337
+ "epoch": 0.4684433753244287,
338
+ "grad_norm": 1.1873971223831177,
339
+ "learning_rate": 4.6886137685237294e-05,
340
+ "loss": 2.6368,
341
+ "step": 1850
342
+ },
343
+ {
344
+ "epoch": 0.48110400708995377,
345
+ "grad_norm": 1.5109103918075562,
346
+ "learning_rate": 4.665166010129432e-05,
347
+ "loss": 2.6827,
348
+ "step": 1900
349
+ },
350
+ {
351
+ "epoch": 0.4937646388554789,
352
+ "grad_norm": 1.9981125593185425,
353
+ "learning_rate": 4.641718251735134e-05,
354
+ "loss": 2.6513,
355
+ "step": 1950
356
+ },
357
+ {
358
+ "epoch": 0.506425270621004,
359
+ "grad_norm": 1.4879294633865356,
360
+ "learning_rate": 4.6182704933408365e-05,
361
+ "loss": 2.6433,
362
+ "step": 2000
363
+ },
364
+ {
365
+ "epoch": 0.506425270621004,
366
+ "eval_loss": 2.5107176303863525,
367
+ "eval_runtime": 40.0643,
368
+ "eval_samples_per_second": 43.83,
369
+ "eval_steps_per_second": 43.83,
370
+ "step": 2000
371
+ },
372
+ {
373
+ "epoch": 0.5190859023865291,
374
+ "grad_norm": 1.2832767963409424,
375
+ "learning_rate": 4.5952916901144253e-05,
376
+ "loss": 2.6225,
377
+ "step": 2050
378
+ },
379
+ {
380
+ "epoch": 0.5317465341520542,
381
+ "grad_norm": 1.2915899753570557,
382
+ "learning_rate": 4.5718439317201275e-05,
383
+ "loss": 2.6592,
384
+ "step": 2100
385
+ },
386
+ {
387
+ "epoch": 0.5444071659175793,
388
+ "grad_norm": 1.229929804801941,
389
+ "learning_rate": 4.54839617332583e-05,
390
+ "loss": 2.6411,
391
+ "step": 2150
392
+ },
393
+ {
394
+ "epoch": 0.5570677976831044,
395
+ "grad_norm": 1.2569608688354492,
396
+ "learning_rate": 4.524948414931533e-05,
397
+ "loss": 2.6436,
398
+ "step": 2200
399
+ },
400
+ {
401
+ "epoch": 0.5570677976831044,
402
+ "eval_loss": 2.504101514816284,
403
+ "eval_runtime": 39.8694,
404
+ "eval_samples_per_second": 44.044,
405
+ "eval_steps_per_second": 44.044,
406
+ "step": 2200
407
+ },
408
+ {
409
+ "epoch": 0.5697284294486294,
410
+ "grad_norm": 1.3688510656356812,
411
+ "learning_rate": 4.501500656537235e-05,
412
+ "loss": 2.6819,
413
+ "step": 2250
414
+ },
415
+ {
416
+ "epoch": 0.5823890612141546,
417
+ "grad_norm": 1.1405905485153198,
418
+ "learning_rate": 4.4780528981429374e-05,
419
+ "loss": 2.6116,
420
+ "step": 2300
421
+ },
422
+ {
423
+ "epoch": 0.5950496929796797,
424
+ "grad_norm": 1.453338861465454,
425
+ "learning_rate": 4.45460513974864e-05,
426
+ "loss": 2.6154,
427
+ "step": 2350
428
+ },
429
+ {
430
+ "epoch": 0.6077103247452048,
431
+ "grad_norm": 1.0401395559310913,
432
+ "learning_rate": 4.431157381354343e-05,
433
+ "loss": 2.6018,
434
+ "step": 2400
435
+ },
436
+ {
437
+ "epoch": 0.6077103247452048,
438
+ "eval_loss": 2.498344898223877,
439
+ "eval_runtime": 39.9496,
440
+ "eval_samples_per_second": 43.955,
441
+ "eval_steps_per_second": 43.955,
442
+ "step": 2400
443
+ },
444
+ {
445
+ "epoch": 0.6203709565107299,
446
+ "grad_norm": 1.4646718502044678,
447
+ "learning_rate": 4.407709622960045e-05,
448
+ "loss": 2.5734,
449
+ "step": 2450
450
+ },
451
+ {
452
+ "epoch": 0.6330315882762549,
453
+ "grad_norm": 1.3828164339065552,
454
+ "learning_rate": 4.384261864565748e-05,
455
+ "loss": 2.6445,
456
+ "step": 2500
457
+ },
458
+ {
459
+ "epoch": 0.6456922200417801,
460
+ "grad_norm": 2.1768596172332764,
461
+ "learning_rate": 4.36081410617145e-05,
462
+ "loss": 2.6618,
463
+ "step": 2550
464
+ },
465
+ {
466
+ "epoch": 0.6583528518073052,
467
+ "grad_norm": 1.6110296249389648,
468
+ "learning_rate": 4.337366347777152e-05,
469
+ "loss": 2.6509,
470
+ "step": 2600
471
+ },
472
+ {
473
+ "epoch": 0.6583528518073052,
474
+ "eval_loss": 2.4937028884887695,
475
+ "eval_runtime": 39.8698,
476
+ "eval_samples_per_second": 44.043,
477
+ "eval_steps_per_second": 44.043,
478
+ "step": 2600
479
+ },
480
+ {
481
+ "epoch": 0.6710134835728303,
482
+ "grad_norm": 1.2363536357879639,
483
+ "learning_rate": 4.313918589382856e-05,
484
+ "loss": 2.6274,
485
+ "step": 2650
486
+ },
487
+ {
488
+ "epoch": 0.6836741153383554,
489
+ "grad_norm": 2.192110538482666,
490
+ "learning_rate": 4.290470830988558e-05,
491
+ "loss": 2.6932,
492
+ "step": 2700
493
+ },
494
+ {
495
+ "epoch": 0.6963347471038804,
496
+ "grad_norm": 1.2024074792861938,
497
+ "learning_rate": 4.26702307259426e-05,
498
+ "loss": 2.6221,
499
+ "step": 2750
500
+ },
501
+ {
502
+ "epoch": 0.7089953788694056,
503
+ "grad_norm": 1.8665797710418701,
504
+ "learning_rate": 4.243575314199963e-05,
505
+ "loss": 2.6313,
506
+ "step": 2800
507
+ },
508
+ {
509
+ "epoch": 0.7089953788694056,
510
+ "eval_loss": 2.4876773357391357,
511
+ "eval_runtime": 40.026,
512
+ "eval_samples_per_second": 43.871,
513
+ "eval_steps_per_second": 43.871,
514
+ "step": 2800
515
+ },
516
+ {
517
+ "epoch": 0.7216560106349307,
518
+ "grad_norm": 1.4088993072509766,
519
+ "learning_rate": 4.220127555805665e-05,
520
+ "loss": 2.5675,
521
+ "step": 2850
522
+ },
523
+ {
524
+ "epoch": 0.7343166424004558,
525
+ "grad_norm": 1.3225140571594238,
526
+ "learning_rate": 4.196679797411368e-05,
527
+ "loss": 2.56,
528
+ "step": 2900
529
+ },
530
+ {
531
+ "epoch": 0.7469772741659809,
532
+ "grad_norm": 1.3416539430618286,
533
+ "learning_rate": 4.1732320390170706e-05,
534
+ "loss": 2.6517,
535
+ "step": 2950
536
+ },
537
+ {
538
+ "epoch": 0.7596379059315059,
539
+ "grad_norm": 1.079567790031433,
540
+ "learning_rate": 4.149784280622773e-05,
541
+ "loss": 2.698,
542
+ "step": 3000
543
+ },
544
+ {
545
+ "epoch": 0.7596379059315059,
546
+ "eval_loss": 2.4842560291290283,
547
+ "eval_runtime": 39.7988,
548
+ "eval_samples_per_second": 44.122,
549
+ "eval_steps_per_second": 44.122,
550
+ "step": 3000
551
+ },
552
+ {
553
+ "epoch": 0.772298537697031,
554
+ "grad_norm": 1.4532116651535034,
555
+ "learning_rate": 4.126336522228475e-05,
556
+ "loss": 2.6232,
557
+ "step": 3050
558
+ },
559
+ {
560
+ "epoch": 0.7849591694625562,
561
+ "grad_norm": 1.5380038022994995,
562
+ "learning_rate": 4.102888763834178e-05,
563
+ "loss": 2.6212,
564
+ "step": 3100
565
+ },
566
+ {
567
+ "epoch": 0.7976198012280813,
568
+ "grad_norm": 1.3965916633605957,
569
+ "learning_rate": 4.0794410054398805e-05,
570
+ "loss": 2.5804,
571
+ "step": 3150
572
+ },
573
+ {
574
+ "epoch": 0.8102804329936064,
575
+ "grad_norm": 1.4798463582992554,
576
+ "learning_rate": 4.0559932470455826e-05,
577
+ "loss": 2.6724,
578
+ "step": 3200
579
+ },
580
+ {
581
+ "epoch": 0.8102804329936064,
582
+ "eval_loss": 2.480894088745117,
583
+ "eval_runtime": 39.9604,
584
+ "eval_samples_per_second": 43.943,
585
+ "eval_steps_per_second": 43.943,
586
+ "step": 3200
587
+ },
588
+ {
589
+ "epoch": 0.8229410647591315,
590
+ "grad_norm": 1.2598360776901245,
591
+ "learning_rate": 4.0325454886512854e-05,
592
+ "loss": 2.6993,
593
+ "step": 3250
594
+ },
595
+ {
596
+ "epoch": 0.8356016965246565,
597
+ "grad_norm": 1.366295576095581,
598
+ "learning_rate": 4.0090977302569876e-05,
599
+ "loss": 2.551,
600
+ "step": 3300
601
+ },
602
+ {
603
+ "epoch": 0.8482623282901817,
604
+ "grad_norm": 1.1827855110168457,
605
+ "learning_rate": 3.98564997186269e-05,
606
+ "loss": 2.6131,
607
+ "step": 3350
608
+ },
609
+ {
610
+ "epoch": 0.8609229600557068,
611
+ "grad_norm": 1.2728627920150757,
612
+ "learning_rate": 3.9622022134683925e-05,
613
+ "loss": 2.6178,
614
+ "step": 3400
615
+ },
616
+ {
617
+ "epoch": 0.8609229600557068,
618
+ "eval_loss": 2.477010726928711,
619
+ "eval_runtime": 40.2504,
620
+ "eval_samples_per_second": 43.627,
621
+ "eval_steps_per_second": 43.627,
622
+ "step": 3400
623
+ },
624
+ {
625
+ "epoch": 0.8735835918212319,
626
+ "grad_norm": 1.341917634010315,
627
+ "learning_rate": 3.938754455074095e-05,
628
+ "loss": 2.5748,
629
+ "step": 3450
630
+ },
631
+ {
632
+ "epoch": 0.886244223586757,
633
+ "grad_norm": 1.4114609956741333,
634
+ "learning_rate": 3.9153066966797975e-05,
635
+ "loss": 2.667,
636
+ "step": 3500
637
+ },
638
+ {
639
+ "epoch": 0.898904855352282,
640
+ "grad_norm": 1.1211490631103516,
641
+ "learning_rate": 3.8918589382855e-05,
642
+ "loss": 2.5671,
643
+ "step": 3550
644
+ },
645
+ {
646
+ "epoch": 0.9115654871178072,
647
+ "grad_norm": 1.4166322946548462,
648
+ "learning_rate": 3.8684111798912024e-05,
649
+ "loss": 2.5945,
650
+ "step": 3600
651
+ },
652
+ {
653
+ "epoch": 0.9115654871178072,
654
+ "eval_loss": 2.47322940826416,
655
+ "eval_runtime": 40.2079,
656
+ "eval_samples_per_second": 43.673,
657
+ "eval_steps_per_second": 43.673,
658
+ "step": 3600
659
+ },
660
+ {
661
+ "epoch": 0.9242261188833323,
662
+ "grad_norm": 0.9144394993782043,
663
+ "learning_rate": 3.844963421496905e-05,
664
+ "loss": 2.6148,
665
+ "step": 3650
666
+ },
667
+ {
668
+ "epoch": 0.9368867506488574,
669
+ "grad_norm": 1.4106061458587646,
670
+ "learning_rate": 3.821515663102608e-05,
671
+ "loss": 2.6586,
672
+ "step": 3700
673
+ },
674
+ {
675
+ "epoch": 0.9495473824143825,
676
+ "grad_norm": 1.414415717124939,
677
+ "learning_rate": 3.79806790470831e-05,
678
+ "loss": 2.5874,
679
+ "step": 3750
680
+ },
681
+ {
682
+ "epoch": 0.9622080141799075,
683
+ "grad_norm": 1.5448992252349854,
684
+ "learning_rate": 3.774620146314012e-05,
685
+ "loss": 2.6422,
686
+ "step": 3800
687
+ },
688
+ {
689
+ "epoch": 0.9622080141799075,
690
+ "eval_loss": 2.4701173305511475,
691
+ "eval_runtime": 40.1267,
692
+ "eval_samples_per_second": 43.761,
693
+ "eval_steps_per_second": 43.761,
694
+ "step": 3800
695
+ },
696
+ {
697
+ "epoch": 0.9748686459454327,
698
+ "grad_norm": 1.1959314346313477,
699
+ "learning_rate": 3.751172387919715e-05,
700
+ "loss": 2.6975,
701
+ "step": 3850
702
+ },
703
+ {
704
+ "epoch": 0.9875292777109578,
705
+ "grad_norm": 0.9525274038314819,
706
+ "learning_rate": 3.727724629525417e-05,
707
+ "loss": 2.6675,
708
+ "step": 3900
709
+ },
710
+ {
711
+ "epoch": 1.0,
712
+ "grad_norm": 4.733253479003906,
713
+ "learning_rate": 3.70427687113112e-05,
714
+ "loss": 2.566,
715
+ "step": 3950
716
+ },
717
+ {
718
+ "epoch": 1.0126606317655251,
719
+ "grad_norm": 1.2803192138671875,
720
+ "learning_rate": 3.680829112736823e-05,
721
+ "loss": 2.5659,
722
+ "step": 4000
723
+ },
724
+ {
725
+ "epoch": 1.0126606317655251,
726
+ "eval_loss": 2.4702188968658447,
727
+ "eval_runtime": 40.1387,
728
+ "eval_samples_per_second": 43.748,
729
+ "eval_steps_per_second": 43.748,
730
+ "step": 4000
731
+ },
732
+ {
733
+ "epoch": 1.0253212635310502,
734
+ "grad_norm": 1.446990966796875,
735
+ "learning_rate": 3.657381354342525e-05,
736
+ "loss": 2.627,
737
+ "step": 4050
738
+ },
739
+ {
740
+ "epoch": 1.0379818952965754,
741
+ "grad_norm": 1.3563008308410645,
742
+ "learning_rate": 3.633933595948227e-05,
743
+ "loss": 2.6252,
744
+ "step": 4100
745
+ },
746
+ {
747
+ "epoch": 1.0506425270621005,
748
+ "grad_norm": 1.5763463973999023,
749
+ "learning_rate": 3.61048583755393e-05,
750
+ "loss": 2.6593,
751
+ "step": 4150
752
+ },
753
+ {
754
+ "epoch": 1.0633031588276256,
755
+ "grad_norm": 1.0055335760116577,
756
+ "learning_rate": 3.587038079159633e-05,
757
+ "loss": 2.5955,
758
+ "step": 4200
759
+ },
760
+ {
761
+ "epoch": 1.0633031588276256,
762
+ "eval_loss": 2.4676930904388428,
763
+ "eval_runtime": 40.0342,
764
+ "eval_samples_per_second": 43.863,
765
+ "eval_steps_per_second": 43.863,
766
+ "step": 4200
767
+ },
768
+ {
769
+ "epoch": 1.0759637905931505,
770
+ "grad_norm": 1.7013343572616577,
771
+ "learning_rate": 3.563590320765335e-05,
772
+ "loss": 2.59,
773
+ "step": 4250
774
+ },
775
+ {
776
+ "epoch": 1.0886244223586756,
777
+ "grad_norm": 1.541069507598877,
778
+ "learning_rate": 3.540142562371038e-05,
779
+ "loss": 2.6192,
780
+ "step": 4300
781
+ },
782
+ {
783
+ "epoch": 1.1012850541242007,
784
+ "grad_norm": 1.2536805868148804,
785
+ "learning_rate": 3.51669480397674e-05,
786
+ "loss": 2.6225,
787
+ "step": 4350
788
+ },
789
+ {
790
+ "epoch": 1.1139456858897259,
791
+ "grad_norm": 1.8328826427459717,
792
+ "learning_rate": 3.493247045582442e-05,
793
+ "loss": 2.6022,
794
+ "step": 4400
795
+ },
796
+ {
797
+ "epoch": 1.1139456858897259,
798
+ "eval_loss": 2.465629816055298,
799
+ "eval_runtime": 39.8532,
800
+ "eval_samples_per_second": 44.062,
801
+ "eval_steps_per_second": 44.062,
802
+ "step": 4400
803
+ },
804
+ {
805
+ "epoch": 1.126606317655251,
806
+ "grad_norm": 1.8557270765304565,
807
+ "learning_rate": 3.469799287188145e-05,
808
+ "loss": 2.6496,
809
+ "step": 4450
810
+ },
811
+ {
812
+ "epoch": 1.139266949420776,
813
+ "grad_norm": 1.3255618810653687,
814
+ "learning_rate": 3.446820483961734e-05,
815
+ "loss": 2.5315,
816
+ "step": 4500
817
+ },
818
+ {
819
+ "epoch": 1.1519275811863012,
820
+ "grad_norm": 1.2192399501800537,
821
+ "learning_rate": 3.423372725567436e-05,
822
+ "loss": 2.5409,
823
+ "step": 4550
824
+ },
825
+ {
826
+ "epoch": 1.1645882129518264,
827
+ "grad_norm": 1.2533234357833862,
828
+ "learning_rate": 3.399924967173139e-05,
829
+ "loss": 2.6457,
830
+ "step": 4600
831
+ },
832
+ {
833
+ "epoch": 1.1645882129518264,
834
+ "eval_loss": 2.462162733078003,
835
+ "eval_runtime": 40.0542,
836
+ "eval_samples_per_second": 43.841,
837
+ "eval_steps_per_second": 43.841,
838
+ "step": 4600
839
+ },
840
+ {
841
+ "epoch": 1.1772488447173515,
842
+ "grad_norm": 1.8414678573608398,
843
+ "learning_rate": 3.376477208778841e-05,
844
+ "loss": 2.5658,
845
+ "step": 4650
846
+ },
847
+ {
848
+ "epoch": 1.1899094764828764,
849
+ "grad_norm": 1.568259596824646,
850
+ "learning_rate": 3.3530294503845436e-05,
851
+ "loss": 2.5771,
852
+ "step": 4700
853
+ },
854
+ {
855
+ "epoch": 1.2025701082484015,
856
+ "grad_norm": 1.3547483682632446,
857
+ "learning_rate": 3.3295816919902464e-05,
858
+ "loss": 2.6525,
859
+ "step": 4750
860
+ },
861
+ {
862
+ "epoch": 1.2152307400139266,
863
+ "grad_norm": 1.1655386686325073,
864
+ "learning_rate": 3.3061339335959486e-05,
865
+ "loss": 2.6421,
866
+ "step": 4800
867
+ },
868
+ {
869
+ "epoch": 1.2152307400139266,
870
+ "eval_loss": 2.461489200592041,
871
+ "eval_runtime": 39.9962,
872
+ "eval_samples_per_second": 43.904,
873
+ "eval_steps_per_second": 43.904,
874
+ "step": 4800
875
+ },
876
+ {
877
+ "epoch": 1.2278913717794517,
878
+ "grad_norm": 1.798033595085144,
879
+ "learning_rate": 3.282686175201651e-05,
880
+ "loss": 2.6091,
881
+ "step": 4850
882
+ },
883
+ {
884
+ "epoch": 1.2405520035449769,
885
+ "grad_norm": 3.2964117527008057,
886
+ "learning_rate": 3.2592384168073535e-05,
887
+ "loss": 2.5997,
888
+ "step": 4900
889
+ },
890
+ {
891
+ "epoch": 1.253212635310502,
892
+ "grad_norm": 1.0457675457000732,
893
+ "learning_rate": 3.2357906584130557e-05,
894
+ "loss": 2.6144,
895
+ "step": 4950
896
+ },
897
+ {
898
+ "epoch": 1.265873267076027,
899
+ "grad_norm": 0.9728056192398071,
900
+ "learning_rate": 3.2123429000187585e-05,
901
+ "loss": 2.5712,
902
+ "step": 5000
903
+ },
904
+ {
905
+ "epoch": 1.265873267076027,
906
+ "eval_loss": 2.460186719894409,
907
+ "eval_runtime": 39.8386,
908
+ "eval_samples_per_second": 44.078,
909
+ "eval_steps_per_second": 44.078,
910
+ "step": 5000
911
+ },
912
+ {
913
+ "epoch": 1.2785338988415522,
914
+ "grad_norm": 1.2350194454193115,
915
+ "learning_rate": 3.188895141624461e-05,
916
+ "loss": 2.5448,
917
+ "step": 5050
918
+ },
919
+ {
920
+ "epoch": 1.2911945306070773,
921
+ "grad_norm": 1.4210622310638428,
922
+ "learning_rate": 3.1654473832301634e-05,
923
+ "loss": 2.6031,
924
+ "step": 5100
925
+ },
926
+ {
927
+ "epoch": 1.3038551623726025,
928
+ "grad_norm": 2.226473093032837,
929
+ "learning_rate": 3.1419996248358656e-05,
930
+ "loss": 2.6597,
931
+ "step": 5150
932
+ },
933
+ {
934
+ "epoch": 1.3165157941381276,
935
+ "grad_norm": 2.4525105953216553,
936
+ "learning_rate": 3.1185518664415684e-05,
937
+ "loss": 2.596,
938
+ "step": 5200
939
+ },
940
+ {
941
+ "epoch": 1.3165157941381276,
942
+ "eval_loss": 2.454537868499756,
943
+ "eval_runtime": 39.805,
944
+ "eval_samples_per_second": 44.115,
945
+ "eval_steps_per_second": 44.115,
946
+ "step": 5200
947
+ },
948
+ {
949
+ "epoch": 1.3291764259036527,
950
+ "grad_norm": 1.265309453010559,
951
+ "learning_rate": 3.095104108047271e-05,
952
+ "loss": 2.5559,
953
+ "step": 5250
954
+ },
955
+ {
956
+ "epoch": 1.3418370576691778,
957
+ "grad_norm": 2.1364307403564453,
958
+ "learning_rate": 3.071656349652973e-05,
959
+ "loss": 2.5859,
960
+ "step": 5300
961
+ },
962
+ {
963
+ "epoch": 1.3544976894347027,
964
+ "grad_norm": 1.5945920944213867,
965
+ "learning_rate": 3.048208591258676e-05,
966
+ "loss": 2.5778,
967
+ "step": 5350
968
+ },
969
+ {
970
+ "epoch": 1.3671583212002278,
971
+ "grad_norm": 1.2479759454727173,
972
+ "learning_rate": 3.0247608328643783e-05,
973
+ "loss": 2.6846,
974
+ "step": 5400
975
+ },
976
+ {
977
+ "epoch": 1.3671583212002278,
978
+ "eval_loss": 2.4547293186187744,
979
+ "eval_runtime": 39.7806,
980
+ "eval_samples_per_second": 44.142,
981
+ "eval_steps_per_second": 44.142,
982
+ "step": 5400
983
+ },
984
+ {
985
+ "epoch": 1.379818952965753,
986
+ "grad_norm": 1.4845050573349,
987
+ "learning_rate": 3.0013130744700808e-05,
988
+ "loss": 2.5661,
989
+ "step": 5450
990
+ },
991
+ {
992
+ "epoch": 1.392479584731278,
993
+ "grad_norm": 1.5581985712051392,
994
+ "learning_rate": 2.9778653160757836e-05,
995
+ "loss": 2.5441,
996
+ "step": 5500
997
+ },
998
+ {
999
+ "epoch": 1.4051402164968032,
1000
+ "grad_norm": 3.1663737297058105,
1001
+ "learning_rate": 2.9544175576814857e-05,
1002
+ "loss": 2.5044,
1003
+ "step": 5550
1004
+ },
1005
+ {
1006
+ "epoch": 1.4178008482623283,
1007
+ "grad_norm": 1.2454484701156616,
1008
+ "learning_rate": 2.9309697992871882e-05,
1009
+ "loss": 2.5747,
1010
+ "step": 5600
1011
+ },
1012
+ {
1013
+ "epoch": 1.4178008482623283,
1014
+ "eval_loss": 2.4544529914855957,
1015
+ "eval_runtime": 39.9287,
1016
+ "eval_samples_per_second": 43.978,
1017
+ "eval_steps_per_second": 43.978,
1018
+ "step": 5600
1019
+ },
1020
+ {
1021
+ "epoch": 1.4304614800278534,
1022
+ "grad_norm": 1.662784457206726,
1023
+ "learning_rate": 2.907522040892891e-05,
1024
+ "loss": 2.6064,
1025
+ "step": 5650
1026
+ },
1027
+ {
1028
+ "epoch": 1.4431221117933786,
1029
+ "grad_norm": 1.618458867073059,
1030
+ "learning_rate": 2.8840742824985935e-05,
1031
+ "loss": 2.5191,
1032
+ "step": 5700
1033
+ },
1034
+ {
1035
+ "epoch": 1.4557827435589035,
1036
+ "grad_norm": 1.3003348112106323,
1037
+ "learning_rate": 2.8606265241042956e-05,
1038
+ "loss": 2.5339,
1039
+ "step": 5750
1040
+ },
1041
+ {
1042
+ "epoch": 1.4684433753244286,
1043
+ "grad_norm": 1.1443992853164673,
1044
+ "learning_rate": 2.8371787657099984e-05,
1045
+ "loss": 2.5914,
1046
+ "step": 5800
1047
+ },
1048
+ {
1049
+ "epoch": 1.4684433753244286,
1050
+ "eval_loss": 2.453752279281616,
1051
+ "eval_runtime": 39.8234,
1052
+ "eval_samples_per_second": 44.095,
1053
+ "eval_steps_per_second": 44.095,
1054
+ "step": 5800
1055
+ },
1056
+ {
1057
+ "epoch": 1.4811040070899537,
1058
+ "grad_norm": 1.2574009895324707,
1059
+ "learning_rate": 2.813731007315701e-05,
1060
+ "loss": 2.6109,
1061
+ "step": 5850
1062
+ },
1063
+ {
1064
+ "epoch": 1.4937646388554788,
1065
+ "grad_norm": 1.002815842628479,
1066
+ "learning_rate": 2.790283248921403e-05,
1067
+ "loss": 2.6075,
1068
+ "step": 5900
1069
+ },
1070
+ {
1071
+ "epoch": 1.506425270621004,
1072
+ "grad_norm": 1.306024432182312,
1073
+ "learning_rate": 2.766835490527106e-05,
1074
+ "loss": 2.5733,
1075
+ "step": 5950
1076
+ },
1077
+ {
1078
+ "epoch": 1.519085902386529,
1079
+ "grad_norm": 2.5023701190948486,
1080
+ "learning_rate": 2.7433877321328083e-05,
1081
+ "loss": 2.6274,
1082
+ "step": 6000
1083
+ },
1084
+ {
1085
+ "epoch": 1.519085902386529,
1086
+ "eval_loss": 2.449617862701416,
1087
+ "eval_runtime": 39.9401,
1088
+ "eval_samples_per_second": 43.966,
1089
+ "eval_steps_per_second": 43.966,
1090
+ "step": 6000
1091
+ },
1092
+ {
1093
+ "epoch": 1.5317465341520542,
1094
+ "grad_norm": 1.9410326480865479,
1095
+ "learning_rate": 2.7199399737385105e-05,
1096
+ "loss": 2.5312,
1097
+ "step": 6050
1098
+ },
1099
+ {
1100
+ "epoch": 1.5444071659175793,
1101
+ "grad_norm": 1.9793561697006226,
1102
+ "learning_rate": 2.6964922153442136e-05,
1103
+ "loss": 2.5759,
1104
+ "step": 6100
1105
+ },
1106
+ {
1107
+ "epoch": 1.5570677976831044,
1108
+ "grad_norm": 1.290531873703003,
1109
+ "learning_rate": 2.6730444569499157e-05,
1110
+ "loss": 2.5817,
1111
+ "step": 6150
1112
+ },
1113
+ {
1114
+ "epoch": 1.5697284294486296,
1115
+ "grad_norm": 2.11389422416687,
1116
+ "learning_rate": 2.6495966985556182e-05,
1117
+ "loss": 2.6287,
1118
+ "step": 6200
1119
+ },
1120
+ {
1121
+ "epoch": 1.5697284294486296,
1122
+ "eval_loss": 2.4490554332733154,
1123
+ "eval_runtime": 39.7474,
1124
+ "eval_samples_per_second": 44.179,
1125
+ "eval_steps_per_second": 44.179,
1126
+ "step": 6200
1127
+ },
1128
+ {
1129
+ "epoch": 1.5823890612141547,
1130
+ "grad_norm": 1.6492938995361328,
1131
+ "learning_rate": 2.626148940161321e-05,
1132
+ "loss": 2.631,
1133
+ "step": 6250
1134
+ },
1135
+ {
1136
+ "epoch": 1.5950496929796798,
1137
+ "grad_norm": 1.3233673572540283,
1138
+ "learning_rate": 2.6027011817670232e-05,
1139
+ "loss": 2.5654,
1140
+ "step": 6300
1141
+ },
1142
+ {
1143
+ "epoch": 1.607710324745205,
1144
+ "grad_norm": 1.688264012336731,
1145
+ "learning_rate": 2.5792534233727257e-05,
1146
+ "loss": 2.6096,
1147
+ "step": 6350
1148
+ },
1149
+ {
1150
+ "epoch": 1.62037095651073,
1151
+ "grad_norm": 2.064823865890503,
1152
+ "learning_rate": 2.5558056649784285e-05,
1153
+ "loss": 2.6275,
1154
+ "step": 6400
1155
+ },
1156
+ {
1157
+ "epoch": 1.62037095651073,
1158
+ "eval_loss": 2.4488983154296875,
1159
+ "eval_runtime": 39.6847,
1160
+ "eval_samples_per_second": 44.249,
1161
+ "eval_steps_per_second": 44.249,
1162
+ "step": 6400
1163
+ },
1164
+ {
1165
+ "epoch": 1.633031588276255,
1166
+ "grad_norm": 1.5599696636199951,
1167
+ "learning_rate": 2.5323579065841306e-05,
1168
+ "loss": 2.6334,
1169
+ "step": 6450
1170
+ },
1171
+ {
1172
+ "epoch": 1.64569222004178,
1173
+ "grad_norm": 1.3142633438110352,
1174
+ "learning_rate": 2.508910148189833e-05,
1175
+ "loss": 2.5496,
1176
+ "step": 6500
1177
+ },
1178
+ {
1179
+ "epoch": 1.6583528518073052,
1180
+ "grad_norm": 1.474135160446167,
1181
+ "learning_rate": 2.4854623897955356e-05,
1182
+ "loss": 2.5628,
1183
+ "step": 6550
1184
+ },
1185
+ {
1186
+ "epoch": 1.6710134835728303,
1187
+ "grad_norm": 1.3737610578536987,
1188
+ "learning_rate": 2.4620146314012384e-05,
1189
+ "loss": 2.5345,
1190
+ "step": 6600
1191
+ },
1192
+ {
1193
+ "epoch": 1.6710134835728303,
1194
+ "eval_loss": 2.447559356689453,
1195
+ "eval_runtime": 39.7021,
1196
+ "eval_samples_per_second": 44.229,
1197
+ "eval_steps_per_second": 44.229,
1198
+ "step": 6600
1199
+ },
1200
+ {
1201
+ "epoch": 1.6836741153383554,
1202
+ "grad_norm": 1.2432060241699219,
1203
+ "learning_rate": 2.4385668730069405e-05,
1204
+ "loss": 2.5977,
1205
+ "step": 6650
1206
+ },
1207
+ {
1208
+ "epoch": 1.6963347471038803,
1209
+ "grad_norm": 1.465063452720642,
1210
+ "learning_rate": 2.415119114612643e-05,
1211
+ "loss": 2.6118,
1212
+ "step": 6700
1213
+ },
1214
+ {
1215
+ "epoch": 1.7089953788694054,
1216
+ "grad_norm": 1.5186200141906738,
1217
+ "learning_rate": 2.3916713562183458e-05,
1218
+ "loss": 2.6126,
1219
+ "step": 6750
1220
+ },
1221
+ {
1222
+ "epoch": 1.7216560106349306,
1223
+ "grad_norm": 1.6869078874588013,
1224
+ "learning_rate": 2.368223597824048e-05,
1225
+ "loss": 2.576,
1226
+ "step": 6800
1227
+ },
1228
+ {
1229
+ "epoch": 1.7216560106349306,
1230
+ "eval_loss": 2.4459502696990967,
1231
+ "eval_runtime": 39.7653,
1232
+ "eval_samples_per_second": 44.159,
1233
+ "eval_steps_per_second": 44.159,
1234
+ "step": 6800
1235
+ },
1236
+ {
1237
+ "epoch": 1.7343166424004557,
1238
+ "grad_norm": 1.2578104734420776,
1239
+ "learning_rate": 2.3447758394297507e-05,
1240
+ "loss": 2.6178,
1241
+ "step": 6850
1242
+ },
1243
+ {
1244
+ "epoch": 1.7469772741659808,
1245
+ "grad_norm": 1.7597213983535767,
1246
+ "learning_rate": 2.3213280810354532e-05,
1247
+ "loss": 2.6358,
1248
+ "step": 6900
1249
+ },
1250
+ {
1251
+ "epoch": 1.759637905931506,
1252
+ "grad_norm": 2.144465923309326,
1253
+ "learning_rate": 2.2978803226411554e-05,
1254
+ "loss": 2.5597,
1255
+ "step": 6950
1256
+ },
1257
+ {
1258
+ "epoch": 1.772298537697031,
1259
+ "grad_norm": 1.1808464527130127,
1260
+ "learning_rate": 2.2744325642468582e-05,
1261
+ "loss": 2.6269,
1262
+ "step": 7000
1263
+ },
1264
+ {
1265
+ "epoch": 1.772298537697031,
1266
+ "eval_loss": 2.4444611072540283,
1267
+ "eval_runtime": 40.0709,
1268
+ "eval_samples_per_second": 43.822,
1269
+ "eval_steps_per_second": 43.822,
1270
+ "step": 7000
1271
+ },
1272
+ {
1273
+ "epoch": 1.7849591694625562,
1274
+ "grad_norm": 1.4550806283950806,
1275
+ "learning_rate": 2.2509848058525606e-05,
1276
+ "loss": 2.6206,
1277
+ "step": 7050
1278
+ },
1279
+ {
1280
+ "epoch": 1.7976198012280813,
1281
+ "grad_norm": 1.2635902166366577,
1282
+ "learning_rate": 2.227537047458263e-05,
1283
+ "loss": 2.5722,
1284
+ "step": 7100
1285
+ },
1286
+ {
1287
+ "epoch": 1.8102804329936064,
1288
+ "grad_norm": 1.3835856914520264,
1289
+ "learning_rate": 2.2040892890639656e-05,
1290
+ "loss": 2.535,
1291
+ "step": 7150
1292
+ },
1293
+ {
1294
+ "epoch": 1.8229410647591315,
1295
+ "grad_norm": 1.735004186630249,
1296
+ "learning_rate": 2.180641530669668e-05,
1297
+ "loss": 2.6086,
1298
+ "step": 7200
1299
+ },
1300
+ {
1301
+ "epoch": 1.8229410647591315,
1302
+ "eval_loss": 2.443899154663086,
1303
+ "eval_runtime": 40.91,
1304
+ "eval_samples_per_second": 42.924,
1305
+ "eval_steps_per_second": 42.924,
1306
+ "step": 7200
1307
+ },
1308
+ {
1309
+ "epoch": 1.8356016965246567,
1310
+ "grad_norm": 1.263051986694336,
1311
+ "learning_rate": 2.1571937722753706e-05,
1312
+ "loss": 2.5544,
1313
+ "step": 7250
1314
+ },
1315
+ {
1316
+ "epoch": 1.8482623282901818,
1317
+ "grad_norm": 1.0899442434310913,
1318
+ "learning_rate": 2.133746013881073e-05,
1319
+ "loss": 2.5603,
1320
+ "step": 7300
1321
+ },
1322
+ {
1323
+ "epoch": 1.860922960055707,
1324
+ "grad_norm": 3.038811206817627,
1325
+ "learning_rate": 2.1102982554867755e-05,
1326
+ "loss": 2.5688,
1327
+ "step": 7350
1328
+ },
1329
+ {
1330
+ "epoch": 1.873583591821232,
1331
+ "grad_norm": 1.6385984420776367,
1332
+ "learning_rate": 2.086850497092478e-05,
1333
+ "loss": 2.6006,
1334
+ "step": 7400
1335
+ },
1336
+ {
1337
+ "epoch": 1.873583591821232,
1338
+ "eval_loss": 2.443300724029541,
1339
+ "eval_runtime": 40.6649,
1340
+ "eval_samples_per_second": 43.182,
1341
+ "eval_steps_per_second": 43.182,
1342
+ "step": 7400
1343
+ },
1344
+ {
1345
+ "epoch": 1.8862442235867571,
1346
+ "grad_norm": 1.2857129573822021,
1347
+ "learning_rate": 2.0634027386981805e-05,
1348
+ "loss": 2.5563,
1349
+ "step": 7450
1350
+ },
1351
+ {
1352
+ "epoch": 1.898904855352282,
1353
+ "grad_norm": 1.0289497375488281,
1354
+ "learning_rate": 2.0399549803038833e-05,
1355
+ "loss": 2.5671,
1356
+ "step": 7500
1357
+ },
1358
+ {
1359
+ "epoch": 1.9115654871178072,
1360
+ "grad_norm": 1.5041025876998901,
1361
+ "learning_rate": 2.0165072219095854e-05,
1362
+ "loss": 2.5689,
1363
+ "step": 7550
1364
+ },
1365
+ {
1366
+ "epoch": 1.9242261188833323,
1367
+ "grad_norm": 1.6611964702606201,
1368
+ "learning_rate": 1.993528418683174e-05,
1369
+ "loss": 2.5801,
1370
+ "step": 7600
1371
+ },
1372
+ {
1373
+ "epoch": 1.9242261188833323,
1374
+ "eval_loss": 2.443532943725586,
1375
+ "eval_runtime": 39.931,
1376
+ "eval_samples_per_second": 43.976,
1377
+ "eval_steps_per_second": 43.976,
1378
+ "step": 7600
1379
+ },
1380
+ {
1381
+ "epoch": 1.9368867506488574,
1382
+ "grad_norm": 1.521170735359192,
1383
+ "learning_rate": 1.9700806602888767e-05,
1384
+ "loss": 2.5969,
1385
+ "step": 7650
1386
+ },
1387
+ {
1388
+ "epoch": 1.9495473824143825,
1389
+ "grad_norm": 1.3700034618377686,
1390
+ "learning_rate": 1.946632901894579e-05,
1391
+ "loss": 2.6306,
1392
+ "step": 7700
1393
+ },
1394
+ {
1395
+ "epoch": 1.9622080141799074,
1396
+ "grad_norm": 2.311443328857422,
1397
+ "learning_rate": 1.9231851435002814e-05,
1398
+ "loss": 2.5608,
1399
+ "step": 7750
1400
+ },
1401
+ {
1402
+ "epoch": 1.9748686459454325,
1403
+ "grad_norm": 1.6699820756912231,
1404
+ "learning_rate": 1.8997373851059842e-05,
1405
+ "loss": 2.5113,
1406
+ "step": 7800
1407
+ },
1408
+ {
1409
+ "epoch": 1.9748686459454325,
1410
+ "eval_loss": 2.4421675205230713,
1411
+ "eval_runtime": 40.1783,
1412
+ "eval_samples_per_second": 43.705,
1413
+ "eval_steps_per_second": 43.705,
1414
+ "step": 7800
1415
+ },
1416
+ {
1417
+ "epoch": 1.9875292777109577,
1418
+ "grad_norm": 1.2560683488845825,
1419
+ "learning_rate": 1.8762896267116863e-05,
1420
+ "loss": 2.545,
1421
+ "step": 7850
1422
+ },
1423
+ {
1424
+ "epoch": 2.0,
1425
+ "grad_norm": 2.176563262939453,
1426
+ "learning_rate": 1.852841868317389e-05,
1427
+ "loss": 2.5752,
1428
+ "step": 7900
1429
+ },
1430
+ {
1431
+ "epoch": 2.012660631765525,
1432
+ "grad_norm": 1.2551178932189941,
1433
+ "learning_rate": 1.8293941099230916e-05,
1434
+ "loss": 2.5215,
1435
+ "step": 7950
1436
+ },
1437
+ {
1438
+ "epoch": 2.0253212635310502,
1439
+ "grad_norm": 1.5646872520446777,
1440
+ "learning_rate": 1.8059463515287937e-05,
1441
+ "loss": 2.5838,
1442
+ "step": 8000
1443
+ },
1444
+ {
1445
+ "epoch": 2.0253212635310502,
1446
+ "eval_loss": 2.441195249557495,
1447
+ "eval_runtime": 39.701,
1448
+ "eval_samples_per_second": 44.231,
1449
+ "eval_steps_per_second": 44.231,
1450
+ "step": 8000
1451
+ },
1452
+ {
1453
+ "epoch": 2.0379818952965754,
1454
+ "grad_norm": 1.4227900505065918,
1455
+ "learning_rate": 1.7824985931344966e-05,
1456
+ "loss": 2.5597,
1457
+ "step": 8050
1458
+ },
1459
+ {
1460
+ "epoch": 2.0506425270621005,
1461
+ "grad_norm": 1.3013832569122314,
1462
+ "learning_rate": 1.759050834740199e-05,
1463
+ "loss": 2.7641,
1464
+ "step": 8100
1465
+ },
1466
+ {
1467
+ "epoch": 2.0633031588276256,
1468
+ "grad_norm": 1.1282143592834473,
1469
+ "learning_rate": 1.7356030763459015e-05,
1470
+ "loss": 2.5875,
1471
+ "step": 8150
1472
+ },
1473
+ {
1474
+ "epoch": 2.0759637905931507,
1475
+ "grad_norm": 2.079760789871216,
1476
+ "learning_rate": 1.712155317951604e-05,
1477
+ "loss": 2.4861,
1478
+ "step": 8200
1479
+ },
1480
+ {
1481
+ "epoch": 2.0759637905931507,
1482
+ "eval_loss": 2.440812826156616,
1483
+ "eval_runtime": 40.0764,
1484
+ "eval_samples_per_second": 43.816,
1485
+ "eval_steps_per_second": 43.816,
1486
+ "step": 8200
1487
+ },
1488
+ {
1489
+ "epoch": 2.088624422358676,
1490
+ "grad_norm": 1.0884991884231567,
1491
+ "learning_rate": 1.6887075595573065e-05,
1492
+ "loss": 2.5941,
1493
+ "step": 8250
1494
+ },
1495
+ {
1496
+ "epoch": 2.101285054124201,
1497
+ "grad_norm": 1.9202015399932861,
1498
+ "learning_rate": 1.665259801163009e-05,
1499
+ "loss": 2.5929,
1500
+ "step": 8300
1501
+ },
1502
+ {
1503
+ "epoch": 2.113945685889726,
1504
+ "grad_norm": 1.5925830602645874,
1505
+ "learning_rate": 1.6418120427687114e-05,
1506
+ "loss": 2.5046,
1507
+ "step": 8350
1508
+ },
1509
+ {
1510
+ "epoch": 2.126606317655251,
1511
+ "grad_norm": 1.5219184160232544,
1512
+ "learning_rate": 1.618364284374414e-05,
1513
+ "loss": 2.5628,
1514
+ "step": 8400
1515
+ },
1516
+ {
1517
+ "epoch": 2.126606317655251,
1518
+ "eval_loss": 2.4396440982818604,
1519
+ "eval_runtime": 40.0053,
1520
+ "eval_samples_per_second": 43.894,
1521
+ "eval_steps_per_second": 43.894,
1522
+ "step": 8400
1523
+ },
1524
+ {
1525
+ "epoch": 2.139266949420776,
1526
+ "grad_norm": 1.4882445335388184,
1527
+ "learning_rate": 1.5949165259801164e-05,
1528
+ "loss": 2.6268,
1529
+ "step": 8450
1530
+ },
1531
+ {
1532
+ "epoch": 2.151927581186301,
1533
+ "grad_norm": 1.3513301610946655,
1534
+ "learning_rate": 1.571468767585819e-05,
1535
+ "loss": 2.5277,
1536
+ "step": 8500
1537
+ },
1538
+ {
1539
+ "epoch": 2.164588212951826,
1540
+ "grad_norm": 1.690974473953247,
1541
+ "learning_rate": 1.5480210091915216e-05,
1542
+ "loss": 2.5631,
1543
+ "step": 8550
1544
+ },
1545
+ {
1546
+ "epoch": 2.1772488447173513,
1547
+ "grad_norm": 1.5311528444290161,
1548
+ "learning_rate": 1.5245732507972238e-05,
1549
+ "loss": 2.5454,
1550
+ "step": 8600
1551
+ },
1552
+ {
1553
+ "epoch": 2.1772488447173513,
1554
+ "eval_loss": 2.4388718605041504,
1555
+ "eval_runtime": 40.0289,
1556
+ "eval_samples_per_second": 43.868,
1557
+ "eval_steps_per_second": 43.868,
1558
+ "step": 8600
1559
+ },
1560
+ {
1561
+ "epoch": 2.1899094764828764,
1562
+ "grad_norm": 2.1171281337738037,
1563
+ "learning_rate": 1.5011254924029264e-05,
1564
+ "loss": 2.6112,
1565
+ "step": 8650
1566
+ },
1567
+ {
1568
+ "epoch": 2.2025701082484015,
1569
+ "grad_norm": 1.9706814289093018,
1570
+ "learning_rate": 1.4776777340086289e-05,
1571
+ "loss": 2.588,
1572
+ "step": 8700
1573
+ },
1574
+ {
1575
+ "epoch": 2.2152307400139266,
1576
+ "grad_norm": 1.8991297483444214,
1577
+ "learning_rate": 1.4542299756143312e-05,
1578
+ "loss": 2.5655,
1579
+ "step": 8750
1580
+ },
1581
+ {
1582
+ "epoch": 2.2278913717794517,
1583
+ "grad_norm": 1.5568820238113403,
1584
+ "learning_rate": 1.4307822172200339e-05,
1585
+ "loss": 2.5312,
1586
+ "step": 8800
1587
+ },
1588
+ {
1589
+ "epoch": 2.2278913717794517,
1590
+ "eval_loss": 2.438715696334839,
1591
+ "eval_runtime": 40.1748,
1592
+ "eval_samples_per_second": 43.709,
1593
+ "eval_steps_per_second": 43.709,
1594
+ "step": 8800
1595
+ },
1596
+ {
1597
+ "epoch": 2.240552003544977,
1598
+ "grad_norm": 1.277051329612732,
1599
+ "learning_rate": 1.4073344588257365e-05,
1600
+ "loss": 2.5818,
1601
+ "step": 8850
1602
+ },
1603
+ {
1604
+ "epoch": 2.253212635310502,
1605
+ "grad_norm": 1.8890128135681152,
1606
+ "learning_rate": 1.3838867004314388e-05,
1607
+ "loss": 2.5211,
1608
+ "step": 8900
1609
+ },
1610
+ {
1611
+ "epoch": 2.265873267076027,
1612
+ "grad_norm": 1.8824830055236816,
1613
+ "learning_rate": 1.3604389420371413e-05,
1614
+ "loss": 2.53,
1615
+ "step": 8950
1616
+ },
1617
+ {
1618
+ "epoch": 2.278533898841552,
1619
+ "grad_norm": 1.239490032196045,
1620
+ "learning_rate": 1.336991183642844e-05,
1621
+ "loss": 2.5889,
1622
+ "step": 9000
1623
+ },
1624
+ {
1625
+ "epoch": 2.278533898841552,
1626
+ "eval_loss": 2.437577962875366,
1627
+ "eval_runtime": 40.1654,
1628
+ "eval_samples_per_second": 43.719,
1629
+ "eval_steps_per_second": 43.719,
1630
+ "step": 9000
1631
+ },
1632
+ {
1633
+ "epoch": 2.2911945306070773,
1634
+ "grad_norm": 1.7253328561782837,
1635
+ "learning_rate": 1.3135434252485462e-05,
1636
+ "loss": 2.5426,
1637
+ "step": 9050
1638
+ },
1639
+ {
1640
+ "epoch": 2.3038551623726025,
1641
+ "grad_norm": 1.6971838474273682,
1642
+ "learning_rate": 1.2900956668542489e-05,
1643
+ "loss": 2.4953,
1644
+ "step": 9100
1645
+ },
1646
+ {
1647
+ "epoch": 2.3165157941381276,
1648
+ "grad_norm": 1.4906270503997803,
1649
+ "learning_rate": 1.2666479084599514e-05,
1650
+ "loss": 2.606,
1651
+ "step": 9150
1652
+ },
1653
+ {
1654
+ "epoch": 2.3291764259036527,
1655
+ "grad_norm": 1.658526062965393,
1656
+ "learning_rate": 1.2432001500656538e-05,
1657
+ "loss": 2.5483,
1658
+ "step": 9200
1659
+ },
1660
+ {
1661
+ "epoch": 2.3291764259036527,
1662
+ "eval_loss": 2.437896490097046,
1663
+ "eval_runtime": 40.7008,
1664
+ "eval_samples_per_second": 43.144,
1665
+ "eval_steps_per_second": 43.144,
1666
+ "step": 9200
1667
+ },
1668
+ {
1669
+ "epoch": 2.341837057669178,
1670
+ "grad_norm": 1.0781177282333374,
1671
+ "learning_rate": 1.2197523916713563e-05,
1672
+ "loss": 2.5449,
1673
+ "step": 9250
1674
+ },
1675
+ {
1676
+ "epoch": 2.354497689434703,
1677
+ "grad_norm": 2.1414873600006104,
1678
+ "learning_rate": 1.1963046332770588e-05,
1679
+ "loss": 2.5303,
1680
+ "step": 9300
1681
+ },
1682
+ {
1683
+ "epoch": 2.367158321200228,
1684
+ "grad_norm": 2.063297986984253,
1685
+ "learning_rate": 1.1728568748827613e-05,
1686
+ "loss": 2.5837,
1687
+ "step": 9350
1688
+ },
1689
+ {
1690
+ "epoch": 2.3798189529657527,
1691
+ "grad_norm": 1.2153489589691162,
1692
+ "learning_rate": 1.1494091164884637e-05,
1693
+ "loss": 2.6384,
1694
+ "step": 9400
1695
+ },
1696
+ {
1697
+ "epoch": 2.3798189529657527,
1698
+ "eval_loss": 2.4365696907043457,
1699
+ "eval_runtime": 40.2398,
1700
+ "eval_samples_per_second": 43.638,
1701
+ "eval_steps_per_second": 43.638,
1702
+ "step": 9400
1703
+ },
1704
+ {
1705
+ "epoch": 2.3924795847312783,
1706
+ "grad_norm": 1.2976094484329224,
1707
+ "learning_rate": 1.1259613580941662e-05,
1708
+ "loss": 2.572,
1709
+ "step": 9450
1710
+ },
1711
+ {
1712
+ "epoch": 2.405140216496803,
1713
+ "grad_norm": 1.2775920629501343,
1714
+ "learning_rate": 1.1025135996998689e-05,
1715
+ "loss": 2.6083,
1716
+ "step": 9500
1717
+ },
1718
+ {
1719
+ "epoch": 2.417800848262328,
1720
+ "grad_norm": 1.358311653137207,
1721
+ "learning_rate": 1.0790658413055713e-05,
1722
+ "loss": 2.5206,
1723
+ "step": 9550
1724
+ },
1725
+ {
1726
+ "epoch": 2.4304614800278532,
1727
+ "grad_norm": 1.3438369035720825,
1728
+ "learning_rate": 1.0556180829112736e-05,
1729
+ "loss": 2.4967,
1730
+ "step": 9600
1731
+ },
1732
+ {
1733
+ "epoch": 2.4304614800278532,
1734
+ "eval_loss": 2.4359662532806396,
1735
+ "eval_runtime": 40.0802,
1736
+ "eval_samples_per_second": 43.812,
1737
+ "eval_steps_per_second": 43.812,
1738
+ "step": 9600
1739
+ },
1740
+ {
1741
+ "epoch": 2.4431221117933783,
1742
+ "grad_norm": 1.2618831396102905,
1743
+ "learning_rate": 1.0321703245169763e-05,
1744
+ "loss": 2.6169,
1745
+ "step": 9650
1746
+ },
1747
+ {
1748
+ "epoch": 2.4557827435589035,
1749
+ "grad_norm": 1.3764727115631104,
1750
+ "learning_rate": 1.0087225661226788e-05,
1751
+ "loss": 2.5444,
1752
+ "step": 9700
1753
+ },
1754
+ {
1755
+ "epoch": 2.4684433753244286,
1756
+ "grad_norm": 1.604864478111267,
1757
+ "learning_rate": 9.852748077283812e-06,
1758
+ "loss": 2.5343,
1759
+ "step": 9750
1760
+ },
1761
+ {
1762
+ "epoch": 2.4811040070899537,
1763
+ "grad_norm": 1.390496850013733,
1764
+ "learning_rate": 9.618270493340837e-06,
1765
+ "loss": 2.5051,
1766
+ "step": 9800
1767
+ },
1768
+ {
1769
+ "epoch": 2.4811040070899537,
1770
+ "eval_loss": 2.4353232383728027,
1771
+ "eval_runtime": 40.1607,
1772
+ "eval_samples_per_second": 43.724,
1773
+ "eval_steps_per_second": 43.724,
1774
+ "step": 9800
1775
+ },
1776
+ {
1777
+ "epoch": 2.493764638855479,
1778
+ "grad_norm": 2.1982169151306152,
1779
+ "learning_rate": 9.383792909397862e-06,
1780
+ "loss": 2.5036,
1781
+ "step": 9850
1782
+ },
1783
+ {
1784
+ "epoch": 2.506425270621004,
1785
+ "grad_norm": 1.3033822774887085,
1786
+ "learning_rate": 9.149315325454887e-06,
1787
+ "loss": 2.5205,
1788
+ "step": 9900
1789
+ },
1790
+ {
1791
+ "epoch": 2.519085902386529,
1792
+ "grad_norm": 1.682586431503296,
1793
+ "learning_rate": 8.919527293190772e-06,
1794
+ "loss": 2.6083,
1795
+ "step": 9950
1796
+ },
1797
+ {
1798
+ "epoch": 2.531746534152054,
1799
+ "grad_norm": 3.184382200241089,
1800
+ "learning_rate": 8.685049709247797e-06,
1801
+ "loss": 2.5314,
1802
+ "step": 10000
1803
+ },
1804
+ {
1805
+ "epoch": 2.531746534152054,
1806
+ "eval_loss": 2.434755802154541,
1807
+ "eval_runtime": 40.2877,
1808
+ "eval_samples_per_second": 43.587,
1809
+ "eval_steps_per_second": 43.587,
1810
+ "step": 10000
1811
+ },
1812
+ {
1813
+ "epoch": 2.5444071659175793,
1814
+ "grad_norm": 2.0026867389678955,
1815
+ "learning_rate": 8.450572125304821e-06,
1816
+ "loss": 2.5109,
1817
+ "step": 10050
1818
+ },
1819
+ {
1820
+ "epoch": 2.5570677976831044,
1821
+ "grad_norm": 1.3833885192871094,
1822
+ "learning_rate": 8.216094541361846e-06,
1823
+ "loss": 2.5362,
1824
+ "step": 10100
1825
+ },
1826
+ {
1827
+ "epoch": 2.5697284294486296,
1828
+ "grad_norm": 2.157984495162964,
1829
+ "learning_rate": 7.981616957418871e-06,
1830
+ "loss": 2.5423,
1831
+ "step": 10150
1832
+ },
1833
+ {
1834
+ "epoch": 2.5823890612141547,
1835
+ "grad_norm": 1.682053565979004,
1836
+ "learning_rate": 7.747139373475897e-06,
1837
+ "loss": 2.5133,
1838
+ "step": 10200
1839
+ },
1840
+ {
1841
+ "epoch": 2.5823890612141547,
1842
+ "eval_loss": 2.435208559036255,
1843
+ "eval_runtime": 40.4768,
1844
+ "eval_samples_per_second": 43.383,
1845
+ "eval_steps_per_second": 43.383,
1846
+ "step": 10200
1847
+ },
1848
+ {
1849
+ "epoch": 2.59504969297968,
1850
+ "grad_norm": 1.9720139503479004,
1851
+ "learning_rate": 7.512661789532921e-06,
1852
+ "loss": 2.6372,
1853
+ "step": 10250
1854
+ },
1855
+ {
1856
+ "epoch": 2.607710324745205,
1857
+ "grad_norm": 1.6906607151031494,
1858
+ "learning_rate": 7.278184205589945e-06,
1859
+ "loss": 2.5505,
1860
+ "step": 10300
1861
+ },
1862
+ {
1863
+ "epoch": 2.62037095651073,
1864
+ "grad_norm": 1.484045147895813,
1865
+ "learning_rate": 7.043706621646972e-06,
1866
+ "loss": 2.5095,
1867
+ "step": 10350
1868
+ },
1869
+ {
1870
+ "epoch": 2.633031588276255,
1871
+ "grad_norm": 1.6676850318908691,
1872
+ "learning_rate": 6.8092290377039955e-06,
1873
+ "loss": 2.6487,
1874
+ "step": 10400
1875
+ },
1876
+ {
1877
+ "epoch": 2.633031588276255,
1878
+ "eval_loss": 2.4344091415405273,
1879
+ "eval_runtime": 39.869,
1880
+ "eval_samples_per_second": 44.044,
1881
+ "eval_steps_per_second": 44.044,
1882
+ "step": 10400
1883
+ },
1884
+ {
1885
+ "epoch": 2.64569222004178,
1886
+ "grad_norm": 1.5012388229370117,
1887
+ "learning_rate": 6.57475145376102e-06,
1888
+ "loss": 2.5756,
1889
+ "step": 10450
1890
+ },
1891
+ {
1892
+ "epoch": 2.6583528518073054,
1893
+ "grad_norm": 1.043954849243164,
1894
+ "learning_rate": 6.340273869818046e-06,
1895
+ "loss": 2.5843,
1896
+ "step": 10500
1897
+ },
1898
+ {
1899
+ "epoch": 2.67101348357283,
1900
+ "grad_norm": 1.0455141067504883,
1901
+ "learning_rate": 6.105796285875071e-06,
1902
+ "loss": 2.5248,
1903
+ "step": 10550
1904
+ },
1905
+ {
1906
+ "epoch": 2.6836741153383556,
1907
+ "grad_norm": 1.39467453956604,
1908
+ "learning_rate": 5.871318701932095e-06,
1909
+ "loss": 2.5091,
1910
+ "step": 10600
1911
+ },
1912
+ {
1913
+ "epoch": 2.6836741153383556,
1914
+ "eval_loss": 2.4331610202789307,
1915
+ "eval_runtime": 39.923,
1916
+ "eval_samples_per_second": 43.985,
1917
+ "eval_steps_per_second": 43.985,
1918
+ "step": 10600
1919
+ },
1920
+ {
1921
+ "epoch": 2.6963347471038803,
1922
+ "grad_norm": 1.1417715549468994,
1923
+ "learning_rate": 5.63684111798912e-06,
1924
+ "loss": 2.5853,
1925
+ "step": 10650
1926
+ },
1927
+ {
1928
+ "epoch": 2.7089953788694054,
1929
+ "grad_norm": 1.133244514465332,
1930
+ "learning_rate": 5.402363534046146e-06,
1931
+ "loss": 2.5457,
1932
+ "step": 10700
1933
+ },
1934
+ {
1935
+ "epoch": 2.7216560106349306,
1936
+ "grad_norm": 1.2331452369689941,
1937
+ "learning_rate": 5.1678859501031705e-06,
1938
+ "loss": 2.5576,
1939
+ "step": 10750
1940
+ },
1941
+ {
1942
+ "epoch": 2.7343166424004557,
1943
+ "grad_norm": 1.7164263725280762,
1944
+ "learning_rate": 4.933408366160195e-06,
1945
+ "loss": 2.5471,
1946
+ "step": 10800
1947
+ },
1948
+ {
1949
+ "epoch": 2.7343166424004557,
1950
+ "eval_loss": 2.4340403079986572,
1951
+ "eval_runtime": 40.3849,
1952
+ "eval_samples_per_second": 43.482,
1953
+ "eval_steps_per_second": 43.482,
1954
+ "step": 10800
1955
+ },
1956
+ {
1957
+ "epoch": 2.746977274165981,
1958
+ "grad_norm": 1.3680106401443481,
1959
+ "learning_rate": 4.69893078221722e-06,
1960
+ "loss": 2.5562,
1961
+ "step": 10850
1962
+ },
1963
+ {
1964
+ "epoch": 2.759637905931506,
1965
+ "grad_norm": 1.0978279113769531,
1966
+ "learning_rate": 4.464453198274246e-06,
1967
+ "loss": 2.5185,
1968
+ "step": 10900
1969
+ },
1970
+ {
1971
+ "epoch": 2.772298537697031,
1972
+ "grad_norm": 1.2212647199630737,
1973
+ "learning_rate": 4.2299756143312695e-06,
1974
+ "loss": 2.6371,
1975
+ "step": 10950
1976
+ },
1977
+ {
1978
+ "epoch": 2.784959169462556,
1979
+ "grad_norm": 1.6452165842056274,
1980
+ "learning_rate": 3.995498030388295e-06,
1981
+ "loss": 2.681,
1982
+ "step": 11000
1983
+ },
1984
+ {
1985
+ "epoch": 2.784959169462556,
1986
+ "eval_loss": 2.4337143898010254,
1987
+ "eval_runtime": 40.4235,
1988
+ "eval_samples_per_second": 43.44,
1989
+ "eval_steps_per_second": 43.44,
1990
+ "step": 11000
1991
+ },
1992
+ {
1993
+ "epoch": 2.7976198012280813,
1994
+ "grad_norm": 1.7757978439331055,
1995
+ "learning_rate": 3.7610204464453203e-06,
1996
+ "loss": 2.5746,
1997
+ "step": 11050
1998
+ },
1999
+ {
2000
+ "epoch": 2.8102804329936064,
2001
+ "grad_norm": 1.2373579740524292,
2002
+ "learning_rate": 3.5265428625023455e-06,
2003
+ "loss": 2.5412,
2004
+ "step": 11100
2005
+ },
2006
+ {
2007
+ "epoch": 2.8229410647591315,
2008
+ "grad_norm": 1.1407558917999268,
2009
+ "learning_rate": 3.29206527855937e-06,
2010
+ "loss": 2.5973,
2011
+ "step": 11150
2012
+ },
2013
+ {
2014
+ "epoch": 2.8356016965246567,
2015
+ "grad_norm": 2.399686813354492,
2016
+ "learning_rate": 3.057587694616395e-06,
2017
+ "loss": 2.5566,
2018
+ "step": 11200
2019
+ },
2020
+ {
2021
+ "epoch": 2.8356016965246567,
2022
+ "eval_loss": 2.4338231086730957,
2023
+ "eval_runtime": 40.4877,
2024
+ "eval_samples_per_second": 43.371,
2025
+ "eval_steps_per_second": 43.371,
2026
+ "step": 11200
2027
+ },
2028
+ {
2029
+ "epoch": 2.8482623282901818,
2030
+ "grad_norm": 1.7053141593933105,
2031
+ "learning_rate": 2.8231101106734197e-06,
2032
+ "loss": 2.6224,
2033
+ "step": 11250
2034
+ },
2035
+ {
2036
+ "epoch": 2.860922960055707,
2037
+ "grad_norm": 1.8215903043746948,
2038
+ "learning_rate": 2.5886325267304445e-06,
2039
+ "loss": 2.5108,
2040
+ "step": 11300
2041
+ },
2042
+ {
2043
+ "epoch": 2.873583591821232,
2044
+ "grad_norm": 1.1648200750350952,
2045
+ "learning_rate": 2.3541549427874697e-06,
2046
+ "loss": 2.557,
2047
+ "step": 11350
2048
+ },
2049
+ {
2050
+ "epoch": 2.886244223586757,
2051
+ "grad_norm": 1.5225868225097656,
2052
+ "learning_rate": 2.1196773588444944e-06,
2053
+ "loss": 2.6285,
2054
+ "step": 11400
2055
+ },
2056
+ {
2057
+ "epoch": 2.886244223586757,
2058
+ "eval_loss": 2.4334514141082764,
2059
+ "eval_runtime": 40.3985,
2060
+ "eval_samples_per_second": 43.467,
2061
+ "eval_steps_per_second": 43.467,
2062
+ "step": 11400
2063
+ },
2064
+ {
2065
+ "epoch": 2.8989048553522823,
2066
+ "grad_norm": 1.4937622547149658,
2067
+ "learning_rate": 1.8851997749015194e-06,
2068
+ "loss": 2.481,
2069
+ "step": 11450
2070
+ },
2071
+ {
2072
+ "epoch": 2.911565487117807,
2073
+ "grad_norm": 1.9169902801513672,
2074
+ "learning_rate": 1.6507221909585446e-06,
2075
+ "loss": 2.5412,
2076
+ "step": 11500
2077
+ },
2078
+ {
2079
+ "epoch": 2.9242261188833325,
2080
+ "grad_norm": 1.6611114740371704,
2081
+ "learning_rate": 1.4162446070155693e-06,
2082
+ "loss": 2.5086,
2083
+ "step": 11550
2084
+ },
2085
+ {
2086
+ "epoch": 2.936886750648857,
2087
+ "grad_norm": 1.3464007377624512,
2088
+ "learning_rate": 1.1817670230725943e-06,
2089
+ "loss": 2.6063,
2090
+ "step": 11600
2091
+ },
2092
+ {
2093
+ "epoch": 2.936886750648857,
2094
+ "eval_loss": 2.4329476356506348,
2095
+ "eval_runtime": 40.4334,
2096
+ "eval_samples_per_second": 43.429,
2097
+ "eval_steps_per_second": 43.429,
2098
+ "step": 11600
2099
+ },
2100
+ {
2101
+ "epoch": 2.9495473824143827,
2102
+ "grad_norm": 1.453385829925537,
2103
+ "learning_rate": 9.472894391296193e-07,
2104
+ "loss": 2.5012,
2105
+ "step": 11650
2106
+ },
2107
+ {
2108
+ "epoch": 2.9622080141799074,
2109
+ "grad_norm": 1.6921356916427612,
2110
+ "learning_rate": 7.128118551866442e-07,
2111
+ "loss": 2.5589,
2112
+ "step": 11700
2113
+ },
2114
+ {
2115
+ "epoch": 2.9748686459454325,
2116
+ "grad_norm": 1.0562982559204102,
2117
+ "learning_rate": 4.783342712436691e-07,
2118
+ "loss": 2.6015,
2119
+ "step": 11750
2120
+ },
2121
+ {
2122
+ "epoch": 2.9875292777109577,
2123
+ "grad_norm": 1.457960844039917,
2124
+ "learning_rate": 2.4385668730069406e-07,
2125
+ "loss": 2.6224,
2126
+ "step": 11800
2127
+ },
2128
+ {
2129
+ "epoch": 2.9875292777109577,
2130
+ "eval_loss": 2.432849645614624,
2131
+ "eval_runtime": 40.4518,
2132
+ "eval_samples_per_second": 43.41,
2133
+ "eval_steps_per_second": 43.41,
2134
+ "step": 11800
2135
+ }
2136
+ ],
2137
+ "logging_steps": 50,
2138
+ "max_steps": 11847,
2139
+ "num_input_tokens_seen": 0,
2140
+ "num_train_epochs": 3,
2141
+ "save_steps": 200,
2142
+ "stateful_callbacks": {
2143
+ "TrainerControl": {
2144
+ "args": {
2145
+ "should_epoch_stop": false,
2146
+ "should_evaluate": false,
2147
+ "should_log": false,
2148
+ "should_save": true,
2149
+ "should_training_stop": false
2150
+ },
2151
+ "attributes": {}
2152
+ }
2153
+ },
2154
+ "total_flos": 6224498036047872.0,
2155
+ "train_batch_size": 1,
2156
+ "trial_name": null,
2157
+ "trial_params": null
2158
+ }
checkpoint-11800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d46b7107c3dfd17ff0fb12b8cdcc256a49e8d4594358d4495904a5fcb069382
3
+ size 5304
checkpoint-11800/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11847/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: gpt2
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-11847/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "gpt2",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": true,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 8,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "c_proj",
27
+ "c_attn"
28
+ ],
29
+ "task_type": "CAUSAL_LM",
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
checkpoint-11847/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2096525bcf5c5b06858aea39ffe9f1d86e5e5f698c630fe8a7b6968326c4147d
3
+ size 3253104
checkpoint-11847/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11847/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d317fa4b44b54d0a2990dc5b855d3892cbb408e7299889f7dfde8d83f90dec55
3
+ size 6548858
checkpoint-11847/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47d7b1d7efd16ee4ff475df79309ca7861c4a0fa87b8f21e5bc85746b2437479
3
+ size 14244
checkpoint-11847/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ace690d17ecc0bd723919a7caf76c676aa6af99ac2aad28a489002b232ed59bd
3
+ size 988
checkpoint-11847/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31739f88076d66774095d513dd74fc5f8222fb41697cf1f0e9e3ad7cd14e52b4
3
+ size 1064
checkpoint-11847/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-11847/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11847/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
checkpoint-11847/trainer_state.json ADDED
@@ -0,0 +1,2158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 11800,
3
+ "best_metric": 2.432849645614624,
4
+ "best_model_checkpoint": "/kaggle/working/senko_adaptive/checkpoint-11800",
5
+ "epoch": 2.999430271570551,
6
+ "eval_steps": 200,
7
+ "global_step": 11847,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0126606317655251,
14
+ "grad_norm": 0.4274106025695801,
15
+ "learning_rate": 2.067510548523207e-06,
16
+ "loss": 3.4405,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.0253212635310502,
21
+ "grad_norm": 0.5292551517486572,
22
+ "learning_rate": 4.177215189873418e-06,
23
+ "loss": 3.4567,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.0379818952965753,
28
+ "grad_norm": 0.7541739344596863,
29
+ "learning_rate": 6.28691983122363e-06,
30
+ "loss": 3.4683,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.0506425270621004,
35
+ "grad_norm": 0.8833445906639099,
36
+ "learning_rate": 8.39662447257384e-06,
37
+ "loss": 3.5084,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.0506425270621004,
42
+ "eval_loss": 3.5248923301696777,
43
+ "eval_runtime": 39.9384,
44
+ "eval_samples_per_second": 43.968,
45
+ "eval_steps_per_second": 43.968,
46
+ "step": 200
47
+ },
48
+ {
49
+ "epoch": 0.0633031588276255,
50
+ "grad_norm": 0.9998921155929565,
51
+ "learning_rate": 1.0506329113924052e-05,
52
+ "loss": 3.359,
53
+ "step": 250
54
+ },
55
+ {
56
+ "epoch": 0.0759637905931506,
57
+ "grad_norm": 0.8041885495185852,
58
+ "learning_rate": 1.2616033755274262e-05,
59
+ "loss": 3.351,
60
+ "step": 300
61
+ },
62
+ {
63
+ "epoch": 0.0886244223586757,
64
+ "grad_norm": 0.9213416576385498,
65
+ "learning_rate": 1.4725738396624473e-05,
66
+ "loss": 3.2244,
67
+ "step": 350
68
+ },
69
+ {
70
+ "epoch": 0.1012850541242008,
71
+ "grad_norm": 1.0922213792800903,
72
+ "learning_rate": 1.6835443037974685e-05,
73
+ "loss": 3.1565,
74
+ "step": 400
75
+ },
76
+ {
77
+ "epoch": 0.1012850541242008,
78
+ "eval_loss": 3.151216983795166,
79
+ "eval_runtime": 39.7515,
80
+ "eval_samples_per_second": 44.174,
81
+ "eval_steps_per_second": 44.174,
82
+ "step": 400
83
+ },
84
+ {
85
+ "epoch": 0.1139456858897259,
86
+ "grad_norm": 1.4199283123016357,
87
+ "learning_rate": 1.8945147679324897e-05,
88
+ "loss": 3.0154,
89
+ "step": 450
90
+ },
91
+ {
92
+ "epoch": 0.126606317655251,
93
+ "grad_norm": 1.077143907546997,
94
+ "learning_rate": 2.1054852320675106e-05,
95
+ "loss": 3.0456,
96
+ "step": 500
97
+ },
98
+ {
99
+ "epoch": 0.1392669494207761,
100
+ "grad_norm": 1.5466052293777466,
101
+ "learning_rate": 2.3164556962025318e-05,
102
+ "loss": 2.9099,
103
+ "step": 550
104
+ },
105
+ {
106
+ "epoch": 0.1519275811863012,
107
+ "grad_norm": 1.2139467000961304,
108
+ "learning_rate": 2.5274261603375527e-05,
109
+ "loss": 2.8839,
110
+ "step": 600
111
+ },
112
+ {
113
+ "epoch": 0.1519275811863012,
114
+ "eval_loss": 2.793567657470703,
115
+ "eval_runtime": 40.2573,
116
+ "eval_samples_per_second": 43.619,
117
+ "eval_steps_per_second": 43.619,
118
+ "step": 600
119
+ },
120
+ {
121
+ "epoch": 0.1645882129518263,
122
+ "grad_norm": 1.0270315408706665,
123
+ "learning_rate": 2.738396624472574e-05,
124
+ "loss": 2.8389,
125
+ "step": 650
126
+ },
127
+ {
128
+ "epoch": 0.1772488447173514,
129
+ "grad_norm": 1.5865377187728882,
130
+ "learning_rate": 2.949367088607595e-05,
131
+ "loss": 2.8228,
132
+ "step": 700
133
+ },
134
+ {
135
+ "epoch": 0.18990947648287648,
136
+ "grad_norm": 1.076073408126831,
137
+ "learning_rate": 3.160337552742616e-05,
138
+ "loss": 2.9255,
139
+ "step": 750
140
+ },
141
+ {
142
+ "epoch": 0.2025701082484016,
143
+ "grad_norm": 1.4510694742202759,
144
+ "learning_rate": 3.3713080168776376e-05,
145
+ "loss": 2.8165,
146
+ "step": 800
147
+ },
148
+ {
149
+ "epoch": 0.2025701082484016,
150
+ "eval_loss": 2.667301893234253,
151
+ "eval_runtime": 40.0906,
152
+ "eval_samples_per_second": 43.801,
153
+ "eval_steps_per_second": 43.801,
154
+ "step": 800
155
+ },
156
+ {
157
+ "epoch": 0.2152307400139267,
158
+ "grad_norm": 1.5206592082977295,
159
+ "learning_rate": 3.5822784810126585e-05,
160
+ "loss": 2.8022,
161
+ "step": 850
162
+ },
163
+ {
164
+ "epoch": 0.2278913717794518,
165
+ "grad_norm": 1.173909068107605,
166
+ "learning_rate": 3.7932489451476794e-05,
167
+ "loss": 2.8034,
168
+ "step": 900
169
+ },
170
+ {
171
+ "epoch": 0.24055200354497688,
172
+ "grad_norm": 1.4551103115081787,
173
+ "learning_rate": 4.004219409282701e-05,
174
+ "loss": 2.774,
175
+ "step": 950
176
+ },
177
+ {
178
+ "epoch": 0.253212635310502,
179
+ "grad_norm": 1.509749412536621,
180
+ "learning_rate": 4.215189873417722e-05,
181
+ "loss": 2.7978,
182
+ "step": 1000
183
+ },
184
+ {
185
+ "epoch": 0.253212635310502,
186
+ "eval_loss": 2.603116273880005,
187
+ "eval_runtime": 39.9617,
188
+ "eval_samples_per_second": 43.942,
189
+ "eval_steps_per_second": 43.942,
190
+ "step": 1000
191
+ },
192
+ {
193
+ "epoch": 0.2658732670760271,
194
+ "grad_norm": 1.745764136314392,
195
+ "learning_rate": 4.426160337552743e-05,
196
+ "loss": 2.7249,
197
+ "step": 1050
198
+ },
199
+ {
200
+ "epoch": 0.2785338988415522,
201
+ "grad_norm": 1.6712589263916016,
202
+ "learning_rate": 4.637130801687764e-05,
203
+ "loss": 2.7618,
204
+ "step": 1100
205
+ },
206
+ {
207
+ "epoch": 0.2911945306070773,
208
+ "grad_norm": 2.256267786026001,
209
+ "learning_rate": 4.8481012658227845e-05,
210
+ "loss": 2.7431,
211
+ "step": 1150
212
+ },
213
+ {
214
+ "epoch": 0.3038551623726024,
215
+ "grad_norm": 1.5181586742401123,
216
+ "learning_rate": 4.993434627649597e-05,
217
+ "loss": 2.778,
218
+ "step": 1200
219
+ },
220
+ {
221
+ "epoch": 0.3038551623726024,
222
+ "eval_loss": 2.5704379081726074,
223
+ "eval_runtime": 40.1791,
224
+ "eval_samples_per_second": 43.704,
225
+ "eval_steps_per_second": 43.704,
226
+ "step": 1200
227
+ },
228
+ {
229
+ "epoch": 0.31651579413812747,
230
+ "grad_norm": 1.1885608434677124,
231
+ "learning_rate": 4.969986869255299e-05,
232
+ "loss": 2.7224,
233
+ "step": 1250
234
+ },
235
+ {
236
+ "epoch": 0.3291764259036526,
237
+ "grad_norm": 1.2136404514312744,
238
+ "learning_rate": 4.946539110861002e-05,
239
+ "loss": 2.6823,
240
+ "step": 1300
241
+ },
242
+ {
243
+ "epoch": 0.3418370576691777,
244
+ "grad_norm": 0.8780732750892639,
245
+ "learning_rate": 4.9230913524667046e-05,
246
+ "loss": 2.6961,
247
+ "step": 1350
248
+ },
249
+ {
250
+ "epoch": 0.3544976894347028,
251
+ "grad_norm": 1.0844959020614624,
252
+ "learning_rate": 4.899643594072407e-05,
253
+ "loss": 2.7014,
254
+ "step": 1400
255
+ },
256
+ {
257
+ "epoch": 0.3544976894347028,
258
+ "eval_loss": 2.54587721824646,
259
+ "eval_runtime": 39.8803,
260
+ "eval_samples_per_second": 44.032,
261
+ "eval_steps_per_second": 44.032,
262
+ "step": 1400
263
+ },
264
+ {
265
+ "epoch": 0.3671583212002279,
266
+ "grad_norm": 1.3518335819244385,
267
+ "learning_rate": 4.8761958356781096e-05,
268
+ "loss": 2.6393,
269
+ "step": 1450
270
+ },
271
+ {
272
+ "epoch": 0.37981895296575297,
273
+ "grad_norm": 1.1389687061309814,
274
+ "learning_rate": 4.852748077283812e-05,
275
+ "loss": 2.7002,
276
+ "step": 1500
277
+ },
278
+ {
279
+ "epoch": 0.3924795847312781,
280
+ "grad_norm": 1.6295430660247803,
281
+ "learning_rate": 4.829300318889514e-05,
282
+ "loss": 2.6754,
283
+ "step": 1550
284
+ },
285
+ {
286
+ "epoch": 0.4051402164968032,
287
+ "grad_norm": 1.387499451637268,
288
+ "learning_rate": 4.8058525604952173e-05,
289
+ "loss": 2.6853,
290
+ "step": 1600
291
+ },
292
+ {
293
+ "epoch": 0.4051402164968032,
294
+ "eval_loss": 2.5297553539276123,
295
+ "eval_runtime": 39.722,
296
+ "eval_samples_per_second": 44.207,
297
+ "eval_steps_per_second": 44.207,
298
+ "step": 1600
299
+ },
300
+ {
301
+ "epoch": 0.4178008482623283,
302
+ "grad_norm": 1.014020323753357,
303
+ "learning_rate": 4.7824048021009195e-05,
304
+ "loss": 2.7275,
305
+ "step": 1650
306
+ },
307
+ {
308
+ "epoch": 0.4304614800278534,
309
+ "grad_norm": 1.1505990028381348,
310
+ "learning_rate": 4.7589570437066216e-05,
311
+ "loss": 2.6651,
312
+ "step": 1700
313
+ },
314
+ {
315
+ "epoch": 0.4431221117933785,
316
+ "grad_norm": 1.1389458179473877,
317
+ "learning_rate": 4.7355092853123244e-05,
318
+ "loss": 2.6993,
319
+ "step": 1750
320
+ },
321
+ {
322
+ "epoch": 0.4557827435589036,
323
+ "grad_norm": 1.2159587144851685,
324
+ "learning_rate": 4.7120615269180266e-05,
325
+ "loss": 2.7239,
326
+ "step": 1800
327
+ },
328
+ {
329
+ "epoch": 0.4557827435589036,
330
+ "eval_loss": 2.5201079845428467,
331
+ "eval_runtime": 39.8313,
332
+ "eval_samples_per_second": 44.086,
333
+ "eval_steps_per_second": 44.086,
334
+ "step": 1800
335
+ },
336
+ {
337
+ "epoch": 0.4684433753244287,
338
+ "grad_norm": 1.1873971223831177,
339
+ "learning_rate": 4.6886137685237294e-05,
340
+ "loss": 2.6368,
341
+ "step": 1850
342
+ },
343
+ {
344
+ "epoch": 0.48110400708995377,
345
+ "grad_norm": 1.5109103918075562,
346
+ "learning_rate": 4.665166010129432e-05,
347
+ "loss": 2.6827,
348
+ "step": 1900
349
+ },
350
+ {
351
+ "epoch": 0.4937646388554789,
352
+ "grad_norm": 1.9981125593185425,
353
+ "learning_rate": 4.641718251735134e-05,
354
+ "loss": 2.6513,
355
+ "step": 1950
356
+ },
357
+ {
358
+ "epoch": 0.506425270621004,
359
+ "grad_norm": 1.4879294633865356,
360
+ "learning_rate": 4.6182704933408365e-05,
361
+ "loss": 2.6433,
362
+ "step": 2000
363
+ },
364
+ {
365
+ "epoch": 0.506425270621004,
366
+ "eval_loss": 2.5107176303863525,
367
+ "eval_runtime": 40.0643,
368
+ "eval_samples_per_second": 43.83,
369
+ "eval_steps_per_second": 43.83,
370
+ "step": 2000
371
+ },
372
+ {
373
+ "epoch": 0.5190859023865291,
374
+ "grad_norm": 1.2832767963409424,
375
+ "learning_rate": 4.5952916901144253e-05,
376
+ "loss": 2.6225,
377
+ "step": 2050
378
+ },
379
+ {
380
+ "epoch": 0.5317465341520542,
381
+ "grad_norm": 1.2915899753570557,
382
+ "learning_rate": 4.5718439317201275e-05,
383
+ "loss": 2.6592,
384
+ "step": 2100
385
+ },
386
+ {
387
+ "epoch": 0.5444071659175793,
388
+ "grad_norm": 1.229929804801941,
389
+ "learning_rate": 4.54839617332583e-05,
390
+ "loss": 2.6411,
391
+ "step": 2150
392
+ },
393
+ {
394
+ "epoch": 0.5570677976831044,
395
+ "grad_norm": 1.2569608688354492,
396
+ "learning_rate": 4.524948414931533e-05,
397
+ "loss": 2.6436,
398
+ "step": 2200
399
+ },
400
+ {
401
+ "epoch": 0.5570677976831044,
402
+ "eval_loss": 2.504101514816284,
403
+ "eval_runtime": 39.8694,
404
+ "eval_samples_per_second": 44.044,
405
+ "eval_steps_per_second": 44.044,
406
+ "step": 2200
407
+ },
408
+ {
409
+ "epoch": 0.5697284294486294,
410
+ "grad_norm": 1.3688510656356812,
411
+ "learning_rate": 4.501500656537235e-05,
412
+ "loss": 2.6819,
413
+ "step": 2250
414
+ },
415
+ {
416
+ "epoch": 0.5823890612141546,
417
+ "grad_norm": 1.1405905485153198,
418
+ "learning_rate": 4.4780528981429374e-05,
419
+ "loss": 2.6116,
420
+ "step": 2300
421
+ },
422
+ {
423
+ "epoch": 0.5950496929796797,
424
+ "grad_norm": 1.453338861465454,
425
+ "learning_rate": 4.45460513974864e-05,
426
+ "loss": 2.6154,
427
+ "step": 2350
428
+ },
429
+ {
430
+ "epoch": 0.6077103247452048,
431
+ "grad_norm": 1.0401395559310913,
432
+ "learning_rate": 4.431157381354343e-05,
433
+ "loss": 2.6018,
434
+ "step": 2400
435
+ },
436
+ {
437
+ "epoch": 0.6077103247452048,
438
+ "eval_loss": 2.498344898223877,
439
+ "eval_runtime": 39.9496,
440
+ "eval_samples_per_second": 43.955,
441
+ "eval_steps_per_second": 43.955,
442
+ "step": 2400
443
+ },
444
+ {
445
+ "epoch": 0.6203709565107299,
446
+ "grad_norm": 1.4646718502044678,
447
+ "learning_rate": 4.407709622960045e-05,
448
+ "loss": 2.5734,
449
+ "step": 2450
450
+ },
451
+ {
452
+ "epoch": 0.6330315882762549,
453
+ "grad_norm": 1.3828164339065552,
454
+ "learning_rate": 4.384261864565748e-05,
455
+ "loss": 2.6445,
456
+ "step": 2500
457
+ },
458
+ {
459
+ "epoch": 0.6456922200417801,
460
+ "grad_norm": 2.1768596172332764,
461
+ "learning_rate": 4.36081410617145e-05,
462
+ "loss": 2.6618,
463
+ "step": 2550
464
+ },
465
+ {
466
+ "epoch": 0.6583528518073052,
467
+ "grad_norm": 1.6110296249389648,
468
+ "learning_rate": 4.337366347777152e-05,
469
+ "loss": 2.6509,
470
+ "step": 2600
471
+ },
472
+ {
473
+ "epoch": 0.6583528518073052,
474
+ "eval_loss": 2.4937028884887695,
475
+ "eval_runtime": 39.8698,
476
+ "eval_samples_per_second": 44.043,
477
+ "eval_steps_per_second": 44.043,
478
+ "step": 2600
479
+ },
480
+ {
481
+ "epoch": 0.6710134835728303,
482
+ "grad_norm": 1.2363536357879639,
483
+ "learning_rate": 4.313918589382856e-05,
484
+ "loss": 2.6274,
485
+ "step": 2650
486
+ },
487
+ {
488
+ "epoch": 0.6836741153383554,
489
+ "grad_norm": 2.192110538482666,
490
+ "learning_rate": 4.290470830988558e-05,
491
+ "loss": 2.6932,
492
+ "step": 2700
493
+ },
494
+ {
495
+ "epoch": 0.6963347471038804,
496
+ "grad_norm": 1.2024074792861938,
497
+ "learning_rate": 4.26702307259426e-05,
498
+ "loss": 2.6221,
499
+ "step": 2750
500
+ },
501
+ {
502
+ "epoch": 0.7089953788694056,
503
+ "grad_norm": 1.8665797710418701,
504
+ "learning_rate": 4.243575314199963e-05,
505
+ "loss": 2.6313,
506
+ "step": 2800
507
+ },
508
+ {
509
+ "epoch": 0.7089953788694056,
510
+ "eval_loss": 2.4876773357391357,
511
+ "eval_runtime": 40.026,
512
+ "eval_samples_per_second": 43.871,
513
+ "eval_steps_per_second": 43.871,
514
+ "step": 2800
515
+ },
516
+ {
517
+ "epoch": 0.7216560106349307,
518
+ "grad_norm": 1.4088993072509766,
519
+ "learning_rate": 4.220127555805665e-05,
520
+ "loss": 2.5675,
521
+ "step": 2850
522
+ },
523
+ {
524
+ "epoch": 0.7343166424004558,
525
+ "grad_norm": 1.3225140571594238,
526
+ "learning_rate": 4.196679797411368e-05,
527
+ "loss": 2.56,
528
+ "step": 2900
529
+ },
530
+ {
531
+ "epoch": 0.7469772741659809,
532
+ "grad_norm": 1.3416539430618286,
533
+ "learning_rate": 4.1732320390170706e-05,
534
+ "loss": 2.6517,
535
+ "step": 2950
536
+ },
537
+ {
538
+ "epoch": 0.7596379059315059,
539
+ "grad_norm": 1.079567790031433,
540
+ "learning_rate": 4.149784280622773e-05,
541
+ "loss": 2.698,
542
+ "step": 3000
543
+ },
544
+ {
545
+ "epoch": 0.7596379059315059,
546
+ "eval_loss": 2.4842560291290283,
547
+ "eval_runtime": 39.7988,
548
+ "eval_samples_per_second": 44.122,
549
+ "eval_steps_per_second": 44.122,
550
+ "step": 3000
551
+ },
552
+ {
553
+ "epoch": 0.772298537697031,
554
+ "grad_norm": 1.4532116651535034,
555
+ "learning_rate": 4.126336522228475e-05,
556
+ "loss": 2.6232,
557
+ "step": 3050
558
+ },
559
+ {
560
+ "epoch": 0.7849591694625562,
561
+ "grad_norm": 1.5380038022994995,
562
+ "learning_rate": 4.102888763834178e-05,
563
+ "loss": 2.6212,
564
+ "step": 3100
565
+ },
566
+ {
567
+ "epoch": 0.7976198012280813,
568
+ "grad_norm": 1.3965916633605957,
569
+ "learning_rate": 4.0794410054398805e-05,
570
+ "loss": 2.5804,
571
+ "step": 3150
572
+ },
573
+ {
574
+ "epoch": 0.8102804329936064,
575
+ "grad_norm": 1.4798463582992554,
576
+ "learning_rate": 4.0559932470455826e-05,
577
+ "loss": 2.6724,
578
+ "step": 3200
579
+ },
580
+ {
581
+ "epoch": 0.8102804329936064,
582
+ "eval_loss": 2.480894088745117,
583
+ "eval_runtime": 39.9604,
584
+ "eval_samples_per_second": 43.943,
585
+ "eval_steps_per_second": 43.943,
586
+ "step": 3200
587
+ },
588
+ {
589
+ "epoch": 0.8229410647591315,
590
+ "grad_norm": 1.2598360776901245,
591
+ "learning_rate": 4.0325454886512854e-05,
592
+ "loss": 2.6993,
593
+ "step": 3250
594
+ },
595
+ {
596
+ "epoch": 0.8356016965246565,
597
+ "grad_norm": 1.366295576095581,
598
+ "learning_rate": 4.0090977302569876e-05,
599
+ "loss": 2.551,
600
+ "step": 3300
601
+ },
602
+ {
603
+ "epoch": 0.8482623282901817,
604
+ "grad_norm": 1.1827855110168457,
605
+ "learning_rate": 3.98564997186269e-05,
606
+ "loss": 2.6131,
607
+ "step": 3350
608
+ },
609
+ {
610
+ "epoch": 0.8609229600557068,
611
+ "grad_norm": 1.2728627920150757,
612
+ "learning_rate": 3.9622022134683925e-05,
613
+ "loss": 2.6178,
614
+ "step": 3400
615
+ },
616
+ {
617
+ "epoch": 0.8609229600557068,
618
+ "eval_loss": 2.477010726928711,
619
+ "eval_runtime": 40.2504,
620
+ "eval_samples_per_second": 43.627,
621
+ "eval_steps_per_second": 43.627,
622
+ "step": 3400
623
+ },
624
+ {
625
+ "epoch": 0.8735835918212319,
626
+ "grad_norm": 1.341917634010315,
627
+ "learning_rate": 3.938754455074095e-05,
628
+ "loss": 2.5748,
629
+ "step": 3450
630
+ },
631
+ {
632
+ "epoch": 0.886244223586757,
633
+ "grad_norm": 1.4114609956741333,
634
+ "learning_rate": 3.9153066966797975e-05,
635
+ "loss": 2.667,
636
+ "step": 3500
637
+ },
638
+ {
639
+ "epoch": 0.898904855352282,
640
+ "grad_norm": 1.1211490631103516,
641
+ "learning_rate": 3.8918589382855e-05,
642
+ "loss": 2.5671,
643
+ "step": 3550
644
+ },
645
+ {
646
+ "epoch": 0.9115654871178072,
647
+ "grad_norm": 1.4166322946548462,
648
+ "learning_rate": 3.8684111798912024e-05,
649
+ "loss": 2.5945,
650
+ "step": 3600
651
+ },
652
+ {
653
+ "epoch": 0.9115654871178072,
654
+ "eval_loss": 2.47322940826416,
655
+ "eval_runtime": 40.2079,
656
+ "eval_samples_per_second": 43.673,
657
+ "eval_steps_per_second": 43.673,
658
+ "step": 3600
659
+ },
660
+ {
661
+ "epoch": 0.9242261188833323,
662
+ "grad_norm": 0.9144394993782043,
663
+ "learning_rate": 3.844963421496905e-05,
664
+ "loss": 2.6148,
665
+ "step": 3650
666
+ },
667
+ {
668
+ "epoch": 0.9368867506488574,
669
+ "grad_norm": 1.4106061458587646,
670
+ "learning_rate": 3.821515663102608e-05,
671
+ "loss": 2.6586,
672
+ "step": 3700
673
+ },
674
+ {
675
+ "epoch": 0.9495473824143825,
676
+ "grad_norm": 1.414415717124939,
677
+ "learning_rate": 3.79806790470831e-05,
678
+ "loss": 2.5874,
679
+ "step": 3750
680
+ },
681
+ {
682
+ "epoch": 0.9622080141799075,
683
+ "grad_norm": 1.5448992252349854,
684
+ "learning_rate": 3.774620146314012e-05,
685
+ "loss": 2.6422,
686
+ "step": 3800
687
+ },
688
+ {
689
+ "epoch": 0.9622080141799075,
690
+ "eval_loss": 2.4701173305511475,
691
+ "eval_runtime": 40.1267,
692
+ "eval_samples_per_second": 43.761,
693
+ "eval_steps_per_second": 43.761,
694
+ "step": 3800
695
+ },
696
+ {
697
+ "epoch": 0.9748686459454327,
698
+ "grad_norm": 1.1959314346313477,
699
+ "learning_rate": 3.751172387919715e-05,
700
+ "loss": 2.6975,
701
+ "step": 3850
702
+ },
703
+ {
704
+ "epoch": 0.9875292777109578,
705
+ "grad_norm": 0.9525274038314819,
706
+ "learning_rate": 3.727724629525417e-05,
707
+ "loss": 2.6675,
708
+ "step": 3900
709
+ },
710
+ {
711
+ "epoch": 1.0,
712
+ "grad_norm": 4.733253479003906,
713
+ "learning_rate": 3.70427687113112e-05,
714
+ "loss": 2.566,
715
+ "step": 3950
716
+ },
717
+ {
718
+ "epoch": 1.0126606317655251,
719
+ "grad_norm": 1.2803192138671875,
720
+ "learning_rate": 3.680829112736823e-05,
721
+ "loss": 2.5659,
722
+ "step": 4000
723
+ },
724
+ {
725
+ "epoch": 1.0126606317655251,
726
+ "eval_loss": 2.4702188968658447,
727
+ "eval_runtime": 40.1387,
728
+ "eval_samples_per_second": 43.748,
729
+ "eval_steps_per_second": 43.748,
730
+ "step": 4000
731
+ },
732
+ {
733
+ "epoch": 1.0253212635310502,
734
+ "grad_norm": 1.446990966796875,
735
+ "learning_rate": 3.657381354342525e-05,
736
+ "loss": 2.627,
737
+ "step": 4050
738
+ },
739
+ {
740
+ "epoch": 1.0379818952965754,
741
+ "grad_norm": 1.3563008308410645,
742
+ "learning_rate": 3.633933595948227e-05,
743
+ "loss": 2.6252,
744
+ "step": 4100
745
+ },
746
+ {
747
+ "epoch": 1.0506425270621005,
748
+ "grad_norm": 1.5763463973999023,
749
+ "learning_rate": 3.61048583755393e-05,
750
+ "loss": 2.6593,
751
+ "step": 4150
752
+ },
753
+ {
754
+ "epoch": 1.0633031588276256,
755
+ "grad_norm": 1.0055335760116577,
756
+ "learning_rate": 3.587038079159633e-05,
757
+ "loss": 2.5955,
758
+ "step": 4200
759
+ },
760
+ {
761
+ "epoch": 1.0633031588276256,
762
+ "eval_loss": 2.4676930904388428,
763
+ "eval_runtime": 40.0342,
764
+ "eval_samples_per_second": 43.863,
765
+ "eval_steps_per_second": 43.863,
766
+ "step": 4200
767
+ },
768
+ {
769
+ "epoch": 1.0759637905931505,
770
+ "grad_norm": 1.7013343572616577,
771
+ "learning_rate": 3.563590320765335e-05,
772
+ "loss": 2.59,
773
+ "step": 4250
774
+ },
775
+ {
776
+ "epoch": 1.0886244223586756,
777
+ "grad_norm": 1.541069507598877,
778
+ "learning_rate": 3.540142562371038e-05,
779
+ "loss": 2.6192,
780
+ "step": 4300
781
+ },
782
+ {
783
+ "epoch": 1.1012850541242007,
784
+ "grad_norm": 1.2536805868148804,
785
+ "learning_rate": 3.51669480397674e-05,
786
+ "loss": 2.6225,
787
+ "step": 4350
788
+ },
789
+ {
790
+ "epoch": 1.1139456858897259,
791
+ "grad_norm": 1.8328826427459717,
792
+ "learning_rate": 3.493247045582442e-05,
793
+ "loss": 2.6022,
794
+ "step": 4400
795
+ },
796
+ {
797
+ "epoch": 1.1139456858897259,
798
+ "eval_loss": 2.465629816055298,
799
+ "eval_runtime": 39.8532,
800
+ "eval_samples_per_second": 44.062,
801
+ "eval_steps_per_second": 44.062,
802
+ "step": 4400
803
+ },
804
+ {
805
+ "epoch": 1.126606317655251,
806
+ "grad_norm": 1.8557270765304565,
807
+ "learning_rate": 3.469799287188145e-05,
808
+ "loss": 2.6496,
809
+ "step": 4450
810
+ },
811
+ {
812
+ "epoch": 1.139266949420776,
813
+ "grad_norm": 1.3255618810653687,
814
+ "learning_rate": 3.446820483961734e-05,
815
+ "loss": 2.5315,
816
+ "step": 4500
817
+ },
818
+ {
819
+ "epoch": 1.1519275811863012,
820
+ "grad_norm": 1.2192399501800537,
821
+ "learning_rate": 3.423372725567436e-05,
822
+ "loss": 2.5409,
823
+ "step": 4550
824
+ },
825
+ {
826
+ "epoch": 1.1645882129518264,
827
+ "grad_norm": 1.2533234357833862,
828
+ "learning_rate": 3.399924967173139e-05,
829
+ "loss": 2.6457,
830
+ "step": 4600
831
+ },
832
+ {
833
+ "epoch": 1.1645882129518264,
834
+ "eval_loss": 2.462162733078003,
835
+ "eval_runtime": 40.0542,
836
+ "eval_samples_per_second": 43.841,
837
+ "eval_steps_per_second": 43.841,
838
+ "step": 4600
839
+ },
840
+ {
841
+ "epoch": 1.1772488447173515,
842
+ "grad_norm": 1.8414678573608398,
843
+ "learning_rate": 3.376477208778841e-05,
844
+ "loss": 2.5658,
845
+ "step": 4650
846
+ },
847
+ {
848
+ "epoch": 1.1899094764828764,
849
+ "grad_norm": 1.568259596824646,
850
+ "learning_rate": 3.3530294503845436e-05,
851
+ "loss": 2.5771,
852
+ "step": 4700
853
+ },
854
+ {
855
+ "epoch": 1.2025701082484015,
856
+ "grad_norm": 1.3547483682632446,
857
+ "learning_rate": 3.3295816919902464e-05,
858
+ "loss": 2.6525,
859
+ "step": 4750
860
+ },
861
+ {
862
+ "epoch": 1.2152307400139266,
863
+ "grad_norm": 1.1655386686325073,
864
+ "learning_rate": 3.3061339335959486e-05,
865
+ "loss": 2.6421,
866
+ "step": 4800
867
+ },
868
+ {
869
+ "epoch": 1.2152307400139266,
870
+ "eval_loss": 2.461489200592041,
871
+ "eval_runtime": 39.9962,
872
+ "eval_samples_per_second": 43.904,
873
+ "eval_steps_per_second": 43.904,
874
+ "step": 4800
875
+ },
876
+ {
877
+ "epoch": 1.2278913717794517,
878
+ "grad_norm": 1.798033595085144,
879
+ "learning_rate": 3.282686175201651e-05,
880
+ "loss": 2.6091,
881
+ "step": 4850
882
+ },
883
+ {
884
+ "epoch": 1.2405520035449769,
885
+ "grad_norm": 3.2964117527008057,
886
+ "learning_rate": 3.2592384168073535e-05,
887
+ "loss": 2.5997,
888
+ "step": 4900
889
+ },
890
+ {
891
+ "epoch": 1.253212635310502,
892
+ "grad_norm": 1.0457675457000732,
893
+ "learning_rate": 3.2357906584130557e-05,
894
+ "loss": 2.6144,
895
+ "step": 4950
896
+ },
897
+ {
898
+ "epoch": 1.265873267076027,
899
+ "grad_norm": 0.9728056192398071,
900
+ "learning_rate": 3.2123429000187585e-05,
901
+ "loss": 2.5712,
902
+ "step": 5000
903
+ },
904
+ {
905
+ "epoch": 1.265873267076027,
906
+ "eval_loss": 2.460186719894409,
907
+ "eval_runtime": 39.8386,
908
+ "eval_samples_per_second": 44.078,
909
+ "eval_steps_per_second": 44.078,
910
+ "step": 5000
911
+ },
912
+ {
913
+ "epoch": 1.2785338988415522,
914
+ "grad_norm": 1.2350194454193115,
915
+ "learning_rate": 3.188895141624461e-05,
916
+ "loss": 2.5448,
917
+ "step": 5050
918
+ },
919
+ {
920
+ "epoch": 1.2911945306070773,
921
+ "grad_norm": 1.4210622310638428,
922
+ "learning_rate": 3.1654473832301634e-05,
923
+ "loss": 2.6031,
924
+ "step": 5100
925
+ },
926
+ {
927
+ "epoch": 1.3038551623726025,
928
+ "grad_norm": 2.226473093032837,
929
+ "learning_rate": 3.1419996248358656e-05,
930
+ "loss": 2.6597,
931
+ "step": 5150
932
+ },
933
+ {
934
+ "epoch": 1.3165157941381276,
935
+ "grad_norm": 2.4525105953216553,
936
+ "learning_rate": 3.1185518664415684e-05,
937
+ "loss": 2.596,
938
+ "step": 5200
939
+ },
940
+ {
941
+ "epoch": 1.3165157941381276,
942
+ "eval_loss": 2.454537868499756,
943
+ "eval_runtime": 39.805,
944
+ "eval_samples_per_second": 44.115,
945
+ "eval_steps_per_second": 44.115,
946
+ "step": 5200
947
+ },
948
+ {
949
+ "epoch": 1.3291764259036527,
950
+ "grad_norm": 1.265309453010559,
951
+ "learning_rate": 3.095104108047271e-05,
952
+ "loss": 2.5559,
953
+ "step": 5250
954
+ },
955
+ {
956
+ "epoch": 1.3418370576691778,
957
+ "grad_norm": 2.1364307403564453,
958
+ "learning_rate": 3.071656349652973e-05,
959
+ "loss": 2.5859,
960
+ "step": 5300
961
+ },
962
+ {
963
+ "epoch": 1.3544976894347027,
964
+ "grad_norm": 1.5945920944213867,
965
+ "learning_rate": 3.048208591258676e-05,
966
+ "loss": 2.5778,
967
+ "step": 5350
968
+ },
969
+ {
970
+ "epoch": 1.3671583212002278,
971
+ "grad_norm": 1.2479759454727173,
972
+ "learning_rate": 3.0247608328643783e-05,
973
+ "loss": 2.6846,
974
+ "step": 5400
975
+ },
976
+ {
977
+ "epoch": 1.3671583212002278,
978
+ "eval_loss": 2.4547293186187744,
979
+ "eval_runtime": 39.7806,
980
+ "eval_samples_per_second": 44.142,
981
+ "eval_steps_per_second": 44.142,
982
+ "step": 5400
983
+ },
984
+ {
985
+ "epoch": 1.379818952965753,
986
+ "grad_norm": 1.4845050573349,
987
+ "learning_rate": 3.0013130744700808e-05,
988
+ "loss": 2.5661,
989
+ "step": 5450
990
+ },
991
+ {
992
+ "epoch": 1.392479584731278,
993
+ "grad_norm": 1.5581985712051392,
994
+ "learning_rate": 2.9778653160757836e-05,
995
+ "loss": 2.5441,
996
+ "step": 5500
997
+ },
998
+ {
999
+ "epoch": 1.4051402164968032,
1000
+ "grad_norm": 3.1663737297058105,
1001
+ "learning_rate": 2.9544175576814857e-05,
1002
+ "loss": 2.5044,
1003
+ "step": 5550
1004
+ },
1005
+ {
1006
+ "epoch": 1.4178008482623283,
1007
+ "grad_norm": 1.2454484701156616,
1008
+ "learning_rate": 2.9309697992871882e-05,
1009
+ "loss": 2.5747,
1010
+ "step": 5600
1011
+ },
1012
+ {
1013
+ "epoch": 1.4178008482623283,
1014
+ "eval_loss": 2.4544529914855957,
1015
+ "eval_runtime": 39.9287,
1016
+ "eval_samples_per_second": 43.978,
1017
+ "eval_steps_per_second": 43.978,
1018
+ "step": 5600
1019
+ },
1020
+ {
1021
+ "epoch": 1.4304614800278534,
1022
+ "grad_norm": 1.662784457206726,
1023
+ "learning_rate": 2.907522040892891e-05,
1024
+ "loss": 2.6064,
1025
+ "step": 5650
1026
+ },
1027
+ {
1028
+ "epoch": 1.4431221117933786,
1029
+ "grad_norm": 1.618458867073059,
1030
+ "learning_rate": 2.8840742824985935e-05,
1031
+ "loss": 2.5191,
1032
+ "step": 5700
1033
+ },
1034
+ {
1035
+ "epoch": 1.4557827435589035,
1036
+ "grad_norm": 1.3003348112106323,
1037
+ "learning_rate": 2.8606265241042956e-05,
1038
+ "loss": 2.5339,
1039
+ "step": 5750
1040
+ },
1041
+ {
1042
+ "epoch": 1.4684433753244286,
1043
+ "grad_norm": 1.1443992853164673,
1044
+ "learning_rate": 2.8371787657099984e-05,
1045
+ "loss": 2.5914,
1046
+ "step": 5800
1047
+ },
1048
+ {
1049
+ "epoch": 1.4684433753244286,
1050
+ "eval_loss": 2.453752279281616,
1051
+ "eval_runtime": 39.8234,
1052
+ "eval_samples_per_second": 44.095,
1053
+ "eval_steps_per_second": 44.095,
1054
+ "step": 5800
1055
+ },
1056
+ {
1057
+ "epoch": 1.4811040070899537,
1058
+ "grad_norm": 1.2574009895324707,
1059
+ "learning_rate": 2.813731007315701e-05,
1060
+ "loss": 2.6109,
1061
+ "step": 5850
1062
+ },
1063
+ {
1064
+ "epoch": 1.4937646388554788,
1065
+ "grad_norm": 1.002815842628479,
1066
+ "learning_rate": 2.790283248921403e-05,
1067
+ "loss": 2.6075,
1068
+ "step": 5900
1069
+ },
1070
+ {
1071
+ "epoch": 1.506425270621004,
1072
+ "grad_norm": 1.306024432182312,
1073
+ "learning_rate": 2.766835490527106e-05,
1074
+ "loss": 2.5733,
1075
+ "step": 5950
1076
+ },
1077
+ {
1078
+ "epoch": 1.519085902386529,
1079
+ "grad_norm": 2.5023701190948486,
1080
+ "learning_rate": 2.7433877321328083e-05,
1081
+ "loss": 2.6274,
1082
+ "step": 6000
1083
+ },
1084
+ {
1085
+ "epoch": 1.519085902386529,
1086
+ "eval_loss": 2.449617862701416,
1087
+ "eval_runtime": 39.9401,
1088
+ "eval_samples_per_second": 43.966,
1089
+ "eval_steps_per_second": 43.966,
1090
+ "step": 6000
1091
+ },
1092
+ {
1093
+ "epoch": 1.5317465341520542,
1094
+ "grad_norm": 1.9410326480865479,
1095
+ "learning_rate": 2.7199399737385105e-05,
1096
+ "loss": 2.5312,
1097
+ "step": 6050
1098
+ },
1099
+ {
1100
+ "epoch": 1.5444071659175793,
1101
+ "grad_norm": 1.9793561697006226,
1102
+ "learning_rate": 2.6964922153442136e-05,
1103
+ "loss": 2.5759,
1104
+ "step": 6100
1105
+ },
1106
+ {
1107
+ "epoch": 1.5570677976831044,
1108
+ "grad_norm": 1.290531873703003,
1109
+ "learning_rate": 2.6730444569499157e-05,
1110
+ "loss": 2.5817,
1111
+ "step": 6150
1112
+ },
1113
+ {
1114
+ "epoch": 1.5697284294486296,
1115
+ "grad_norm": 2.11389422416687,
1116
+ "learning_rate": 2.6495966985556182e-05,
1117
+ "loss": 2.6287,
1118
+ "step": 6200
1119
+ },
1120
+ {
1121
+ "epoch": 1.5697284294486296,
1122
+ "eval_loss": 2.4490554332733154,
1123
+ "eval_runtime": 39.7474,
1124
+ "eval_samples_per_second": 44.179,
1125
+ "eval_steps_per_second": 44.179,
1126
+ "step": 6200
1127
+ },
1128
+ {
1129
+ "epoch": 1.5823890612141547,
1130
+ "grad_norm": 1.6492938995361328,
1131
+ "learning_rate": 2.626148940161321e-05,
1132
+ "loss": 2.631,
1133
+ "step": 6250
1134
+ },
1135
+ {
1136
+ "epoch": 1.5950496929796798,
1137
+ "grad_norm": 1.3233673572540283,
1138
+ "learning_rate": 2.6027011817670232e-05,
1139
+ "loss": 2.5654,
1140
+ "step": 6300
1141
+ },
1142
+ {
1143
+ "epoch": 1.607710324745205,
1144
+ "grad_norm": 1.688264012336731,
1145
+ "learning_rate": 2.5792534233727257e-05,
1146
+ "loss": 2.6096,
1147
+ "step": 6350
1148
+ },
1149
+ {
1150
+ "epoch": 1.62037095651073,
1151
+ "grad_norm": 2.064823865890503,
1152
+ "learning_rate": 2.5558056649784285e-05,
1153
+ "loss": 2.6275,
1154
+ "step": 6400
1155
+ },
1156
+ {
1157
+ "epoch": 1.62037095651073,
1158
+ "eval_loss": 2.4488983154296875,
1159
+ "eval_runtime": 39.6847,
1160
+ "eval_samples_per_second": 44.249,
1161
+ "eval_steps_per_second": 44.249,
1162
+ "step": 6400
1163
+ },
1164
+ {
1165
+ "epoch": 1.633031588276255,
1166
+ "grad_norm": 1.5599696636199951,
1167
+ "learning_rate": 2.5323579065841306e-05,
1168
+ "loss": 2.6334,
1169
+ "step": 6450
1170
+ },
1171
+ {
1172
+ "epoch": 1.64569222004178,
1173
+ "grad_norm": 1.3142633438110352,
1174
+ "learning_rate": 2.508910148189833e-05,
1175
+ "loss": 2.5496,
1176
+ "step": 6500
1177
+ },
1178
+ {
1179
+ "epoch": 1.6583528518073052,
1180
+ "grad_norm": 1.474135160446167,
1181
+ "learning_rate": 2.4854623897955356e-05,
1182
+ "loss": 2.5628,
1183
+ "step": 6550
1184
+ },
1185
+ {
1186
+ "epoch": 1.6710134835728303,
1187
+ "grad_norm": 1.3737610578536987,
1188
+ "learning_rate": 2.4620146314012384e-05,
1189
+ "loss": 2.5345,
1190
+ "step": 6600
1191
+ },
1192
+ {
1193
+ "epoch": 1.6710134835728303,
1194
+ "eval_loss": 2.447559356689453,
1195
+ "eval_runtime": 39.7021,
1196
+ "eval_samples_per_second": 44.229,
1197
+ "eval_steps_per_second": 44.229,
1198
+ "step": 6600
1199
+ },
1200
+ {
1201
+ "epoch": 1.6836741153383554,
1202
+ "grad_norm": 1.2432060241699219,
1203
+ "learning_rate": 2.4385668730069405e-05,
1204
+ "loss": 2.5977,
1205
+ "step": 6650
1206
+ },
1207
+ {
1208
+ "epoch": 1.6963347471038803,
1209
+ "grad_norm": 1.465063452720642,
1210
+ "learning_rate": 2.415119114612643e-05,
1211
+ "loss": 2.6118,
1212
+ "step": 6700
1213
+ },
1214
+ {
1215
+ "epoch": 1.7089953788694054,
1216
+ "grad_norm": 1.5186200141906738,
1217
+ "learning_rate": 2.3916713562183458e-05,
1218
+ "loss": 2.6126,
1219
+ "step": 6750
1220
+ },
1221
+ {
1222
+ "epoch": 1.7216560106349306,
1223
+ "grad_norm": 1.6869078874588013,
1224
+ "learning_rate": 2.368223597824048e-05,
1225
+ "loss": 2.576,
1226
+ "step": 6800
1227
+ },
1228
+ {
1229
+ "epoch": 1.7216560106349306,
1230
+ "eval_loss": 2.4459502696990967,
1231
+ "eval_runtime": 39.7653,
1232
+ "eval_samples_per_second": 44.159,
1233
+ "eval_steps_per_second": 44.159,
1234
+ "step": 6800
1235
+ },
1236
+ {
1237
+ "epoch": 1.7343166424004557,
1238
+ "grad_norm": 1.2578104734420776,
1239
+ "learning_rate": 2.3447758394297507e-05,
1240
+ "loss": 2.6178,
1241
+ "step": 6850
1242
+ },
1243
+ {
1244
+ "epoch": 1.7469772741659808,
1245
+ "grad_norm": 1.7597213983535767,
1246
+ "learning_rate": 2.3213280810354532e-05,
1247
+ "loss": 2.6358,
1248
+ "step": 6900
1249
+ },
1250
+ {
1251
+ "epoch": 1.759637905931506,
1252
+ "grad_norm": 2.144465923309326,
1253
+ "learning_rate": 2.2978803226411554e-05,
1254
+ "loss": 2.5597,
1255
+ "step": 6950
1256
+ },
1257
+ {
1258
+ "epoch": 1.772298537697031,
1259
+ "grad_norm": 1.1808464527130127,
1260
+ "learning_rate": 2.2744325642468582e-05,
1261
+ "loss": 2.6269,
1262
+ "step": 7000
1263
+ },
1264
+ {
1265
+ "epoch": 1.772298537697031,
1266
+ "eval_loss": 2.4444611072540283,
1267
+ "eval_runtime": 40.0709,
1268
+ "eval_samples_per_second": 43.822,
1269
+ "eval_steps_per_second": 43.822,
1270
+ "step": 7000
1271
+ },
1272
+ {
1273
+ "epoch": 1.7849591694625562,
1274
+ "grad_norm": 1.4550806283950806,
1275
+ "learning_rate": 2.2509848058525606e-05,
1276
+ "loss": 2.6206,
1277
+ "step": 7050
1278
+ },
1279
+ {
1280
+ "epoch": 1.7976198012280813,
1281
+ "grad_norm": 1.2635902166366577,
1282
+ "learning_rate": 2.227537047458263e-05,
1283
+ "loss": 2.5722,
1284
+ "step": 7100
1285
+ },
1286
+ {
1287
+ "epoch": 1.8102804329936064,
1288
+ "grad_norm": 1.3835856914520264,
1289
+ "learning_rate": 2.2040892890639656e-05,
1290
+ "loss": 2.535,
1291
+ "step": 7150
1292
+ },
1293
+ {
1294
+ "epoch": 1.8229410647591315,
1295
+ "grad_norm": 1.735004186630249,
1296
+ "learning_rate": 2.180641530669668e-05,
1297
+ "loss": 2.6086,
1298
+ "step": 7200
1299
+ },
1300
+ {
1301
+ "epoch": 1.8229410647591315,
1302
+ "eval_loss": 2.443899154663086,
1303
+ "eval_runtime": 40.91,
1304
+ "eval_samples_per_second": 42.924,
1305
+ "eval_steps_per_second": 42.924,
1306
+ "step": 7200
1307
+ },
1308
+ {
1309
+ "epoch": 1.8356016965246567,
1310
+ "grad_norm": 1.263051986694336,
1311
+ "learning_rate": 2.1571937722753706e-05,
1312
+ "loss": 2.5544,
1313
+ "step": 7250
1314
+ },
1315
+ {
1316
+ "epoch": 1.8482623282901818,
1317
+ "grad_norm": 1.0899442434310913,
1318
+ "learning_rate": 2.133746013881073e-05,
1319
+ "loss": 2.5603,
1320
+ "step": 7300
1321
+ },
1322
+ {
1323
+ "epoch": 1.860922960055707,
1324
+ "grad_norm": 3.038811206817627,
1325
+ "learning_rate": 2.1102982554867755e-05,
1326
+ "loss": 2.5688,
1327
+ "step": 7350
1328
+ },
1329
+ {
1330
+ "epoch": 1.873583591821232,
1331
+ "grad_norm": 1.6385984420776367,
1332
+ "learning_rate": 2.086850497092478e-05,
1333
+ "loss": 2.6006,
1334
+ "step": 7400
1335
+ },
1336
+ {
1337
+ "epoch": 1.873583591821232,
1338
+ "eval_loss": 2.443300724029541,
1339
+ "eval_runtime": 40.6649,
1340
+ "eval_samples_per_second": 43.182,
1341
+ "eval_steps_per_second": 43.182,
1342
+ "step": 7400
1343
+ },
1344
+ {
1345
+ "epoch": 1.8862442235867571,
1346
+ "grad_norm": 1.2857129573822021,
1347
+ "learning_rate": 2.0634027386981805e-05,
1348
+ "loss": 2.5563,
1349
+ "step": 7450
1350
+ },
1351
+ {
1352
+ "epoch": 1.898904855352282,
1353
+ "grad_norm": 1.0289497375488281,
1354
+ "learning_rate": 2.0399549803038833e-05,
1355
+ "loss": 2.5671,
1356
+ "step": 7500
1357
+ },
1358
+ {
1359
+ "epoch": 1.9115654871178072,
1360
+ "grad_norm": 1.5041025876998901,
1361
+ "learning_rate": 2.0165072219095854e-05,
1362
+ "loss": 2.5689,
1363
+ "step": 7550
1364
+ },
1365
+ {
1366
+ "epoch": 1.9242261188833323,
1367
+ "grad_norm": 1.6611964702606201,
1368
+ "learning_rate": 1.993528418683174e-05,
1369
+ "loss": 2.5801,
1370
+ "step": 7600
1371
+ },
1372
+ {
1373
+ "epoch": 1.9242261188833323,
1374
+ "eval_loss": 2.443532943725586,
1375
+ "eval_runtime": 39.931,
1376
+ "eval_samples_per_second": 43.976,
1377
+ "eval_steps_per_second": 43.976,
1378
+ "step": 7600
1379
+ },
1380
+ {
1381
+ "epoch": 1.9368867506488574,
1382
+ "grad_norm": 1.521170735359192,
1383
+ "learning_rate": 1.9700806602888767e-05,
1384
+ "loss": 2.5969,
1385
+ "step": 7650
1386
+ },
1387
+ {
1388
+ "epoch": 1.9495473824143825,
1389
+ "grad_norm": 1.3700034618377686,
1390
+ "learning_rate": 1.946632901894579e-05,
1391
+ "loss": 2.6306,
1392
+ "step": 7700
1393
+ },
1394
+ {
1395
+ "epoch": 1.9622080141799074,
1396
+ "grad_norm": 2.311443328857422,
1397
+ "learning_rate": 1.9231851435002814e-05,
1398
+ "loss": 2.5608,
1399
+ "step": 7750
1400
+ },
1401
+ {
1402
+ "epoch": 1.9748686459454325,
1403
+ "grad_norm": 1.6699820756912231,
1404
+ "learning_rate": 1.8997373851059842e-05,
1405
+ "loss": 2.5113,
1406
+ "step": 7800
1407
+ },
1408
+ {
1409
+ "epoch": 1.9748686459454325,
1410
+ "eval_loss": 2.4421675205230713,
1411
+ "eval_runtime": 40.1783,
1412
+ "eval_samples_per_second": 43.705,
1413
+ "eval_steps_per_second": 43.705,
1414
+ "step": 7800
1415
+ },
1416
+ {
1417
+ "epoch": 1.9875292777109577,
1418
+ "grad_norm": 1.2560683488845825,
1419
+ "learning_rate": 1.8762896267116863e-05,
1420
+ "loss": 2.545,
1421
+ "step": 7850
1422
+ },
1423
+ {
1424
+ "epoch": 2.0,
1425
+ "grad_norm": 2.176563262939453,
1426
+ "learning_rate": 1.852841868317389e-05,
1427
+ "loss": 2.5752,
1428
+ "step": 7900
1429
+ },
1430
+ {
1431
+ "epoch": 2.012660631765525,
1432
+ "grad_norm": 1.2551178932189941,
1433
+ "learning_rate": 1.8293941099230916e-05,
1434
+ "loss": 2.5215,
1435
+ "step": 7950
1436
+ },
1437
+ {
1438
+ "epoch": 2.0253212635310502,
1439
+ "grad_norm": 1.5646872520446777,
1440
+ "learning_rate": 1.8059463515287937e-05,
1441
+ "loss": 2.5838,
1442
+ "step": 8000
1443
+ },
1444
+ {
1445
+ "epoch": 2.0253212635310502,
1446
+ "eval_loss": 2.441195249557495,
1447
+ "eval_runtime": 39.701,
1448
+ "eval_samples_per_second": 44.231,
1449
+ "eval_steps_per_second": 44.231,
1450
+ "step": 8000
1451
+ },
1452
+ {
1453
+ "epoch": 2.0379818952965754,
1454
+ "grad_norm": 1.4227900505065918,
1455
+ "learning_rate": 1.7824985931344966e-05,
1456
+ "loss": 2.5597,
1457
+ "step": 8050
1458
+ },
1459
+ {
1460
+ "epoch": 2.0506425270621005,
1461
+ "grad_norm": 1.3013832569122314,
1462
+ "learning_rate": 1.759050834740199e-05,
1463
+ "loss": 2.7641,
1464
+ "step": 8100
1465
+ },
1466
+ {
1467
+ "epoch": 2.0633031588276256,
1468
+ "grad_norm": 1.1282143592834473,
1469
+ "learning_rate": 1.7356030763459015e-05,
1470
+ "loss": 2.5875,
1471
+ "step": 8150
1472
+ },
1473
+ {
1474
+ "epoch": 2.0759637905931507,
1475
+ "grad_norm": 2.079760789871216,
1476
+ "learning_rate": 1.712155317951604e-05,
1477
+ "loss": 2.4861,
1478
+ "step": 8200
1479
+ },
1480
+ {
1481
+ "epoch": 2.0759637905931507,
1482
+ "eval_loss": 2.440812826156616,
1483
+ "eval_runtime": 40.0764,
1484
+ "eval_samples_per_second": 43.816,
1485
+ "eval_steps_per_second": 43.816,
1486
+ "step": 8200
1487
+ },
1488
+ {
1489
+ "epoch": 2.088624422358676,
1490
+ "grad_norm": 1.0884991884231567,
1491
+ "learning_rate": 1.6887075595573065e-05,
1492
+ "loss": 2.5941,
1493
+ "step": 8250
1494
+ },
1495
+ {
1496
+ "epoch": 2.101285054124201,
1497
+ "grad_norm": 1.9202015399932861,
1498
+ "learning_rate": 1.665259801163009e-05,
1499
+ "loss": 2.5929,
1500
+ "step": 8300
1501
+ },
1502
+ {
1503
+ "epoch": 2.113945685889726,
1504
+ "grad_norm": 1.5925830602645874,
1505
+ "learning_rate": 1.6418120427687114e-05,
1506
+ "loss": 2.5046,
1507
+ "step": 8350
1508
+ },
1509
+ {
1510
+ "epoch": 2.126606317655251,
1511
+ "grad_norm": 1.5219184160232544,
1512
+ "learning_rate": 1.618364284374414e-05,
1513
+ "loss": 2.5628,
1514
+ "step": 8400
1515
+ },
1516
+ {
1517
+ "epoch": 2.126606317655251,
1518
+ "eval_loss": 2.4396440982818604,
1519
+ "eval_runtime": 40.0053,
1520
+ "eval_samples_per_second": 43.894,
1521
+ "eval_steps_per_second": 43.894,
1522
+ "step": 8400
1523
+ },
1524
+ {
1525
+ "epoch": 2.139266949420776,
1526
+ "grad_norm": 1.4882445335388184,
1527
+ "learning_rate": 1.5949165259801164e-05,
1528
+ "loss": 2.6268,
1529
+ "step": 8450
1530
+ },
1531
+ {
1532
+ "epoch": 2.151927581186301,
1533
+ "grad_norm": 1.3513301610946655,
1534
+ "learning_rate": 1.571468767585819e-05,
1535
+ "loss": 2.5277,
1536
+ "step": 8500
1537
+ },
1538
+ {
1539
+ "epoch": 2.164588212951826,
1540
+ "grad_norm": 1.690974473953247,
1541
+ "learning_rate": 1.5480210091915216e-05,
1542
+ "loss": 2.5631,
1543
+ "step": 8550
1544
+ },
1545
+ {
1546
+ "epoch": 2.1772488447173513,
1547
+ "grad_norm": 1.5311528444290161,
1548
+ "learning_rate": 1.5245732507972238e-05,
1549
+ "loss": 2.5454,
1550
+ "step": 8600
1551
+ },
1552
+ {
1553
+ "epoch": 2.1772488447173513,
1554
+ "eval_loss": 2.4388718605041504,
1555
+ "eval_runtime": 40.0289,
1556
+ "eval_samples_per_second": 43.868,
1557
+ "eval_steps_per_second": 43.868,
1558
+ "step": 8600
1559
+ },
1560
+ {
1561
+ "epoch": 2.1899094764828764,
1562
+ "grad_norm": 2.1171281337738037,
1563
+ "learning_rate": 1.5011254924029264e-05,
1564
+ "loss": 2.6112,
1565
+ "step": 8650
1566
+ },
1567
+ {
1568
+ "epoch": 2.2025701082484015,
1569
+ "grad_norm": 1.9706814289093018,
1570
+ "learning_rate": 1.4776777340086289e-05,
1571
+ "loss": 2.588,
1572
+ "step": 8700
1573
+ },
1574
+ {
1575
+ "epoch": 2.2152307400139266,
1576
+ "grad_norm": 1.8991297483444214,
1577
+ "learning_rate": 1.4542299756143312e-05,
1578
+ "loss": 2.5655,
1579
+ "step": 8750
1580
+ },
1581
+ {
1582
+ "epoch": 2.2278913717794517,
1583
+ "grad_norm": 1.5568820238113403,
1584
+ "learning_rate": 1.4307822172200339e-05,
1585
+ "loss": 2.5312,
1586
+ "step": 8800
1587
+ },
1588
+ {
1589
+ "epoch": 2.2278913717794517,
1590
+ "eval_loss": 2.438715696334839,
1591
+ "eval_runtime": 40.1748,
1592
+ "eval_samples_per_second": 43.709,
1593
+ "eval_steps_per_second": 43.709,
1594
+ "step": 8800
1595
+ },
1596
+ {
1597
+ "epoch": 2.240552003544977,
1598
+ "grad_norm": 1.277051329612732,
1599
+ "learning_rate": 1.4073344588257365e-05,
1600
+ "loss": 2.5818,
1601
+ "step": 8850
1602
+ },
1603
+ {
1604
+ "epoch": 2.253212635310502,
1605
+ "grad_norm": 1.8890128135681152,
1606
+ "learning_rate": 1.3838867004314388e-05,
1607
+ "loss": 2.5211,
1608
+ "step": 8900
1609
+ },
1610
+ {
1611
+ "epoch": 2.265873267076027,
1612
+ "grad_norm": 1.8824830055236816,
1613
+ "learning_rate": 1.3604389420371413e-05,
1614
+ "loss": 2.53,
1615
+ "step": 8950
1616
+ },
1617
+ {
1618
+ "epoch": 2.278533898841552,
1619
+ "grad_norm": 1.239490032196045,
1620
+ "learning_rate": 1.336991183642844e-05,
1621
+ "loss": 2.5889,
1622
+ "step": 9000
1623
+ },
1624
+ {
1625
+ "epoch": 2.278533898841552,
1626
+ "eval_loss": 2.437577962875366,
1627
+ "eval_runtime": 40.1654,
1628
+ "eval_samples_per_second": 43.719,
1629
+ "eval_steps_per_second": 43.719,
1630
+ "step": 9000
1631
+ },
1632
+ {
1633
+ "epoch": 2.2911945306070773,
1634
+ "grad_norm": 1.7253328561782837,
1635
+ "learning_rate": 1.3135434252485462e-05,
1636
+ "loss": 2.5426,
1637
+ "step": 9050
1638
+ },
1639
+ {
1640
+ "epoch": 2.3038551623726025,
1641
+ "grad_norm": 1.6971838474273682,
1642
+ "learning_rate": 1.2900956668542489e-05,
1643
+ "loss": 2.4953,
1644
+ "step": 9100
1645
+ },
1646
+ {
1647
+ "epoch": 2.3165157941381276,
1648
+ "grad_norm": 1.4906270503997803,
1649
+ "learning_rate": 1.2666479084599514e-05,
1650
+ "loss": 2.606,
1651
+ "step": 9150
1652
+ },
1653
+ {
1654
+ "epoch": 2.3291764259036527,
1655
+ "grad_norm": 1.658526062965393,
1656
+ "learning_rate": 1.2432001500656538e-05,
1657
+ "loss": 2.5483,
1658
+ "step": 9200
1659
+ },
1660
+ {
1661
+ "epoch": 2.3291764259036527,
1662
+ "eval_loss": 2.437896490097046,
1663
+ "eval_runtime": 40.7008,
1664
+ "eval_samples_per_second": 43.144,
1665
+ "eval_steps_per_second": 43.144,
1666
+ "step": 9200
1667
+ },
1668
+ {
1669
+ "epoch": 2.341837057669178,
1670
+ "grad_norm": 1.0781177282333374,
1671
+ "learning_rate": 1.2197523916713563e-05,
1672
+ "loss": 2.5449,
1673
+ "step": 9250
1674
+ },
1675
+ {
1676
+ "epoch": 2.354497689434703,
1677
+ "grad_norm": 2.1414873600006104,
1678
+ "learning_rate": 1.1963046332770588e-05,
1679
+ "loss": 2.5303,
1680
+ "step": 9300
1681
+ },
1682
+ {
1683
+ "epoch": 2.367158321200228,
1684
+ "grad_norm": 2.063297986984253,
1685
+ "learning_rate": 1.1728568748827613e-05,
1686
+ "loss": 2.5837,
1687
+ "step": 9350
1688
+ },
1689
+ {
1690
+ "epoch": 2.3798189529657527,
1691
+ "grad_norm": 1.2153489589691162,
1692
+ "learning_rate": 1.1494091164884637e-05,
1693
+ "loss": 2.6384,
1694
+ "step": 9400
1695
+ },
1696
+ {
1697
+ "epoch": 2.3798189529657527,
1698
+ "eval_loss": 2.4365696907043457,
1699
+ "eval_runtime": 40.2398,
1700
+ "eval_samples_per_second": 43.638,
1701
+ "eval_steps_per_second": 43.638,
1702
+ "step": 9400
1703
+ },
1704
+ {
1705
+ "epoch": 2.3924795847312783,
1706
+ "grad_norm": 1.2976094484329224,
1707
+ "learning_rate": 1.1259613580941662e-05,
1708
+ "loss": 2.572,
1709
+ "step": 9450
1710
+ },
1711
+ {
1712
+ "epoch": 2.405140216496803,
1713
+ "grad_norm": 1.2775920629501343,
1714
+ "learning_rate": 1.1025135996998689e-05,
1715
+ "loss": 2.6083,
1716
+ "step": 9500
1717
+ },
1718
+ {
1719
+ "epoch": 2.417800848262328,
1720
+ "grad_norm": 1.358311653137207,
1721
+ "learning_rate": 1.0790658413055713e-05,
1722
+ "loss": 2.5206,
1723
+ "step": 9550
1724
+ },
1725
+ {
1726
+ "epoch": 2.4304614800278532,
1727
+ "grad_norm": 1.3438369035720825,
1728
+ "learning_rate": 1.0556180829112736e-05,
1729
+ "loss": 2.4967,
1730
+ "step": 9600
1731
+ },
1732
+ {
1733
+ "epoch": 2.4304614800278532,
1734
+ "eval_loss": 2.4359662532806396,
1735
+ "eval_runtime": 40.0802,
1736
+ "eval_samples_per_second": 43.812,
1737
+ "eval_steps_per_second": 43.812,
1738
+ "step": 9600
1739
+ },
1740
+ {
1741
+ "epoch": 2.4431221117933783,
1742
+ "grad_norm": 1.2618831396102905,
1743
+ "learning_rate": 1.0321703245169763e-05,
1744
+ "loss": 2.6169,
1745
+ "step": 9650
1746
+ },
1747
+ {
1748
+ "epoch": 2.4557827435589035,
1749
+ "grad_norm": 1.3764727115631104,
1750
+ "learning_rate": 1.0087225661226788e-05,
1751
+ "loss": 2.5444,
1752
+ "step": 9700
1753
+ },
1754
+ {
1755
+ "epoch": 2.4684433753244286,
1756
+ "grad_norm": 1.604864478111267,
1757
+ "learning_rate": 9.852748077283812e-06,
1758
+ "loss": 2.5343,
1759
+ "step": 9750
1760
+ },
1761
+ {
1762
+ "epoch": 2.4811040070899537,
1763
+ "grad_norm": 1.390496850013733,
1764
+ "learning_rate": 9.618270493340837e-06,
1765
+ "loss": 2.5051,
1766
+ "step": 9800
1767
+ },
1768
+ {
1769
+ "epoch": 2.4811040070899537,
1770
+ "eval_loss": 2.4353232383728027,
1771
+ "eval_runtime": 40.1607,
1772
+ "eval_samples_per_second": 43.724,
1773
+ "eval_steps_per_second": 43.724,
1774
+ "step": 9800
1775
+ },
1776
+ {
1777
+ "epoch": 2.493764638855479,
1778
+ "grad_norm": 2.1982169151306152,
1779
+ "learning_rate": 9.383792909397862e-06,
1780
+ "loss": 2.5036,
1781
+ "step": 9850
1782
+ },
1783
+ {
1784
+ "epoch": 2.506425270621004,
1785
+ "grad_norm": 1.3033822774887085,
1786
+ "learning_rate": 9.149315325454887e-06,
1787
+ "loss": 2.5205,
1788
+ "step": 9900
1789
+ },
1790
+ {
1791
+ "epoch": 2.519085902386529,
1792
+ "grad_norm": 1.682586431503296,
1793
+ "learning_rate": 8.919527293190772e-06,
1794
+ "loss": 2.6083,
1795
+ "step": 9950
1796
+ },
1797
+ {
1798
+ "epoch": 2.531746534152054,
1799
+ "grad_norm": 3.184382200241089,
1800
+ "learning_rate": 8.685049709247797e-06,
1801
+ "loss": 2.5314,
1802
+ "step": 10000
1803
+ },
1804
+ {
1805
+ "epoch": 2.531746534152054,
1806
+ "eval_loss": 2.434755802154541,
1807
+ "eval_runtime": 40.2877,
1808
+ "eval_samples_per_second": 43.587,
1809
+ "eval_steps_per_second": 43.587,
1810
+ "step": 10000
1811
+ },
1812
+ {
1813
+ "epoch": 2.5444071659175793,
1814
+ "grad_norm": 2.0026867389678955,
1815
+ "learning_rate": 8.450572125304821e-06,
1816
+ "loss": 2.5109,
1817
+ "step": 10050
1818
+ },
1819
+ {
1820
+ "epoch": 2.5570677976831044,
1821
+ "grad_norm": 1.3833885192871094,
1822
+ "learning_rate": 8.216094541361846e-06,
1823
+ "loss": 2.5362,
1824
+ "step": 10100
1825
+ },
1826
+ {
1827
+ "epoch": 2.5697284294486296,
1828
+ "grad_norm": 2.157984495162964,
1829
+ "learning_rate": 7.981616957418871e-06,
1830
+ "loss": 2.5423,
1831
+ "step": 10150
1832
+ },
1833
+ {
1834
+ "epoch": 2.5823890612141547,
1835
+ "grad_norm": 1.682053565979004,
1836
+ "learning_rate": 7.747139373475897e-06,
1837
+ "loss": 2.5133,
1838
+ "step": 10200
1839
+ },
1840
+ {
1841
+ "epoch": 2.5823890612141547,
1842
+ "eval_loss": 2.435208559036255,
1843
+ "eval_runtime": 40.4768,
1844
+ "eval_samples_per_second": 43.383,
1845
+ "eval_steps_per_second": 43.383,
1846
+ "step": 10200
1847
+ },
1848
+ {
1849
+ "epoch": 2.59504969297968,
1850
+ "grad_norm": 1.9720139503479004,
1851
+ "learning_rate": 7.512661789532921e-06,
1852
+ "loss": 2.6372,
1853
+ "step": 10250
1854
+ },
1855
+ {
1856
+ "epoch": 2.607710324745205,
1857
+ "grad_norm": 1.6906607151031494,
1858
+ "learning_rate": 7.278184205589945e-06,
1859
+ "loss": 2.5505,
1860
+ "step": 10300
1861
+ },
1862
+ {
1863
+ "epoch": 2.62037095651073,
1864
+ "grad_norm": 1.484045147895813,
1865
+ "learning_rate": 7.043706621646972e-06,
1866
+ "loss": 2.5095,
1867
+ "step": 10350
1868
+ },
1869
+ {
1870
+ "epoch": 2.633031588276255,
1871
+ "grad_norm": 1.6676850318908691,
1872
+ "learning_rate": 6.8092290377039955e-06,
1873
+ "loss": 2.6487,
1874
+ "step": 10400
1875
+ },
1876
+ {
1877
+ "epoch": 2.633031588276255,
1878
+ "eval_loss": 2.4344091415405273,
1879
+ "eval_runtime": 39.869,
1880
+ "eval_samples_per_second": 44.044,
1881
+ "eval_steps_per_second": 44.044,
1882
+ "step": 10400
1883
+ },
1884
+ {
1885
+ "epoch": 2.64569222004178,
1886
+ "grad_norm": 1.5012388229370117,
1887
+ "learning_rate": 6.57475145376102e-06,
1888
+ "loss": 2.5756,
1889
+ "step": 10450
1890
+ },
1891
+ {
1892
+ "epoch": 2.6583528518073054,
1893
+ "grad_norm": 1.043954849243164,
1894
+ "learning_rate": 6.340273869818046e-06,
1895
+ "loss": 2.5843,
1896
+ "step": 10500
1897
+ },
1898
+ {
1899
+ "epoch": 2.67101348357283,
1900
+ "grad_norm": 1.0455141067504883,
1901
+ "learning_rate": 6.105796285875071e-06,
1902
+ "loss": 2.5248,
1903
+ "step": 10550
1904
+ },
1905
+ {
1906
+ "epoch": 2.6836741153383556,
1907
+ "grad_norm": 1.39467453956604,
1908
+ "learning_rate": 5.871318701932095e-06,
1909
+ "loss": 2.5091,
1910
+ "step": 10600
1911
+ },
1912
+ {
1913
+ "epoch": 2.6836741153383556,
1914
+ "eval_loss": 2.4331610202789307,
1915
+ "eval_runtime": 39.923,
1916
+ "eval_samples_per_second": 43.985,
1917
+ "eval_steps_per_second": 43.985,
1918
+ "step": 10600
1919
+ },
1920
+ {
1921
+ "epoch": 2.6963347471038803,
1922
+ "grad_norm": 1.1417715549468994,
1923
+ "learning_rate": 5.63684111798912e-06,
1924
+ "loss": 2.5853,
1925
+ "step": 10650
1926
+ },
1927
+ {
1928
+ "epoch": 2.7089953788694054,
1929
+ "grad_norm": 1.133244514465332,
1930
+ "learning_rate": 5.402363534046146e-06,
1931
+ "loss": 2.5457,
1932
+ "step": 10700
1933
+ },
1934
+ {
1935
+ "epoch": 2.7216560106349306,
1936
+ "grad_norm": 1.2331452369689941,
1937
+ "learning_rate": 5.1678859501031705e-06,
1938
+ "loss": 2.5576,
1939
+ "step": 10750
1940
+ },
1941
+ {
1942
+ "epoch": 2.7343166424004557,
1943
+ "grad_norm": 1.7164263725280762,
1944
+ "learning_rate": 4.933408366160195e-06,
1945
+ "loss": 2.5471,
1946
+ "step": 10800
1947
+ },
1948
+ {
1949
+ "epoch": 2.7343166424004557,
1950
+ "eval_loss": 2.4340403079986572,
1951
+ "eval_runtime": 40.3849,
1952
+ "eval_samples_per_second": 43.482,
1953
+ "eval_steps_per_second": 43.482,
1954
+ "step": 10800
1955
+ },
1956
+ {
1957
+ "epoch": 2.746977274165981,
1958
+ "grad_norm": 1.3680106401443481,
1959
+ "learning_rate": 4.69893078221722e-06,
1960
+ "loss": 2.5562,
1961
+ "step": 10850
1962
+ },
1963
+ {
1964
+ "epoch": 2.759637905931506,
1965
+ "grad_norm": 1.0978279113769531,
1966
+ "learning_rate": 4.464453198274246e-06,
1967
+ "loss": 2.5185,
1968
+ "step": 10900
1969
+ },
1970
+ {
1971
+ "epoch": 2.772298537697031,
1972
+ "grad_norm": 1.2212647199630737,
1973
+ "learning_rate": 4.2299756143312695e-06,
1974
+ "loss": 2.6371,
1975
+ "step": 10950
1976
+ },
1977
+ {
1978
+ "epoch": 2.784959169462556,
1979
+ "grad_norm": 1.6452165842056274,
1980
+ "learning_rate": 3.995498030388295e-06,
1981
+ "loss": 2.681,
1982
+ "step": 11000
1983
+ },
1984
+ {
1985
+ "epoch": 2.784959169462556,
1986
+ "eval_loss": 2.4337143898010254,
1987
+ "eval_runtime": 40.4235,
1988
+ "eval_samples_per_second": 43.44,
1989
+ "eval_steps_per_second": 43.44,
1990
+ "step": 11000
1991
+ },
1992
+ {
1993
+ "epoch": 2.7976198012280813,
1994
+ "grad_norm": 1.7757978439331055,
1995
+ "learning_rate": 3.7610204464453203e-06,
1996
+ "loss": 2.5746,
1997
+ "step": 11050
1998
+ },
1999
+ {
2000
+ "epoch": 2.8102804329936064,
2001
+ "grad_norm": 1.2373579740524292,
2002
+ "learning_rate": 3.5265428625023455e-06,
2003
+ "loss": 2.5412,
2004
+ "step": 11100
2005
+ },
2006
+ {
2007
+ "epoch": 2.8229410647591315,
2008
+ "grad_norm": 1.1407558917999268,
2009
+ "learning_rate": 3.29206527855937e-06,
2010
+ "loss": 2.5973,
2011
+ "step": 11150
2012
+ },
2013
+ {
2014
+ "epoch": 2.8356016965246567,
2015
+ "grad_norm": 2.399686813354492,
2016
+ "learning_rate": 3.057587694616395e-06,
2017
+ "loss": 2.5566,
2018
+ "step": 11200
2019
+ },
2020
+ {
2021
+ "epoch": 2.8356016965246567,
2022
+ "eval_loss": 2.4338231086730957,
2023
+ "eval_runtime": 40.4877,
2024
+ "eval_samples_per_second": 43.371,
2025
+ "eval_steps_per_second": 43.371,
2026
+ "step": 11200
2027
+ },
2028
+ {
2029
+ "epoch": 2.8482623282901818,
2030
+ "grad_norm": 1.7053141593933105,
2031
+ "learning_rate": 2.8231101106734197e-06,
2032
+ "loss": 2.6224,
2033
+ "step": 11250
2034
+ },
2035
+ {
2036
+ "epoch": 2.860922960055707,
2037
+ "grad_norm": 1.8215903043746948,
2038
+ "learning_rate": 2.5886325267304445e-06,
2039
+ "loss": 2.5108,
2040
+ "step": 11300
2041
+ },
2042
+ {
2043
+ "epoch": 2.873583591821232,
2044
+ "grad_norm": 1.1648200750350952,
2045
+ "learning_rate": 2.3541549427874697e-06,
2046
+ "loss": 2.557,
2047
+ "step": 11350
2048
+ },
2049
+ {
2050
+ "epoch": 2.886244223586757,
2051
+ "grad_norm": 1.5225868225097656,
2052
+ "learning_rate": 2.1196773588444944e-06,
2053
+ "loss": 2.6285,
2054
+ "step": 11400
2055
+ },
2056
+ {
2057
+ "epoch": 2.886244223586757,
2058
+ "eval_loss": 2.4334514141082764,
2059
+ "eval_runtime": 40.3985,
2060
+ "eval_samples_per_second": 43.467,
2061
+ "eval_steps_per_second": 43.467,
2062
+ "step": 11400
2063
+ },
2064
+ {
2065
+ "epoch": 2.8989048553522823,
2066
+ "grad_norm": 1.4937622547149658,
2067
+ "learning_rate": 1.8851997749015194e-06,
2068
+ "loss": 2.481,
2069
+ "step": 11450
2070
+ },
2071
+ {
2072
+ "epoch": 2.911565487117807,
2073
+ "grad_norm": 1.9169902801513672,
2074
+ "learning_rate": 1.6507221909585446e-06,
2075
+ "loss": 2.5412,
2076
+ "step": 11500
2077
+ },
2078
+ {
2079
+ "epoch": 2.9242261188833325,
2080
+ "grad_norm": 1.6611114740371704,
2081
+ "learning_rate": 1.4162446070155693e-06,
2082
+ "loss": 2.5086,
2083
+ "step": 11550
2084
+ },
2085
+ {
2086
+ "epoch": 2.936886750648857,
2087
+ "grad_norm": 1.3464007377624512,
2088
+ "learning_rate": 1.1817670230725943e-06,
2089
+ "loss": 2.6063,
2090
+ "step": 11600
2091
+ },
2092
+ {
2093
+ "epoch": 2.936886750648857,
2094
+ "eval_loss": 2.4329476356506348,
2095
+ "eval_runtime": 40.4334,
2096
+ "eval_samples_per_second": 43.429,
2097
+ "eval_steps_per_second": 43.429,
2098
+ "step": 11600
2099
+ },
2100
+ {
2101
+ "epoch": 2.9495473824143827,
2102
+ "grad_norm": 1.453385829925537,
2103
+ "learning_rate": 9.472894391296193e-07,
2104
+ "loss": 2.5012,
2105
+ "step": 11650
2106
+ },
2107
+ {
2108
+ "epoch": 2.9622080141799074,
2109
+ "grad_norm": 1.6921356916427612,
2110
+ "learning_rate": 7.128118551866442e-07,
2111
+ "loss": 2.5589,
2112
+ "step": 11700
2113
+ },
2114
+ {
2115
+ "epoch": 2.9748686459454325,
2116
+ "grad_norm": 1.0562982559204102,
2117
+ "learning_rate": 4.783342712436691e-07,
2118
+ "loss": 2.6015,
2119
+ "step": 11750
2120
+ },
2121
+ {
2122
+ "epoch": 2.9875292777109577,
2123
+ "grad_norm": 1.457960844039917,
2124
+ "learning_rate": 2.4385668730069406e-07,
2125
+ "loss": 2.6224,
2126
+ "step": 11800
2127
+ },
2128
+ {
2129
+ "epoch": 2.9875292777109577,
2130
+ "eval_loss": 2.432849645614624,
2131
+ "eval_runtime": 40.4518,
2132
+ "eval_samples_per_second": 43.41,
2133
+ "eval_steps_per_second": 43.41,
2134
+ "step": 11800
2135
+ }
2136
+ ],
2137
+ "logging_steps": 50,
2138
+ "max_steps": 11847,
2139
+ "num_input_tokens_seen": 0,
2140
+ "num_train_epochs": 3,
2141
+ "save_steps": 200,
2142
+ "stateful_callbacks": {
2143
+ "TrainerControl": {
2144
+ "args": {
2145
+ "should_epoch_stop": false,
2146
+ "should_evaluate": false,
2147
+ "should_log": false,
2148
+ "should_save": true,
2149
+ "should_training_stop": true
2150
+ },
2151
+ "attributes": {}
2152
+ }
2153
+ },
2154
+ "total_flos": 6249293680214016.0,
2155
+ "train_batch_size": 1,
2156
+ "trial_name": null,
2157
+ "trial_params": null
2158
+ }
checkpoint-11847/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d46b7107c3dfd17ff0fb12b8cdcc256a49e8d4594358d4495904a5fcb069382
3
+ size 5304
checkpoint-11847/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d46b7107c3dfd17ff0fb12b8cdcc256a49e8d4594358d4495904a5fcb069382
3
+ size 5304