ti250 committed on
Commit
0aaec32
·
verified ·
1 Parent(s): a3eb246

Upload 13 files

Browse files
README.md CHANGED
@@ -1,3 +1,47 @@
1
  ---
2
- license: mit
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - generated_from_trainer
4
+ metrics:
5
+ - accuracy
6
+ model-index:
7
+ - name: MLM-final-scraped-g
8
+ results: []
9
  ---
10
+
11
+ # photocatalysisbert
12
+
13
+ This model is pretrained on a corpus of papers on photocatalysis. For more detailed training procedures, see
14
+ "How beneficial is pre-training on a narrow domain-specific corpus for information extraction about photocatalytic water splitting?" by Taketomo Isazawa and Jacqueline M. Cole.
15
+
16
+ It achieves the following results on the evaluation set:
17
+ - Loss: 1.0215
18
+ - Accuracy: 0.7554
19
+
20
+ ## Training procedure
21
+
22
+ ### Training hyperparameters
23
+
24
+ The following hyperparameters were used during training:
25
+ - learning_rate: 8e-05
26
+ - train_batch_size: 32
27
+ - eval_batch_size: 4
28
+ - seed: 0
29
+ - distributed_type: multi-GPU
30
+ - num_devices: 64
31
+ - total_train_batch_size: 2048
32
+ - total_eval_batch_size: 256
33
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
34
+ - lr_scheduler_type: linear
35
+ - lr_scheduler_warmup_steps: 10000
36
+ - training_steps: 187500
37
+
38
+ ### Framework versions
39
+
40
+ - Transformers 4.25.1
41
+ - Pytorch 1.12.0a0+git664058f
42
+ - Datasets 2.7.1
43
+ - Tokenizers 0.12.1
44
+
45
+ ## Acknowledgements
46
+
47
+ This model was trained for the paper "How beneficial is pre-training on a narrow domain-specific corpus for information extraction about photocatalytic water splitting?" by Taketomo Isazawa and Jacqueline M. Cole. J.M.C. is grateful for the BASF/Royal Academy of Engineering Research Chair in Data-Driven Molecular Engineering of Functional Materials, which includes PhD studentship support (for T.I.). This Chair is also partly supported by the Science and Technology Facilities Council. They are also indebted to the Argonne Leadership Computing Facility, which is a DOE Office of Science Facility, for use of its research resources, under contract No. DE-AC02-06CH11357.
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 84.96,
3
+ "eval_accuracy": 0.7553893124870495,
4
+ "eval_loss": 1.021484375,
5
+ "eval_runtime": 669.8614,
6
+ "eval_samples": 261426,
7
+ "eval_samples_per_second": 390.269,
8
+ "eval_steps_per_second": 1.526,
9
+ "perplexity": 2.777314281640181,
10
+ "train_loss": 0.10515284895833334,
11
+ "train_runtime": 4947.3494,
12
+ "train_samples": 4518599,
13
+ "train_samples_per_second": 77617.32,
14
+ "train_steps_per_second": 37.899
15
+ }
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/grand/projects/SolarWindowsADSP/taketomo/polaris_outputs/MLM-final-scraped-g/checkpoint-170000",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "torch_dtype": "float16",
21
+ "transformers_version": "4.25.1",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
config_backup.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/grand/projects/SolarWindowsADSP/taketomo/polaris_outputs/MLM-final-scraped-g/checkpoint-170000",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "torch_dtype": "float16",
21
+ "transformers_version": "4.25.1",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 84.96,
3
+ "eval_accuracy": 0.7553893124870495,
4
+ "eval_loss": 1.021484375,
5
+ "eval_runtime": 669.8614,
6
+ "eval_samples": 261426,
7
+ "eval_samples_per_second": 390.269,
8
+ "eval_steps_per_second": 1.526,
9
+ "perplexity": 2.777314281640181
10
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7df55897a38cf818d86199a39d009676b7e4ebdddd92de7fbc9470104a4d0dd9
3
+ size 438085219
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_lower_case": true,
4
+ "mask_token": "[MASK]",
5
+ "model_max_length": 512,
6
+ "name_or_path": "/grand/projects/SolarWindowsADSP/taketomo/polaris_outputs/tokenizer-bert-base-uncased-scraped_only",
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "special_tokens_map_file": null,
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 84.96,
3
+ "train_loss": 0.10515284895833334,
4
+ "train_runtime": 4947.3494,
5
+ "train_samples": 4518599,
6
+ "train_samples_per_second": 77617.32,
7
+ "train_steps_per_second": 37.899
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 84.95695514272768,
5
+ "global_step": 187500,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.45,
12
+ "learning_rate": 7.864000000000001e-06,
13
+ "loss": 8.2983,
14
+ "step": 1000
15
+ },
16
+ {
17
+ "epoch": 0.91,
18
+ "learning_rate": 1.5856e-05,
19
+ "loss": 6.5434,
20
+ "step": 2000
21
+ },
22
+ {
23
+ "epoch": 1.36,
24
+ "learning_rate": 2.3848e-05,
25
+ "loss": 6.1765,
26
+ "step": 3000
27
+ },
28
+ {
29
+ "epoch": 1.81,
30
+ "learning_rate": 3.184000000000001e-05,
31
+ "loss": 5.9601,
32
+ "step": 4000
33
+ },
34
+ {
35
+ "epoch": 2.27,
36
+ "learning_rate": 3.9832e-05,
37
+ "loss": 5.8266,
38
+ "step": 5000
39
+ },
40
+ {
41
+ "epoch": 2.72,
42
+ "learning_rate": 4.7824e-05,
43
+ "loss": 5.7332,
44
+ "step": 6000
45
+ },
46
+ {
47
+ "epoch": 3.17,
48
+ "learning_rate": 5.5816e-05,
49
+ "loss": 5.6554,
50
+ "step": 7000
51
+ },
52
+ {
53
+ "epoch": 3.62,
54
+ "learning_rate": 6.380800000000001e-05,
55
+ "loss": 5.5975,
56
+ "step": 8000
57
+ },
58
+ {
59
+ "epoch": 4.08,
60
+ "learning_rate": 7.18e-05,
61
+ "loss": 5.5568,
62
+ "step": 9000
63
+ },
64
+ {
65
+ "epoch": 4.53,
66
+ "learning_rate": 7.9792e-05,
67
+ "loss": 5.5224,
68
+ "step": 10000
69
+ },
70
+ {
71
+ "epoch": 4.98,
72
+ "learning_rate": 7.95614647887324e-05,
73
+ "loss": 5.4944,
74
+ "step": 11000
75
+ },
76
+ {
77
+ "epoch": 5.44,
78
+ "learning_rate": 7.911121126760565e-05,
79
+ "loss": 5.473,
80
+ "step": 12000
81
+ },
82
+ {
83
+ "epoch": 5.89,
84
+ "learning_rate": 7.866095774647888e-05,
85
+ "loss": 5.4557,
86
+ "step": 13000
87
+ },
88
+ {
89
+ "epoch": 6.34,
90
+ "learning_rate": 7.821070422535212e-05,
91
+ "loss": 5.4392,
92
+ "step": 14000
93
+ },
94
+ {
95
+ "epoch": 6.8,
96
+ "learning_rate": 7.776045070422535e-05,
97
+ "loss": 5.4274,
98
+ "step": 15000
99
+ },
100
+ {
101
+ "epoch": 7.25,
102
+ "learning_rate": 7.73101971830986e-05,
103
+ "loss": 5.4168,
104
+ "step": 16000
105
+ },
106
+ {
107
+ "epoch": 7.7,
108
+ "learning_rate": 7.685994366197184e-05,
109
+ "loss": 5.4051,
110
+ "step": 17000
111
+ },
112
+ {
113
+ "epoch": 8.16,
114
+ "learning_rate": 7.640969014084507e-05,
115
+ "loss": 5.3963,
116
+ "step": 18000
117
+ },
118
+ {
119
+ "epoch": 8.61,
120
+ "learning_rate": 7.595943661971832e-05,
121
+ "loss": 5.3901,
122
+ "step": 19000
123
+ },
124
+ {
125
+ "epoch": 9.06,
126
+ "learning_rate": 7.550918309859156e-05,
127
+ "loss": 5.3848,
128
+ "step": 20000
129
+ },
130
+ {
131
+ "epoch": 9.52,
132
+ "learning_rate": 7.505892957746479e-05,
133
+ "loss": 5.2904,
134
+ "step": 21000
135
+ },
136
+ {
137
+ "epoch": 9.97,
138
+ "learning_rate": 7.460867605633804e-05,
139
+ "loss": 4.6695,
140
+ "step": 22000
141
+ },
142
+ {
143
+ "epoch": 10.42,
144
+ "learning_rate": 7.415842253521126e-05,
145
+ "loss": 3.6226,
146
+ "step": 23000
147
+ },
148
+ {
149
+ "epoch": 10.87,
150
+ "learning_rate": 7.370816901408451e-05,
151
+ "loss": 2.5281,
152
+ "step": 24000
153
+ },
154
+ {
155
+ "epoch": 11.33,
156
+ "learning_rate": 7.325791549295775e-05,
157
+ "loss": 2.2386,
158
+ "step": 25000
159
+ },
160
+ {
161
+ "epoch": 11.78,
162
+ "learning_rate": 7.2807661971831e-05,
163
+ "loss": 2.0823,
164
+ "step": 26000
165
+ },
166
+ {
167
+ "epoch": 12.23,
168
+ "learning_rate": 7.235740845070423e-05,
169
+ "loss": 1.9728,
170
+ "step": 27000
171
+ },
172
+ {
173
+ "epoch": 12.69,
174
+ "learning_rate": 7.190715492957747e-05,
175
+ "loss": 1.8923,
176
+ "step": 28000
177
+ },
178
+ {
179
+ "epoch": 13.14,
180
+ "learning_rate": 7.145690140845072e-05,
181
+ "loss": 1.8241,
182
+ "step": 29000
183
+ },
184
+ {
185
+ "epoch": 13.59,
186
+ "learning_rate": 7.100664788732395e-05,
187
+ "loss": 1.7682,
188
+ "step": 30000
189
+ },
190
+ {
191
+ "epoch": 14.05,
192
+ "learning_rate": 7.055639436619719e-05,
193
+ "loss": 1.7244,
194
+ "step": 31000
195
+ },
196
+ {
197
+ "epoch": 14.5,
198
+ "learning_rate": 7.010614084507043e-05,
199
+ "loss": 1.6804,
200
+ "step": 32000
201
+ },
202
+ {
203
+ "epoch": 14.95,
204
+ "learning_rate": 6.965588732394366e-05,
205
+ "loss": 1.6465,
206
+ "step": 33000
207
+ },
208
+ {
209
+ "epoch": 15.41,
210
+ "learning_rate": 6.920563380281691e-05,
211
+ "loss": 1.6159,
212
+ "step": 34000
213
+ },
214
+ {
215
+ "epoch": 15.86,
216
+ "learning_rate": 6.875538028169015e-05,
217
+ "loss": 1.5909,
218
+ "step": 35000
219
+ },
220
+ {
221
+ "epoch": 16.31,
222
+ "learning_rate": 6.830512676056338e-05,
223
+ "loss": 1.5664,
224
+ "step": 36000
225
+ },
226
+ {
227
+ "epoch": 16.76,
228
+ "learning_rate": 6.785487323943663e-05,
229
+ "loss": 1.5481,
230
+ "step": 37000
231
+ },
232
+ {
233
+ "epoch": 17.22,
234
+ "learning_rate": 6.740461971830987e-05,
235
+ "loss": 1.5282,
236
+ "step": 38000
237
+ },
238
+ {
239
+ "epoch": 17.67,
240
+ "learning_rate": 6.69543661971831e-05,
241
+ "loss": 1.5103,
242
+ "step": 39000
243
+ },
244
+ {
245
+ "epoch": 18.12,
246
+ "learning_rate": 6.650411267605634e-05,
247
+ "loss": 1.4946,
248
+ "step": 40000
249
+ },
250
+ {
251
+ "epoch": 18.58,
252
+ "learning_rate": 6.605385915492959e-05,
253
+ "loss": 1.4793,
254
+ "step": 41000
255
+ },
256
+ {
257
+ "epoch": 19.03,
258
+ "learning_rate": 6.560360563380282e-05,
259
+ "loss": 1.4647,
260
+ "step": 42000
261
+ },
262
+ {
263
+ "epoch": 19.48,
264
+ "learning_rate": 6.515335211267606e-05,
265
+ "loss": 1.4525,
266
+ "step": 43000
267
+ },
268
+ {
269
+ "epoch": 19.94,
270
+ "learning_rate": 6.47030985915493e-05,
271
+ "loss": 1.4408,
272
+ "step": 44000
273
+ },
274
+ {
275
+ "epoch": 20.39,
276
+ "learning_rate": 6.425284507042254e-05,
277
+ "loss": 1.429,
278
+ "step": 45000
279
+ },
280
+ {
281
+ "epoch": 20.84,
282
+ "learning_rate": 6.380259154929578e-05,
283
+ "loss": 1.4191,
284
+ "step": 46000
285
+ },
286
+ {
287
+ "epoch": 21.3,
288
+ "learning_rate": 6.335233802816903e-05,
289
+ "loss": 1.4093,
290
+ "step": 47000
291
+ },
292
+ {
293
+ "epoch": 21.75,
294
+ "learning_rate": 6.290208450704226e-05,
295
+ "loss": 1.399,
296
+ "step": 48000
297
+ },
298
+ {
299
+ "epoch": 22.2,
300
+ "learning_rate": 6.24518309859155e-05,
301
+ "loss": 1.3902,
302
+ "step": 49000
303
+ },
304
+ {
305
+ "epoch": 22.66,
306
+ "learning_rate": 6.200157746478873e-05,
307
+ "loss": 1.3811,
308
+ "step": 50000
309
+ },
310
+ {
311
+ "epoch": 23.11,
312
+ "learning_rate": 6.155132394366198e-05,
313
+ "loss": 1.3738,
314
+ "step": 51000
315
+ },
316
+ {
317
+ "epoch": 23.56,
318
+ "learning_rate": 6.110107042253522e-05,
319
+ "loss": 1.3651,
320
+ "step": 52000
321
+ },
322
+ {
323
+ "epoch": 24.01,
324
+ "learning_rate": 6.0650816901408453e-05,
325
+ "loss": 1.361,
326
+ "step": 53000
327
+ },
328
+ {
329
+ "epoch": 24.47,
330
+ "learning_rate": 6.0200563380281696e-05,
331
+ "loss": 1.3519,
332
+ "step": 54000
333
+ },
334
+ {
335
+ "epoch": 24.92,
336
+ "learning_rate": 5.975030985915493e-05,
337
+ "loss": 1.3456,
338
+ "step": 55000
339
+ },
340
+ {
341
+ "epoch": 25.37,
342
+ "learning_rate": 5.9300056338028174e-05,
343
+ "loss": 1.3394,
344
+ "step": 56000
345
+ },
346
+ {
347
+ "epoch": 25.83,
348
+ "learning_rate": 5.8849802816901416e-05,
349
+ "loss": 1.3332,
350
+ "step": 57000
351
+ },
352
+ {
353
+ "epoch": 26.28,
354
+ "learning_rate": 5.839954929577465e-05,
355
+ "loss": 1.3273,
356
+ "step": 58000
357
+ },
358
+ {
359
+ "epoch": 26.73,
360
+ "learning_rate": 5.7949295774647894e-05,
361
+ "loss": 1.3222,
362
+ "step": 59000
363
+ },
364
+ {
365
+ "epoch": 27.19,
366
+ "learning_rate": 5.7499042253521136e-05,
367
+ "loss": 1.3159,
368
+ "step": 60000
369
+ },
370
+ {
371
+ "epoch": 27.64,
372
+ "learning_rate": 5.7048788732394365e-05,
373
+ "loss": 1.3117,
374
+ "step": 61000
375
+ },
376
+ {
377
+ "epoch": 28.09,
378
+ "learning_rate": 5.659853521126761e-05,
379
+ "loss": 1.3067,
380
+ "step": 62000
381
+ },
382
+ {
383
+ "epoch": 28.55,
384
+ "learning_rate": 5.614828169014085e-05,
385
+ "loss": 1.3015,
386
+ "step": 63000
387
+ },
388
+ {
389
+ "epoch": 29.0,
390
+ "learning_rate": 5.569802816901409e-05,
391
+ "loss": 1.2972,
392
+ "step": 64000
393
+ },
394
+ {
395
+ "epoch": 29.45,
396
+ "learning_rate": 5.524777464788733e-05,
397
+ "loss": 1.291,
398
+ "step": 65000
399
+ },
400
+ {
401
+ "epoch": 29.9,
402
+ "learning_rate": 5.479752112676057e-05,
403
+ "loss": 1.2877,
404
+ "step": 66000
405
+ },
406
+ {
407
+ "epoch": 30.36,
408
+ "learning_rate": 5.434726760563381e-05,
409
+ "loss": 1.2836,
410
+ "step": 67000
411
+ },
412
+ {
413
+ "epoch": 30.81,
414
+ "learning_rate": 5.389701408450704e-05,
415
+ "loss": 1.2798,
416
+ "step": 68000
417
+ },
418
+ {
419
+ "epoch": 31.26,
420
+ "learning_rate": 5.344721126760564e-05,
421
+ "loss": 1.276,
422
+ "step": 69000
423
+ },
424
+ {
425
+ "epoch": 31.72,
426
+ "learning_rate": 5.2996507042253526e-05,
427
+ "loss": 1.2723,
428
+ "step": 70000
429
+ },
430
+ {
431
+ "epoch": 32.17,
432
+ "learning_rate": 5.254670422535211e-05,
433
+ "loss": 1.2676,
434
+ "step": 71000
435
+ },
436
+ {
437
+ "epoch": 32.62,
438
+ "learning_rate": 5.2096000000000004e-05,
439
+ "loss": 1.2645,
440
+ "step": 72000
441
+ },
442
+ {
443
+ "epoch": 33.08,
444
+ "learning_rate": 5.1645746478873246e-05,
445
+ "loss": 1.2604,
446
+ "step": 73000
447
+ },
448
+ {
449
+ "epoch": 33.53,
450
+ "learning_rate": 5.119549295774648e-05,
451
+ "loss": 1.2577,
452
+ "step": 74000
453
+ },
454
+ {
455
+ "epoch": 33.98,
456
+ "learning_rate": 5.0745239436619724e-05,
457
+ "loss": 1.2552,
458
+ "step": 75000
459
+ },
460
+ {
461
+ "epoch": 34.44,
462
+ "learning_rate": 5.029498591549297e-05,
463
+ "loss": 1.2516,
464
+ "step": 76000
465
+ },
466
+ {
467
+ "epoch": 34.89,
468
+ "learning_rate": 4.9844732394366195e-05,
469
+ "loss": 1.2488,
470
+ "step": 77000
471
+ },
472
+ {
473
+ "epoch": 35.34,
474
+ "learning_rate": 4.939447887323944e-05,
475
+ "loss": 1.2458,
476
+ "step": 78000
477
+ },
478
+ {
479
+ "epoch": 35.8,
480
+ "learning_rate": 4.894467605633804e-05,
481
+ "loss": 1.2443,
482
+ "step": 79000
483
+ },
484
+ {
485
+ "epoch": 36.25,
486
+ "learning_rate": 4.8493971830985916e-05,
487
+ "loss": 1.2398,
488
+ "step": 80000
489
+ },
490
+ {
491
+ "epoch": 36.7,
492
+ "learning_rate": 4.804371830985916e-05,
493
+ "loss": 1.2374,
494
+ "step": 81000
495
+ },
496
+ {
497
+ "epoch": 37.15,
498
+ "learning_rate": 4.75934647887324e-05,
499
+ "loss": 1.2341,
500
+ "step": 82000
501
+ },
502
+ {
503
+ "epoch": 37.61,
504
+ "learning_rate": 4.714321126760564e-05,
505
+ "loss": 1.2321,
506
+ "step": 83000
507
+ },
508
+ {
509
+ "epoch": 38.06,
510
+ "learning_rate": 4.669295774647888e-05,
511
+ "loss": 1.23,
512
+ "step": 84000
513
+ },
514
+ {
515
+ "epoch": 38.51,
516
+ "learning_rate": 4.624270422535212e-05,
517
+ "loss": 1.2265,
518
+ "step": 85000
519
+ },
520
+ {
521
+ "epoch": 38.97,
522
+ "learning_rate": 4.5792450704225356e-05,
523
+ "loss": 1.2253,
524
+ "step": 86000
525
+ },
526
+ {
527
+ "epoch": 39.42,
528
+ "learning_rate": 4.534219718309859e-05,
529
+ "loss": 1.2227,
530
+ "step": 87000
531
+ },
532
+ {
533
+ "epoch": 39.87,
534
+ "learning_rate": 4.4891943661971834e-05,
535
+ "loss": 1.2204,
536
+ "step": 88000
537
+ },
538
+ {
539
+ "epoch": 40.33,
540
+ "learning_rate": 4.4441690140845077e-05,
541
+ "loss": 1.2171,
542
+ "step": 89000
543
+ },
544
+ {
545
+ "epoch": 40.78,
546
+ "learning_rate": 4.399143661971831e-05,
547
+ "loss": 1.2154,
548
+ "step": 90000
549
+ },
550
+ {
551
+ "epoch": 41.23,
552
+ "learning_rate": 4.3541183098591555e-05,
553
+ "loss": 1.2132,
554
+ "step": 91000
555
+ },
556
+ {
557
+ "epoch": 41.69,
558
+ "learning_rate": 4.30909295774648e-05,
559
+ "loss": 1.2118,
560
+ "step": 92000
561
+ },
562
+ {
563
+ "epoch": 42.14,
564
+ "learning_rate": 4.2640676056338026e-05,
565
+ "loss": 1.2088,
566
+ "step": 93000
567
+ },
568
+ {
569
+ "epoch": 42.59,
570
+ "learning_rate": 4.2189971830985916e-05,
571
+ "loss": 1.2068,
572
+ "step": 94000
573
+ },
574
+ {
575
+ "epoch": 43.04,
576
+ "learning_rate": 4.173971830985916e-05,
577
+ "loss": 1.2063,
578
+ "step": 95000
579
+ },
580
+ {
581
+ "epoch": 43.5,
582
+ "learning_rate": 4.12894647887324e-05,
583
+ "loss": 1.2025,
584
+ "step": 96000
585
+ },
586
+ {
587
+ "epoch": 43.95,
588
+ "learning_rate": 4.083921126760564e-05,
589
+ "loss": 1.2013,
590
+ "step": 97000
591
+ },
592
+ {
593
+ "epoch": 44.4,
594
+ "learning_rate": 4.038895774647888e-05,
595
+ "loss": 1.2002,
596
+ "step": 98000
597
+ },
598
+ {
599
+ "epoch": 44.86,
600
+ "learning_rate": 3.993825352112676e-05,
601
+ "loss": 1.198,
602
+ "step": 99000
603
+ },
604
+ {
605
+ "epoch": 45.31,
606
+ "learning_rate": 3.9488e-05,
607
+ "loss": 1.1962,
608
+ "step": 100000
609
+ },
610
+ {
611
+ "epoch": 45.76,
612
+ "learning_rate": 3.903774647887324e-05,
613
+ "loss": 1.1953,
614
+ "step": 101000
615
+ },
616
+ {
617
+ "epoch": 46.22,
618
+ "learning_rate": 3.8587492957746483e-05,
619
+ "loss": 1.1927,
620
+ "step": 102000
621
+ },
622
+ {
623
+ "epoch": 46.67,
624
+ "learning_rate": 3.813723943661972e-05,
625
+ "loss": 1.1909,
626
+ "step": 103000
627
+ },
628
+ {
629
+ "epoch": 47.12,
630
+ "learning_rate": 3.768698591549296e-05,
631
+ "loss": 1.1891,
632
+ "step": 104000
633
+ },
634
+ {
635
+ "epoch": 47.58,
636
+ "learning_rate": 3.72367323943662e-05,
637
+ "loss": 1.1876,
638
+ "step": 105000
639
+ },
640
+ {
641
+ "epoch": 48.03,
642
+ "learning_rate": 3.678647887323944e-05,
643
+ "loss": 1.1868,
644
+ "step": 106000
645
+ },
646
+ {
647
+ "epoch": 48.48,
648
+ "learning_rate": 3.633622535211268e-05,
649
+ "loss": 1.1841,
650
+ "step": 107000
651
+ },
652
+ {
653
+ "epoch": 48.94,
654
+ "learning_rate": 3.588597183098592e-05,
655
+ "loss": 1.1844,
656
+ "step": 108000
657
+ },
658
+ {
659
+ "epoch": 49.39,
660
+ "learning_rate": 3.543571830985916e-05,
661
+ "loss": 1.1815,
662
+ "step": 109000
663
+ },
664
+ {
665
+ "epoch": 49.84,
666
+ "learning_rate": 3.4985464788732395e-05,
667
+ "loss": 1.1809,
668
+ "step": 110000
669
+ },
670
+ {
671
+ "epoch": 50.29,
672
+ "learning_rate": 3.453521126760564e-05,
673
+ "loss": 1.1784,
674
+ "step": 111000
675
+ },
676
+ {
677
+ "epoch": 50.75,
678
+ "learning_rate": 3.408495774647888e-05,
679
+ "loss": 1.1774,
680
+ "step": 112000
681
+ },
682
+ {
683
+ "epoch": 51.2,
684
+ "learning_rate": 3.3634704225352115e-05,
685
+ "loss": 1.1761,
686
+ "step": 113000
687
+ },
688
+ {
689
+ "epoch": 51.65,
690
+ "learning_rate": 3.318445070422535e-05,
691
+ "loss": 1.175,
692
+ "step": 114000
693
+ },
694
+ {
695
+ "epoch": 52.11,
696
+ "learning_rate": 3.273419718309859e-05,
697
+ "loss": 1.1742,
698
+ "step": 115000
699
+ },
700
+ {
701
+ "epoch": 52.56,
702
+ "learning_rate": 3.2283943661971836e-05,
703
+ "loss": 1.1726,
704
+ "step": 116000
705
+ },
706
+ {
707
+ "epoch": 53.01,
708
+ "learning_rate": 3.183323943661972e-05,
709
+ "loss": 1.1719,
710
+ "step": 117000
711
+ },
712
+ {
713
+ "epoch": 53.47,
714
+ "learning_rate": 3.1382985915492955e-05,
715
+ "loss": 1.1699,
716
+ "step": 118000
717
+ },
718
+ {
719
+ "epoch": 53.92,
720
+ "learning_rate": 3.09327323943662e-05,
721
+ "loss": 1.1684,
722
+ "step": 119000
723
+ },
724
+ {
725
+ "epoch": 54.37,
726
+ "learning_rate": 3.048247887323944e-05,
727
+ "loss": 1.1676,
728
+ "step": 120000
729
+ },
730
+ {
731
+ "epoch": 54.83,
732
+ "learning_rate": 3.003222535211268e-05,
733
+ "loss": 1.1662,
734
+ "step": 121000
735
+ },
736
+ {
737
+ "epoch": 55.28,
738
+ "learning_rate": 2.9581971830985918e-05,
739
+ "loss": 1.1647,
740
+ "step": 122000
741
+ },
742
+ {
743
+ "epoch": 55.73,
744
+ "learning_rate": 2.9131718309859157e-05,
745
+ "loss": 1.1643,
746
+ "step": 123000
747
+ },
748
+ {
749
+ "epoch": 56.18,
750
+ "learning_rate": 2.86814647887324e-05,
751
+ "loss": 1.1639,
752
+ "step": 124000
753
+ },
754
+ {
755
+ "epoch": 56.64,
756
+ "learning_rate": 2.8231211267605635e-05,
757
+ "loss": 1.1617,
758
+ "step": 125000
759
+ },
760
+ {
761
+ "epoch": 57.09,
762
+ "learning_rate": 2.7780957746478874e-05,
763
+ "loss": 1.1606,
764
+ "step": 126000
765
+ },
766
+ {
767
+ "epoch": 57.54,
768
+ "learning_rate": 2.7330704225352116e-05,
769
+ "loss": 1.1608,
770
+ "step": 127000
771
+ },
772
+ {
773
+ "epoch": 58.0,
774
+ "learning_rate": 2.6880450704225355e-05,
775
+ "loss": 1.159,
776
+ "step": 128000
777
+ },
778
+ {
779
+ "epoch": 58.45,
780
+ "learning_rate": 2.643019718309859e-05,
781
+ "loss": 1.1572,
782
+ "step": 129000
783
+ },
784
+ {
785
+ "epoch": 58.9,
786
+ "learning_rate": 2.5979943661971833e-05,
787
+ "loss": 1.1572,
788
+ "step": 130000
789
+ },
790
+ {
791
+ "epoch": 59.36,
792
+ "learning_rate": 2.5529690140845072e-05,
793
+ "loss": 1.1562,
794
+ "step": 131000
795
+ },
796
+ {
797
+ "epoch": 59.81,
798
+ "learning_rate": 2.5079436619718314e-05,
799
+ "loss": 1.1551,
800
+ "step": 132000
801
+ },
802
+ {
803
+ "epoch": 60.26,
804
+ "learning_rate": 2.462918309859155e-05,
805
+ "loss": 1.1547,
806
+ "step": 133000
807
+ },
808
+ {
809
+ "epoch": 60.72,
810
+ "learning_rate": 2.417892957746479e-05,
811
+ "loss": 1.153,
812
+ "step": 134000
813
+ },
814
+ {
815
+ "epoch": 61.17,
816
+ "learning_rate": 2.372867605633803e-05,
817
+ "loss": 1.1526,
818
+ "step": 135000
819
+ },
820
+ {
821
+ "epoch": 61.62,
822
+ "learning_rate": 2.327842253521127e-05,
823
+ "loss": 1.1522,
824
+ "step": 136000
825
+ },
826
+ {
827
+ "epoch": 62.08,
828
+ "learning_rate": 2.2828169014084506e-05,
829
+ "loss": 1.1505,
830
+ "step": 137000
831
+ },
832
+ {
833
+ "epoch": 62.53,
834
+ "learning_rate": 2.2377915492957748e-05,
835
+ "loss": 1.1505,
836
+ "step": 138000
837
+ },
838
+ {
839
+ "epoch": 62.98,
840
+ "learning_rate": 2.1927661971830987e-05,
841
+ "loss": 1.1485,
842
+ "step": 139000
843
+ },
844
+ {
845
+ "epoch": 63.43,
846
+ "learning_rate": 2.1477859154929578e-05,
847
+ "loss": 1.1482,
848
+ "step": 140000
849
+ },
850
+ {
851
+ "epoch": 63.89,
852
+ "learning_rate": 2.102715492957747e-05,
853
+ "loss": 1.1466,
854
+ "step": 141000
855
+ },
856
+ {
857
+ "epoch": 64.34,
858
+ "learning_rate": 2.0576901408450704e-05,
859
+ "loss": 1.1467,
860
+ "step": 142000
861
+ },
862
+ {
863
+ "epoch": 64.79,
864
+ "learning_rate": 2.0126647887323946e-05,
865
+ "loss": 1.146,
866
+ "step": 143000
867
+ },
868
+ {
869
+ "epoch": 65.25,
870
+ "learning_rate": 1.9676394366197185e-05,
871
+ "loss": 1.1452,
872
+ "step": 144000
873
+ },
874
+ {
875
+ "epoch": 65.7,
876
+ "learning_rate": 1.9226140845070424e-05,
877
+ "loss": 1.1447,
878
+ "step": 145000
879
+ },
880
+ {
881
+ "epoch": 66.15,
882
+ "learning_rate": 1.8775887323943663e-05,
883
+ "loss": 1.1441,
884
+ "step": 146000
885
+ },
886
+ {
887
+ "epoch": 66.61,
888
+ "learning_rate": 1.8325633802816902e-05,
889
+ "loss": 1.143,
890
+ "step": 147000
891
+ },
892
+ {
893
+ "epoch": 67.06,
894
+ "learning_rate": 1.787538028169014e-05,
895
+ "loss": 1.1433,
896
+ "step": 148000
897
+ },
898
+ {
899
+ "epoch": 67.51,
900
+ "learning_rate": 1.7425126760563384e-05,
901
+ "loss": 1.1414,
902
+ "step": 149000
903
+ },
904
+ {
905
+ "epoch": 67.97,
906
+ "learning_rate": 1.697487323943662e-05,
907
+ "loss": 1.1413,
908
+ "step": 150000
909
+ },
910
+ {
911
+ "epoch": 68.42,
912
+ "learning_rate": 1.652461971830986e-05,
913
+ "loss": 1.14,
914
+ "step": 151000
915
+ },
916
+ {
917
+ "epoch": 68.87,
918
+ "learning_rate": 1.60743661971831e-05,
919
+ "loss": 1.14,
920
+ "step": 152000
921
+ },
922
+ {
923
+ "epoch": 69.32,
924
+ "learning_rate": 1.562411267605634e-05,
925
+ "loss": 1.139,
926
+ "step": 153000
927
+ },
928
+ {
929
+ "epoch": 69.78,
930
+ "learning_rate": 1.5173859154929578e-05,
931
+ "loss": 1.1385,
932
+ "step": 154000
933
+ },
934
+ {
935
+ "epoch": 70.23,
936
+ "learning_rate": 1.4723605633802817e-05,
937
+ "loss": 1.1385,
938
+ "step": 155000
939
+ },
940
+ {
941
+ "epoch": 70.68,
942
+ "learning_rate": 1.4272901408450706e-05,
943
+ "loss": 1.1374,
944
+ "step": 156000
945
+ },
946
+ {
947
+ "epoch": 71.14,
948
+ "learning_rate": 1.3822647887323945e-05,
949
+ "loss": 1.1362,
950
+ "step": 157000
951
+ },
952
+ {
953
+ "epoch": 71.59,
954
+ "learning_rate": 1.3372394366197183e-05,
955
+ "loss": 1.137,
956
+ "step": 158000
957
+ },
958
+ {
959
+ "epoch": 72.04,
960
+ "learning_rate": 1.2922140845070423e-05,
961
+ "loss": 1.1355,
962
+ "step": 159000
963
+ },
964
+ {
965
+ "epoch": 72.5,
966
+ "learning_rate": 1.2471887323943664e-05,
967
+ "loss": 1.1342,
968
+ "step": 160000
969
+ },
970
+ {
971
+ "epoch": 72.95,
972
+ "learning_rate": 1.2021633802816903e-05,
973
+ "loss": 1.1347,
974
+ "step": 161000
975
+ },
976
+ {
977
+ "epoch": 73.4,
978
+ "learning_rate": 1.157138028169014e-05,
979
+ "loss": 1.133,
980
+ "step": 162000
981
+ },
982
+ {
983
+ "epoch": 73.86,
984
+ "learning_rate": 1.1121126760563381e-05,
985
+ "loss": 1.1339,
986
+ "step": 163000
987
+ },
988
+ {
989
+ "epoch": 74.31,
990
+ "learning_rate": 1.0670873239436622e-05,
991
+ "loss": 1.1332,
992
+ "step": 164000
993
+ },
994
+ {
995
+ "epoch": 74.76,
996
+ "learning_rate": 1.022061971830986e-05,
997
+ "loss": 1.1319,
998
+ "step": 165000
999
+ },
1000
+ {
1001
+ "epoch": 75.22,
1002
+ "learning_rate": 9.7703661971831e-06,
1003
+ "loss": 1.1311,
1004
+ "step": 166000
1005
+ },
1006
+ {
1007
+ "epoch": 75.67,
1008
+ "learning_rate": 9.32056338028169e-06,
1009
+ "loss": 1.1303,
1010
+ "step": 167000
1011
+ },
1012
+ {
1013
+ "epoch": 76.12,
1014
+ "learning_rate": 8.869859154929579e-06,
1015
+ "loss": 1.1318,
1016
+ "step": 168000
1017
+ },
1018
+ {
1019
+ "epoch": 76.57,
1020
+ "learning_rate": 8.419605633802818e-06,
1021
+ "loss": 1.1304,
1022
+ "step": 169000
1023
+ },
1024
+ {
1025
+ "epoch": 77.03,
1026
+ "learning_rate": 7.969352112676057e-06,
1027
+ "loss": 1.1297,
1028
+ "step": 170000
1029
+ },
1030
+ {
1031
+ "epoch": 77.48,
1032
+ "learning_rate": 7.519098591549297e-06,
1033
+ "loss": 1.1295,
1034
+ "step": 171000
1035
+ },
1036
+ {
1037
+ "epoch": 77.93,
1038
+ "learning_rate": 7.068845070422535e-06,
1039
+ "loss": 1.1291,
1040
+ "step": 172000
1041
+ },
1042
+ {
1043
+ "epoch": 78.39,
1044
+ "learning_rate": 6.618591549295776e-06,
1045
+ "loss": 1.1287,
1046
+ "step": 173000
1047
+ },
1048
+ {
1049
+ "epoch": 78.84,
1050
+ "learning_rate": 6.167887323943662e-06,
1051
+ "loss": 1.128,
1052
+ "step": 174000
1053
+ },
1054
+ {
1055
+ "epoch": 79.29,
1056
+ "learning_rate": 5.717633802816902e-06,
1057
+ "loss": 1.1283,
1058
+ "step": 175000
1059
+ },
1060
+ {
1061
+ "epoch": 79.75,
1062
+ "learning_rate": 5.267380281690141e-06,
1063
+ "loss": 1.1272,
1064
+ "step": 176000
1065
+ },
1066
+ {
1067
+ "epoch": 80.2,
1068
+ "learning_rate": 4.817126760563381e-06,
1069
+ "loss": 1.1273,
1070
+ "step": 177000
1071
+ },
1072
+ {
1073
+ "epoch": 80.65,
1074
+ "learning_rate": 4.366873239436621e-06,
1075
+ "loss": 1.1271,
1076
+ "step": 178000
1077
+ },
1078
+ {
1079
+ "epoch": 81.11,
1080
+ "learning_rate": 3.9166197183098595e-06,
1081
+ "loss": 1.126,
1082
+ "step": 179000
1083
+ },
1084
+ {
1085
+ "epoch": 81.56,
1086
+ "learning_rate": 3.4663661971830985e-06,
1087
+ "loss": 1.1263,
1088
+ "step": 180000
1089
+ },
1090
+ {
1091
+ "epoch": 82.01,
1092
+ "learning_rate": 3.016112676056338e-06,
1093
+ "loss": 1.1261,
1094
+ "step": 181000
1095
+ },
1096
+ {
1097
+ "epoch": 82.46,
1098
+ "learning_rate": 2.5658591549295773e-06,
1099
+ "loss": 1.1247,
1100
+ "step": 182000
1101
+ },
1102
+ {
1103
+ "epoch": 82.92,
1104
+ "learning_rate": 2.115605633802817e-06,
1105
+ "loss": 1.1267,
1106
+ "step": 183000
1107
+ },
1108
+ {
1109
+ "epoch": 83.37,
1110
+ "learning_rate": 1.6653521126760563e-06,
1111
+ "loss": 1.1246,
1112
+ "step": 184000
1113
+ },
1114
+ {
1115
+ "epoch": 83.82,
1116
+ "learning_rate": 1.2150985915492959e-06,
1117
+ "loss": 1.1251,
1118
+ "step": 185000
1119
+ },
1120
+ {
1121
+ "epoch": 84.28,
1122
+ "learning_rate": 7.648450704225354e-07,
1123
+ "loss": 1.1251,
1124
+ "step": 186000
1125
+ },
1126
+ {
1127
+ "epoch": 84.73,
1128
+ "learning_rate": 3.145915492957747e-07,
1129
+ "loss": 1.1246,
1130
+ "step": 187000
1131
+ },
1132
+ {
1133
+ "epoch": 84.96,
1134
+ "step": 187500,
1135
+ "total_flos": 1.0104234636302352e+20,
1136
+ "train_loss": 0.10515284895833334,
1137
+ "train_runtime": 4947.3494,
1138
+ "train_samples_per_second": 77617.32,
1139
+ "train_steps_per_second": 37.899
1140
+ }
1141
+ ],
1142
+ "max_steps": 187500,
1143
+ "num_train_epochs": 85,
1144
+ "total_flos": 1.0104234636302352e+20,
1145
+ "trial_name": null,
1146
+ "trial_params": null
1147
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebf1380be2a758a012ae4f84303799c4f081a889cdd1c60898e1584a869d0fc6
3
+ size 4271
vocab.txt ADDED
The diff for this file is too large to render. See raw diff