Samuel J. Huskey committed
Commit 9ccfaf8 · 1 Parent(s): 679d600

add: model files

.gitignore ADDED
@@ -0,0 +1 @@
+ *.DS_Store
config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "_name_or_path": "distilbert_multilingual_cased_greek_latin_classifier",
+ "activation": "gelu",
+ "architectures": [
+ "DistilBertForSequenceClassification"
+ ],
+ "attention_dropout": 0.1,
+ "dim": 768,
+ "dropout": 0.1,
+ "hidden_dim": 3072,
+ "id2label": {
+ "0": "Greek",
+ "1": "Latin"
+ },
+ "initializer_range": 0.02,
+ "label2id": {
+ "Greek": 0,
+ "Latin": 1
+ },
+ "max_position_embeddings": 512,
+ "model_type": "distilbert",
+ "n_heads": 12,
+ "n_layers": 6,
+ "output_past": true,
+ "pad_token_id": 0,
+ "problem_type": "single_label_classification",
+ "qa_dropout": 0.1,
+ "seq_classif_dropout": 0.2,
+ "sinusoidal_pos_embds": false,
+ "tie_weights_": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.44.1",
+ "vocab_size": 119547
+ }
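
Note: config.json above describes a two-label DistilBERT sequence classifier (6 layers, 768-dim hidden states, multilingual cased vocabulary of 119,547 tokens) mapping id 0 to Greek and id 1 to Latin. A minimal usage sketch, assuming the files in this commit are available as a local directory or Hub repo; the model path below is an assumption, not taken from the commit:

from transformers import pipeline

# Load the classifier described by config.json; swap in the real repo id or
# a local checkout path for the placeholder string below.
clf = pipeline(
    "text-classification",
    model="distilbert_multilingual_cased_greek_latin_classifier",  # assumed path / repo id
)
print(clf(["arma virumque cano", "μῆνιν ἄειδε θεά"]))
# Each result carries a "label" of "Greek" or "Latin" (from id2label above) and a score.
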
greek-latin-emissions.csv ADDED
@@ -0,0 +1,2 @@
+ timestamp,project_name,run_id,experiment_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,gpu_energy,ram_energy,energy_consumed,country_name,country_iso_code,region,cloud_provider,cloud_region,os,python_version,codecarbon_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,on_cloud,pue
+ 2024-12-25T15:16:45,codecarbon,f658b237-20c1-45cf-a8ee-cdb5e8521351,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,591.3640720729998,0.017871522337723503,3.0220845637570942e-05,42.5,105.04535891243202,31.30389261245728,0.006974275640934743,0.025850867347344004,0.005136130757212294,0.03796127374549103,Singapore,SGP,,,,Linux-6.1.85+-x86_64-with-glibc2.35,3.10.12,2.8.2,12,Intel(R) Xeon(R) CPU @ 2.20GHz,1,1 x NVIDIA A100-SXM4-40GB,103.8503,1.2868,83.47704696655273,machine,N,1.0
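
Note: greek-latin-emissions.csv is CodeCarbon output (codecarbon 2.8.2 per the version column) recording about 0.018 kg CO2eq for a ~591-second run on a single NVIDIA A100. A minimal sketch of how such a row is typically produced; only project_name and output_file mirror the CSV above, the training call itself is a placeholder:

from codecarbon import EmissionsTracker

# Track energy and emissions around a training run and append one row to the CSV.
tracker = EmissionsTracker(
    project_name="codecarbon",
    output_file="greek-latin-emissions.csv",
)
tracker.start()
try:
    pass  # trainer.train() would go here
finally:
    kg_co2 = tracker.stop()  # writes a row like the one above
    print(f"{kg_co2:.6f} kg CO2eq")
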
logs/events.out.tfevents.1735139166.49c9ca38522d.6642.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ec7414fa22502f5ef0d7c5f52d365638b066dd1919a6b3a93629d482d49a2c66
+ size 4184
logs/events.out.tfevents.1735139214.49c9ca38522d.7518.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f838b6e8c8504aef5e7c251628fabfd9a900cf61dc48cc2d905e5a6564c1595
+ size 50378
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c9296003fd0fb8618db1ed16d87b534346cc0ac4b6124cff8ed59299c5a04e3
+ size 541317368
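
Note: model.safetensors (like the other large binaries in this commit) is stored with Git LFS, so the diff shows only the pointer file, an oid plus a size (about 541 MB here), rather than the weights. Once the actual file is pulled, the tensors can be inspected directly; a minimal sketch:

from safetensors.torch import load_file

# Load the checkpoint weights (requires the real file, not the LFS pointer).
state_dict = load_file("model.safetensors")
print(f"{len(state_dict)} tensors")
print(next(iter(state_dict)))  # name of one parameter tensor
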
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21941a651fa3476118e1cdf74e425cc957ab413ac16623c4e0c6b8f4c7b5230f
+ size 1082696890
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f87605e6c67d5a5d9a11aa1efb02af19902881be7fdf3b3e1c6bbca0fd808e5c
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f311cf2c6b954a71898a0497794c30a06bad24230f99e1c5541cecdc5621e10
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "101": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "102": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "103": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "[CLS]",
+ "do_lower_case": false,
+ "extra_special_tokens": {},
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "DistilBertTokenizer",
+ "unk_token": "[UNK]"
+ }
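
Note: special_tokens_map.json, tokenizer.json, tokenizer_config.json, and vocab.txt together define the cased multilingual WordPiece tokenizer (DistilBertTokenizer, model_max_length 512) used by the classifier. A minimal sketch of loading it from this commit; the local path is an assumption:

from transformers import AutoTokenizer

# AutoTokenizer selects the DistilBert tokenizer class from tokenizer_config.json above.
tok = AutoTokenizer.from_pretrained(".")  # assumed: run inside a checkout of this repo
ids = tok("In principio erat Verbum")["input_ids"]
print(tok.convert_ids_to_tokens(ids))
# The sequence is wrapped in [CLS] (id 101) ... [SEP] (id 102),
# matching added_tokens_decoder above.
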
trainer_state.json ADDED
@@ -0,0 +1,1109 @@
1
+ {
2
+ "best_metric": 0.9635820582698403,
3
+ "best_model_checkpoint": "./distilbert_multilingual_cased_greek_latin_classifiergreek/checkpoint-14160",
4
+ "epoch": 8.0,
5
+ "eval_steps": 500,
6
+ "global_step": 14160,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05649717514124294,
13
+ "grad_norm": 9.908761978149414,
14
+ "learning_rate": 4.9858757062146896e-05,
15
+ "loss": 0.4326,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.11299435028248588,
20
+ "grad_norm": 0.47730961441993713,
21
+ "learning_rate": 4.971751412429379e-05,
22
+ "loss": 0.2915,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.1694915254237288,
27
+ "grad_norm": 5.969715595245361,
28
+ "learning_rate": 4.957627118644068e-05,
29
+ "loss": 0.2426,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.22598870056497175,
34
+ "grad_norm": 2.764862537384033,
35
+ "learning_rate": 4.9435028248587575e-05,
36
+ "loss": 0.256,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.2824858757062147,
41
+ "grad_norm": 12.726994514465332,
42
+ "learning_rate": 4.929378531073446e-05,
43
+ "loss": 0.2165,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.3389830508474576,
48
+ "grad_norm": 1.1394743919372559,
49
+ "learning_rate": 4.915254237288136e-05,
50
+ "loss": 0.2168,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.3954802259887006,
55
+ "grad_norm": 4.321178913116455,
56
+ "learning_rate": 4.9011299435028255e-05,
57
+ "loss": 0.2059,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.4519774011299435,
62
+ "grad_norm": 18.238351821899414,
63
+ "learning_rate": 4.887005649717514e-05,
64
+ "loss": 0.2294,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.5084745762711864,
69
+ "grad_norm": 7.210486888885498,
70
+ "learning_rate": 4.8728813559322034e-05,
71
+ "loss": 0.1904,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.5649717514124294,
76
+ "grad_norm": 1.240628719329834,
77
+ "learning_rate": 4.8587570621468934e-05,
78
+ "loss": 0.1668,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.6214689265536724,
83
+ "grad_norm": 9.029092788696289,
84
+ "learning_rate": 4.844632768361582e-05,
85
+ "loss": 0.18,
86
+ "step": 1100
87
+ },
88
+ {
89
+ "epoch": 0.6779661016949152,
90
+ "grad_norm": 4.866886615753174,
91
+ "learning_rate": 4.8305084745762714e-05,
92
+ "loss": 0.2082,
93
+ "step": 1200
94
+ },
95
+ {
96
+ "epoch": 0.7344632768361582,
97
+ "grad_norm": 7.6279778480529785,
98
+ "learning_rate": 4.816384180790961e-05,
99
+ "loss": 0.1818,
100
+ "step": 1300
101
+ },
102
+ {
103
+ "epoch": 0.7909604519774012,
104
+ "grad_norm": 8.820233345031738,
105
+ "learning_rate": 4.80225988700565e-05,
106
+ "loss": 0.1853,
107
+ "step": 1400
108
+ },
109
+ {
110
+ "epoch": 0.847457627118644,
111
+ "grad_norm": 22.571056365966797,
112
+ "learning_rate": 4.788135593220339e-05,
113
+ "loss": 0.1638,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 0.903954802259887,
118
+ "grad_norm": 4.86266565322876,
119
+ "learning_rate": 4.7740112994350286e-05,
120
+ "loss": 0.1761,
121
+ "step": 1600
122
+ },
123
+ {
124
+ "epoch": 0.96045197740113,
125
+ "grad_norm": 2.2481741905212402,
126
+ "learning_rate": 4.759887005649718e-05,
127
+ "loss": 0.1481,
128
+ "step": 1700
129
+ },
130
+ {
131
+ "epoch": 1.0,
132
+ "eval_accuracy": 0.9473554127533367,
133
+ "eval_f1": 0.9471867274372374,
134
+ "eval_loss": 0.15506704151630402,
135
+ "eval_runtime": 0.99,
136
+ "eval_samples_per_second": 4086.911,
137
+ "eval_steps_per_second": 64.647,
138
+ "step": 1770
139
+ },
140
+ {
141
+ "epoch": 1.0169491525423728,
142
+ "grad_norm": 0.16318374872207642,
143
+ "learning_rate": 4.745762711864407e-05,
144
+ "loss": 0.1303,
145
+ "step": 1800
146
+ },
147
+ {
148
+ "epoch": 1.073446327683616,
149
+ "grad_norm": 2.8574020862579346,
150
+ "learning_rate": 4.7316384180790966e-05,
151
+ "loss": 0.0916,
152
+ "step": 1900
153
+ },
154
+ {
155
+ "epoch": 1.1299435028248588,
156
+ "grad_norm": 7.220451831817627,
157
+ "learning_rate": 4.717514124293785e-05,
158
+ "loss": 0.1214,
159
+ "step": 2000
160
+ },
161
+ {
162
+ "epoch": 1.1864406779661016,
163
+ "grad_norm": 0.23234207928180695,
164
+ "learning_rate": 4.703389830508475e-05,
165
+ "loss": 0.1005,
166
+ "step": 2100
167
+ },
168
+ {
169
+ "epoch": 1.2429378531073447,
170
+ "grad_norm": 5.788926124572754,
171
+ "learning_rate": 4.689265536723164e-05,
172
+ "loss": 0.106,
173
+ "step": 2200
174
+ },
175
+ {
176
+ "epoch": 1.2994350282485876,
177
+ "grad_norm": 14.127985954284668,
178
+ "learning_rate": 4.675141242937853e-05,
179
+ "loss": 0.1002,
180
+ "step": 2300
181
+ },
182
+ {
183
+ "epoch": 1.3559322033898304,
184
+ "grad_norm": 8.628386497497559,
185
+ "learning_rate": 4.6610169491525425e-05,
186
+ "loss": 0.1228,
187
+ "step": 2400
188
+ },
189
+ {
190
+ "epoch": 1.4124293785310735,
191
+ "grad_norm": 3.4723896980285645,
192
+ "learning_rate": 4.646892655367232e-05,
193
+ "loss": 0.1193,
194
+ "step": 2500
195
+ },
196
+ {
197
+ "epoch": 1.4689265536723164,
198
+ "grad_norm": 5.112296104431152,
199
+ "learning_rate": 4.632768361581921e-05,
200
+ "loss": 0.0989,
201
+ "step": 2600
202
+ },
203
+ {
204
+ "epoch": 1.5254237288135593,
205
+ "grad_norm": 8.718145370483398,
206
+ "learning_rate": 4.6186440677966104e-05,
207
+ "loss": 0.1291,
208
+ "step": 2700
209
+ },
210
+ {
211
+ "epoch": 1.5819209039548023,
212
+ "grad_norm": 0.052204638719558716,
213
+ "learning_rate": 4.6045197740113e-05,
214
+ "loss": 0.0987,
215
+ "step": 2800
216
+ },
217
+ {
218
+ "epoch": 1.6384180790960452,
219
+ "grad_norm": 11.152572631835938,
220
+ "learning_rate": 4.590395480225989e-05,
221
+ "loss": 0.113,
222
+ "step": 2900
223
+ },
224
+ {
225
+ "epoch": 1.694915254237288,
226
+ "grad_norm": 3.002537965774536,
227
+ "learning_rate": 4.5762711864406784e-05,
228
+ "loss": 0.1215,
229
+ "step": 3000
230
+ },
231
+ {
232
+ "epoch": 1.7514124293785311,
233
+ "grad_norm": 16.578323364257812,
234
+ "learning_rate": 4.562146892655367e-05,
235
+ "loss": 0.0998,
236
+ "step": 3100
237
+ },
238
+ {
239
+ "epoch": 1.807909604519774,
240
+ "grad_norm": 4.660722255706787,
241
+ "learning_rate": 4.548022598870056e-05,
242
+ "loss": 0.1015,
243
+ "step": 3200
244
+ },
245
+ {
246
+ "epoch": 1.8644067796610169,
247
+ "grad_norm": 0.32472193241119385,
248
+ "learning_rate": 4.533898305084746e-05,
249
+ "loss": 0.0982,
250
+ "step": 3300
251
+ },
252
+ {
253
+ "epoch": 1.92090395480226,
254
+ "grad_norm": 12.544636726379395,
255
+ "learning_rate": 4.519774011299435e-05,
256
+ "loss": 0.1151,
257
+ "step": 3400
258
+ },
259
+ {
260
+ "epoch": 1.9774011299435028,
261
+ "grad_norm": 0.20591090619564056,
262
+ "learning_rate": 4.505649717514124e-05,
263
+ "loss": 0.1109,
264
+ "step": 3500
265
+ },
266
+ {
267
+ "epoch": 2.0,
268
+ "eval_accuracy": 0.9540286702916461,
269
+ "eval_f1": 0.9542995811514169,
270
+ "eval_loss": 0.17088210582733154,
271
+ "eval_runtime": 0.9919,
272
+ "eval_samples_per_second": 4078.94,
273
+ "eval_steps_per_second": 64.521,
274
+ "step": 3540
275
+ },
276
+ {
277
+ "epoch": 2.0338983050847457,
278
+ "grad_norm": 1.156205177307129,
279
+ "learning_rate": 4.491525423728814e-05,
280
+ "loss": 0.0764,
281
+ "step": 3600
282
+ },
283
+ {
284
+ "epoch": 2.0903954802259888,
285
+ "grad_norm": 4.644138336181641,
286
+ "learning_rate": 4.477401129943503e-05,
287
+ "loss": 0.0603,
288
+ "step": 3700
289
+ },
290
+ {
291
+ "epoch": 2.146892655367232,
292
+ "grad_norm": 0.0709792822599411,
293
+ "learning_rate": 4.463276836158192e-05,
294
+ "loss": 0.0649,
295
+ "step": 3800
296
+ },
297
+ {
298
+ "epoch": 2.2033898305084745,
299
+ "grad_norm": 0.13090333342552185,
300
+ "learning_rate": 4.4491525423728816e-05,
301
+ "loss": 0.076,
302
+ "step": 3900
303
+ },
304
+ {
305
+ "epoch": 2.2598870056497176,
306
+ "grad_norm": 39.72850799560547,
307
+ "learning_rate": 4.435028248587571e-05,
308
+ "loss": 0.0827,
309
+ "step": 4000
310
+ },
311
+ {
312
+ "epoch": 2.3163841807909606,
313
+ "grad_norm": 0.29564905166625977,
314
+ "learning_rate": 4.42090395480226e-05,
315
+ "loss": 0.0701,
316
+ "step": 4100
317
+ },
318
+ {
319
+ "epoch": 2.3728813559322033,
320
+ "grad_norm": 0.23284725844860077,
321
+ "learning_rate": 4.4067796610169495e-05,
322
+ "loss": 0.0697,
323
+ "step": 4200
324
+ },
325
+ {
326
+ "epoch": 2.4293785310734464,
327
+ "grad_norm": 0.059655264019966125,
328
+ "learning_rate": 4.392655367231638e-05,
329
+ "loss": 0.0803,
330
+ "step": 4300
331
+ },
332
+ {
333
+ "epoch": 2.4858757062146895,
334
+ "grad_norm": 0.01876319944858551,
335
+ "learning_rate": 4.378531073446328e-05,
336
+ "loss": 0.0796,
337
+ "step": 4400
338
+ },
339
+ {
340
+ "epoch": 2.542372881355932,
341
+ "grad_norm": 0.07363492995500565,
342
+ "learning_rate": 4.3644067796610175e-05,
343
+ "loss": 0.0573,
344
+ "step": 4500
345
+ },
346
+ {
347
+ "epoch": 2.598870056497175,
348
+ "grad_norm": 0.26911139488220215,
349
+ "learning_rate": 4.350282485875706e-05,
350
+ "loss": 0.0732,
351
+ "step": 4600
352
+ },
353
+ {
354
+ "epoch": 2.655367231638418,
355
+ "grad_norm": 0.045297879725694656,
356
+ "learning_rate": 4.3361581920903954e-05,
357
+ "loss": 0.0645,
358
+ "step": 4700
359
+ },
360
+ {
361
+ "epoch": 2.711864406779661,
362
+ "grad_norm": 0.24285119771957397,
363
+ "learning_rate": 4.3220338983050854e-05,
364
+ "loss": 0.0852,
365
+ "step": 4800
366
+ },
367
+ {
368
+ "epoch": 2.768361581920904,
369
+ "grad_norm": 70.39765930175781,
370
+ "learning_rate": 4.307909604519774e-05,
371
+ "loss": 0.0763,
372
+ "step": 4900
373
+ },
374
+ {
375
+ "epoch": 2.824858757062147,
376
+ "grad_norm": 3.1160919666290283,
377
+ "learning_rate": 4.2937853107344634e-05,
378
+ "loss": 0.0618,
379
+ "step": 5000
380
+ },
381
+ {
382
+ "epoch": 2.8813559322033897,
383
+ "grad_norm": 0.14466217160224915,
384
+ "learning_rate": 4.279661016949153e-05,
385
+ "loss": 0.0628,
386
+ "step": 5100
387
+ },
388
+ {
389
+ "epoch": 2.937853107344633,
390
+ "grad_norm": 11.605415344238281,
391
+ "learning_rate": 4.265536723163842e-05,
392
+ "loss": 0.0539,
393
+ "step": 5200
394
+ },
395
+ {
396
+ "epoch": 2.994350282485876,
397
+ "grad_norm": 0.03261380270123482,
398
+ "learning_rate": 4.251412429378531e-05,
399
+ "loss": 0.0349,
400
+ "step": 5300
401
+ },
402
+ {
403
+ "epoch": 3.0,
404
+ "eval_accuracy": 0.9574888779041029,
405
+ "eval_f1": 0.9573208648613947,
406
+ "eval_loss": 0.20843710005283356,
407
+ "eval_runtime": 0.9688,
408
+ "eval_samples_per_second": 4176.286,
409
+ "eval_steps_per_second": 66.061,
410
+ "step": 5310
411
+ },
412
+ {
413
+ "epoch": 3.0508474576271185,
414
+ "grad_norm": 0.007287267595529556,
415
+ "learning_rate": 4.2372881355932206e-05,
416
+ "loss": 0.0575,
417
+ "step": 5400
418
+ },
419
+ {
420
+ "epoch": 3.1073446327683616,
421
+ "grad_norm": 19.718822479248047,
422
+ "learning_rate": 4.22316384180791e-05,
423
+ "loss": 0.0423,
424
+ "step": 5500
425
+ },
426
+ {
427
+ "epoch": 3.1638418079096047,
428
+ "grad_norm": 78.35333251953125,
429
+ "learning_rate": 4.209039548022599e-05,
430
+ "loss": 0.0349,
431
+ "step": 5600
432
+ },
433
+ {
434
+ "epoch": 3.2203389830508473,
435
+ "grad_norm": 0.025877630338072777,
436
+ "learning_rate": 4.1949152542372886e-05,
437
+ "loss": 0.0632,
438
+ "step": 5700
439
+ },
440
+ {
441
+ "epoch": 3.2768361581920904,
442
+ "grad_norm": 14.864492416381836,
443
+ "learning_rate": 4.180790960451977e-05,
444
+ "loss": 0.0651,
445
+ "step": 5800
446
+ },
447
+ {
448
+ "epoch": 3.3333333333333335,
449
+ "grad_norm": 0.002800008049234748,
450
+ "learning_rate": 4.166666666666667e-05,
451
+ "loss": 0.0225,
452
+ "step": 5900
453
+ },
454
+ {
455
+ "epoch": 3.389830508474576,
456
+ "grad_norm": 0.10183978080749512,
457
+ "learning_rate": 4.152542372881356e-05,
458
+ "loss": 0.0467,
459
+ "step": 6000
460
+ },
461
+ {
462
+ "epoch": 3.446327683615819,
463
+ "grad_norm": 30.69606590270996,
464
+ "learning_rate": 4.138418079096045e-05,
465
+ "loss": 0.0509,
466
+ "step": 6100
467
+ },
468
+ {
469
+ "epoch": 3.5028248587570623,
470
+ "grad_norm": 0.43340635299682617,
471
+ "learning_rate": 4.1242937853107345e-05,
472
+ "loss": 0.0366,
473
+ "step": 6200
474
+ },
475
+ {
476
+ "epoch": 3.559322033898305,
477
+ "grad_norm": 3.5696895122528076,
478
+ "learning_rate": 4.110169491525424e-05,
479
+ "loss": 0.0576,
480
+ "step": 6300
481
+ },
482
+ {
483
+ "epoch": 3.615819209039548,
484
+ "grad_norm": 3.981534481048584,
485
+ "learning_rate": 4.096045197740113e-05,
486
+ "loss": 0.0574,
487
+ "step": 6400
488
+ },
489
+ {
490
+ "epoch": 3.672316384180791,
491
+ "grad_norm": 0.020425381138920784,
492
+ "learning_rate": 4.0819209039548024e-05,
493
+ "loss": 0.0582,
494
+ "step": 6500
495
+ },
496
+ {
497
+ "epoch": 3.7288135593220337,
498
+ "grad_norm": 1.6137280464172363,
499
+ "learning_rate": 4.067796610169492e-05,
500
+ "loss": 0.037,
501
+ "step": 6600
502
+ },
503
+ {
504
+ "epoch": 3.785310734463277,
505
+ "grad_norm": 0.41225990653038025,
506
+ "learning_rate": 4.053672316384181e-05,
507
+ "loss": 0.039,
508
+ "step": 6700
509
+ },
510
+ {
511
+ "epoch": 3.84180790960452,
512
+ "grad_norm": 0.02651926688849926,
513
+ "learning_rate": 4.0395480225988704e-05,
514
+ "loss": 0.052,
515
+ "step": 6800
516
+ },
517
+ {
518
+ "epoch": 3.898305084745763,
519
+ "grad_norm": 4.174577713012695,
520
+ "learning_rate": 4.025423728813559e-05,
521
+ "loss": 0.0746,
522
+ "step": 6900
523
+ },
524
+ {
525
+ "epoch": 3.9548022598870056,
526
+ "grad_norm": 0.06549729406833649,
527
+ "learning_rate": 4.011299435028249e-05,
528
+ "loss": 0.0682,
529
+ "step": 7000
530
+ },
531
+ {
532
+ "epoch": 4.0,
533
+ "eval_accuracy": 0.9619377162629758,
534
+ "eval_f1": 0.961922353652765,
535
+ "eval_loss": 0.19074885547161102,
536
+ "eval_runtime": 0.9577,
537
+ "eval_samples_per_second": 4224.832,
538
+ "eval_steps_per_second": 66.829,
539
+ "step": 7080
540
+ },
541
+ {
542
+ "epoch": 4.011299435028248,
543
+ "grad_norm": 0.09852942079305649,
544
+ "learning_rate": 3.997175141242938e-05,
545
+ "loss": 0.0573,
546
+ "step": 7100
547
+ },
548
+ {
549
+ "epoch": 4.067796610169491,
550
+ "grad_norm": 0.010253222659230232,
551
+ "learning_rate": 3.983050847457627e-05,
552
+ "loss": 0.0268,
553
+ "step": 7200
554
+ },
555
+ {
556
+ "epoch": 4.124293785310734,
557
+ "grad_norm": 0.05561167746782303,
558
+ "learning_rate": 3.968926553672316e-05,
559
+ "loss": 0.041,
560
+ "step": 7300
561
+ },
562
+ {
563
+ "epoch": 4.1807909604519775,
564
+ "grad_norm": 0.020777329802513123,
565
+ "learning_rate": 3.954802259887006e-05,
566
+ "loss": 0.0428,
567
+ "step": 7400
568
+ },
569
+ {
570
+ "epoch": 4.237288135593221,
571
+ "grad_norm": 0.011439072899520397,
572
+ "learning_rate": 3.940677966101695e-05,
573
+ "loss": 0.0281,
574
+ "step": 7500
575
+ },
576
+ {
577
+ "epoch": 4.293785310734464,
578
+ "grad_norm": 0.29063406586647034,
579
+ "learning_rate": 3.926553672316384e-05,
580
+ "loss": 0.0347,
581
+ "step": 7600
582
+ },
583
+ {
584
+ "epoch": 4.350282485875706,
585
+ "grad_norm": 0.008078676648437977,
586
+ "learning_rate": 3.9124293785310735e-05,
587
+ "loss": 0.0427,
588
+ "step": 7700
589
+ },
590
+ {
591
+ "epoch": 4.406779661016949,
592
+ "grad_norm": 0.0378178134560585,
593
+ "learning_rate": 3.898305084745763e-05,
594
+ "loss": 0.0448,
595
+ "step": 7800
596
+ },
597
+ {
598
+ "epoch": 4.463276836158192,
599
+ "grad_norm": 0.09035930037498474,
600
+ "learning_rate": 3.884180790960452e-05,
601
+ "loss": 0.0328,
602
+ "step": 7900
603
+ },
604
+ {
605
+ "epoch": 4.519774011299435,
606
+ "grad_norm": 0.05402543023228645,
607
+ "learning_rate": 3.8700564971751415e-05,
608
+ "loss": 0.0345,
609
+ "step": 8000
610
+ },
611
+ {
612
+ "epoch": 4.576271186440678,
613
+ "grad_norm": 0.01713019795715809,
614
+ "learning_rate": 3.855932203389831e-05,
615
+ "loss": 0.0358,
616
+ "step": 8100
617
+ },
618
+ {
619
+ "epoch": 4.632768361581921,
620
+ "grad_norm": 0.0475781112909317,
621
+ "learning_rate": 3.84180790960452e-05,
622
+ "loss": 0.0532,
623
+ "step": 8200
624
+ },
625
+ {
626
+ "epoch": 4.6892655367231635,
627
+ "grad_norm": 0.006405588239431381,
628
+ "learning_rate": 3.8276836158192094e-05,
629
+ "loss": 0.0324,
630
+ "step": 8300
631
+ },
632
+ {
633
+ "epoch": 4.745762711864407,
634
+ "grad_norm": 4.002650260925293,
635
+ "learning_rate": 3.813559322033898e-05,
636
+ "loss": 0.0526,
637
+ "step": 8400
638
+ },
639
+ {
640
+ "epoch": 4.80225988700565,
641
+ "grad_norm": 0.010295218788087368,
642
+ "learning_rate": 3.799435028248588e-05,
643
+ "loss": 0.0428,
644
+ "step": 8500
645
+ },
646
+ {
647
+ "epoch": 4.858757062146893,
648
+ "grad_norm": 25.15464973449707,
649
+ "learning_rate": 3.7853107344632774e-05,
650
+ "loss": 0.0513,
651
+ "step": 8600
652
+ },
653
+ {
654
+ "epoch": 4.915254237288136,
655
+ "grad_norm": 0.018476568162441254,
656
+ "learning_rate": 3.771186440677966e-05,
657
+ "loss": 0.0627,
658
+ "step": 8700
659
+ },
660
+ {
661
+ "epoch": 4.971751412429379,
662
+ "grad_norm": 0.02234013006091118,
663
+ "learning_rate": 3.7570621468926554e-05,
664
+ "loss": 0.0436,
665
+ "step": 8800
666
+ },
667
+ {
668
+ "epoch": 5.0,
669
+ "eval_accuracy": 0.9594661393969353,
670
+ "eval_f1": 0.9595618588245926,
671
+ "eval_loss": 0.24679133296012878,
672
+ "eval_runtime": 0.9625,
673
+ "eval_samples_per_second": 4203.719,
674
+ "eval_steps_per_second": 66.495,
675
+ "step": 8850
676
+ },
677
+ {
678
+ "epoch": 5.028248587570621,
679
+ "grad_norm": 0.007325501646846533,
680
+ "learning_rate": 3.7429378531073453e-05,
681
+ "loss": 0.011,
682
+ "step": 8900
683
+ },
684
+ {
685
+ "epoch": 5.084745762711864,
686
+ "grad_norm": 0.015004786662757397,
687
+ "learning_rate": 3.728813559322034e-05,
688
+ "loss": 0.0239,
689
+ "step": 9000
690
+ },
691
+ {
692
+ "epoch": 5.141242937853107,
693
+ "grad_norm": 0.02809782139956951,
694
+ "learning_rate": 3.714689265536723e-05,
695
+ "loss": 0.0468,
696
+ "step": 9100
697
+ },
698
+ {
699
+ "epoch": 5.19774011299435,
700
+ "grad_norm": 0.061971381306648254,
701
+ "learning_rate": 3.7005649717514126e-05,
702
+ "loss": 0.0369,
703
+ "step": 9200
704
+ },
705
+ {
706
+ "epoch": 5.254237288135593,
707
+ "grad_norm": 0.028554769232869148,
708
+ "learning_rate": 3.686440677966102e-05,
709
+ "loss": 0.0254,
710
+ "step": 9300
711
+ },
712
+ {
713
+ "epoch": 5.3107344632768365,
714
+ "grad_norm": 0.049820106476545334,
715
+ "learning_rate": 3.672316384180791e-05,
716
+ "loss": 0.0371,
717
+ "step": 9400
718
+ },
719
+ {
720
+ "epoch": 5.367231638418079,
721
+ "grad_norm": 0.016609592363238335,
722
+ "learning_rate": 3.6581920903954806e-05,
723
+ "loss": 0.0184,
724
+ "step": 9500
725
+ },
726
+ {
727
+ "epoch": 5.423728813559322,
728
+ "grad_norm": 0.05181876942515373,
729
+ "learning_rate": 3.644067796610169e-05,
730
+ "loss": 0.0414,
731
+ "step": 9600
732
+ },
733
+ {
734
+ "epoch": 5.480225988700565,
735
+ "grad_norm": 0.05821879953145981,
736
+ "learning_rate": 3.629943502824859e-05,
737
+ "loss": 0.0308,
738
+ "step": 9700
739
+ },
740
+ {
741
+ "epoch": 5.536723163841808,
742
+ "grad_norm": 0.010366985574364662,
743
+ "learning_rate": 3.6158192090395485e-05,
744
+ "loss": 0.0278,
745
+ "step": 9800
746
+ },
747
+ {
748
+ "epoch": 5.593220338983051,
749
+ "grad_norm": 0.019191740080714226,
750
+ "learning_rate": 3.601694915254237e-05,
751
+ "loss": 0.0435,
752
+ "step": 9900
753
+ },
754
+ {
755
+ "epoch": 5.649717514124294,
756
+ "grad_norm": 0.06532129645347595,
757
+ "learning_rate": 3.587570621468927e-05,
758
+ "loss": 0.0237,
759
+ "step": 10000
760
+ },
761
+ {
762
+ "epoch": 5.706214689265536,
763
+ "grad_norm": 0.009392981417477131,
764
+ "learning_rate": 3.573446327683616e-05,
765
+ "loss": 0.0334,
766
+ "step": 10100
767
+ },
768
+ {
769
+ "epoch": 5.762711864406779,
770
+ "grad_norm": 0.023171979933977127,
771
+ "learning_rate": 3.559322033898305e-05,
772
+ "loss": 0.0487,
773
+ "step": 10200
774
+ },
775
+ {
776
+ "epoch": 5.8192090395480225,
777
+ "grad_norm": 0.055124878883361816,
778
+ "learning_rate": 3.5451977401129944e-05,
779
+ "loss": 0.0412,
780
+ "step": 10300
781
+ },
782
+ {
783
+ "epoch": 5.875706214689266,
784
+ "grad_norm": 0.015424055978655815,
785
+ "learning_rate": 3.531073446327684e-05,
786
+ "loss": 0.0292,
787
+ "step": 10400
788
+ },
789
+ {
790
+ "epoch": 5.932203389830509,
791
+ "grad_norm": 0.6497403979301453,
792
+ "learning_rate": 3.516949152542373e-05,
793
+ "loss": 0.039,
794
+ "step": 10500
795
+ },
796
+ {
797
+ "epoch": 5.988700564971752,
798
+ "grad_norm": 18.98410415649414,
799
+ "learning_rate": 3.5028248587570624e-05,
800
+ "loss": 0.0322,
801
+ "step": 10600
802
+ },
803
+ {
804
+ "epoch": 6.0,
805
+ "eval_accuracy": 0.9614434008897677,
806
+ "eval_f1": 0.9615565973130906,
807
+ "eval_loss": 0.2411661297082901,
808
+ "eval_runtime": 0.9629,
809
+ "eval_samples_per_second": 4201.762,
810
+ "eval_steps_per_second": 66.464,
811
+ "step": 10620
812
+ },
813
+ {
814
+ "epoch": 6.045197740112994,
815
+ "grad_norm": 0.013564531691372395,
816
+ "learning_rate": 3.488700564971752e-05,
817
+ "loss": 0.0254,
818
+ "step": 10700
819
+ },
820
+ {
821
+ "epoch": 6.101694915254237,
822
+ "grad_norm": 33.3035888671875,
823
+ "learning_rate": 3.474576271186441e-05,
824
+ "loss": 0.0295,
825
+ "step": 10800
826
+ },
827
+ {
828
+ "epoch": 6.15819209039548,
829
+ "grad_norm": 0.12126260250806808,
830
+ "learning_rate": 3.46045197740113e-05,
831
+ "loss": 0.0275,
832
+ "step": 10900
833
+ },
834
+ {
835
+ "epoch": 6.214689265536723,
836
+ "grad_norm": 0.03739802539348602,
837
+ "learning_rate": 3.446327683615819e-05,
838
+ "loss": 0.0267,
839
+ "step": 11000
840
+ },
841
+ {
842
+ "epoch": 6.271186440677966,
843
+ "grad_norm": 0.03359340503811836,
844
+ "learning_rate": 3.432203389830508e-05,
845
+ "loss": 0.0389,
846
+ "step": 11100
847
+ },
848
+ {
849
+ "epoch": 6.327683615819209,
850
+ "grad_norm": 0.003635927801951766,
851
+ "learning_rate": 3.418079096045198e-05,
852
+ "loss": 0.0255,
853
+ "step": 11200
854
+ },
855
+ {
856
+ "epoch": 6.3841807909604515,
857
+ "grad_norm": 0.06124364957213402,
858
+ "learning_rate": 3.403954802259887e-05,
859
+ "loss": 0.0229,
860
+ "step": 11300
861
+ },
862
+ {
863
+ "epoch": 6.440677966101695,
864
+ "grad_norm": 0.026170525699853897,
865
+ "learning_rate": 3.389830508474576e-05,
866
+ "loss": 0.0319,
867
+ "step": 11400
868
+ },
869
+ {
870
+ "epoch": 6.497175141242938,
871
+ "grad_norm": 0.013875061646103859,
872
+ "learning_rate": 3.375706214689266e-05,
873
+ "loss": 0.0276,
874
+ "step": 11500
875
+ },
876
+ {
877
+ "epoch": 6.553672316384181,
878
+ "grad_norm": 0.009600764140486717,
879
+ "learning_rate": 3.361581920903955e-05,
880
+ "loss": 0.0182,
881
+ "step": 11600
882
+ },
883
+ {
884
+ "epoch": 6.610169491525424,
885
+ "grad_norm": 0.02147483266890049,
886
+ "learning_rate": 3.347457627118644e-05,
887
+ "loss": 0.0277,
888
+ "step": 11700
889
+ },
890
+ {
891
+ "epoch": 6.666666666666667,
892
+ "grad_norm": 0.007301884237676859,
893
+ "learning_rate": 3.3333333333333335e-05,
894
+ "loss": 0.0268,
895
+ "step": 11800
896
+ },
897
+ {
898
+ "epoch": 6.72316384180791,
899
+ "grad_norm": 0.008684027940034866,
900
+ "learning_rate": 3.319209039548023e-05,
901
+ "loss": 0.0244,
902
+ "step": 11900
903
+ },
904
+ {
905
+ "epoch": 6.779661016949152,
906
+ "grad_norm": 0.0058201453648507595,
907
+ "learning_rate": 3.305084745762712e-05,
908
+ "loss": 0.018,
909
+ "step": 12000
910
+ },
911
+ {
912
+ "epoch": 6.836158192090395,
913
+ "grad_norm": 0.015645477920770645,
914
+ "learning_rate": 3.2909604519774014e-05,
915
+ "loss": 0.0389,
916
+ "step": 12100
917
+ },
918
+ {
919
+ "epoch": 6.892655367231638,
920
+ "grad_norm": 0.013589623384177685,
921
+ "learning_rate": 3.27683615819209e-05,
922
+ "loss": 0.0271,
923
+ "step": 12200
924
+ },
925
+ {
926
+ "epoch": 6.9491525423728815,
927
+ "grad_norm": 0.004052096512168646,
928
+ "learning_rate": 3.26271186440678e-05,
929
+ "loss": 0.012,
930
+ "step": 12300
931
+ },
932
+ {
933
+ "epoch": 7.0,
934
+ "eval_accuracy": 0.963173504695996,
935
+ "eval_f1": 0.9631919351432553,
936
+ "eval_loss": 0.22085699439048767,
937
+ "eval_runtime": 0.9623,
938
+ "eval_samples_per_second": 4204.299,
939
+ "eval_steps_per_second": 66.504,
940
+ "step": 12390
941
+ },
942
+ {
943
+ "epoch": 7.005649717514125,
944
+ "grad_norm": 0.06705684214830399,
945
+ "learning_rate": 3.2485875706214694e-05,
946
+ "loss": 0.0309,
947
+ "step": 12400
948
+ },
949
+ {
950
+ "epoch": 7.062146892655368,
951
+ "grad_norm": 0.006240461952984333,
952
+ "learning_rate": 3.234463276836158e-05,
953
+ "loss": 0.0084,
954
+ "step": 12500
955
+ },
956
+ {
957
+ "epoch": 7.11864406779661,
958
+ "grad_norm": 0.020344626158475876,
959
+ "learning_rate": 3.2203389830508473e-05,
960
+ "loss": 0.0168,
961
+ "step": 12600
962
+ },
963
+ {
964
+ "epoch": 7.175141242937853,
965
+ "grad_norm": 0.003926662262529135,
966
+ "learning_rate": 3.2062146892655373e-05,
967
+ "loss": 0.022,
968
+ "step": 12700
969
+ },
970
+ {
971
+ "epoch": 7.231638418079096,
972
+ "grad_norm": 0.0025492089334875345,
973
+ "learning_rate": 3.192090395480226e-05,
974
+ "loss": 0.0133,
975
+ "step": 12800
976
+ },
977
+ {
978
+ "epoch": 7.288135593220339,
979
+ "grad_norm": 0.005623087752610445,
980
+ "learning_rate": 3.177966101694915e-05,
981
+ "loss": 0.0164,
982
+ "step": 12900
983
+ },
984
+ {
985
+ "epoch": 7.344632768361582,
986
+ "grad_norm": 0.0032459620852023363,
987
+ "learning_rate": 3.1638418079096046e-05,
988
+ "loss": 0.0272,
989
+ "step": 13000
990
+ },
991
+ {
992
+ "epoch": 7.401129943502825,
993
+ "grad_norm": 1.1293178796768188,
994
+ "learning_rate": 3.149717514124294e-05,
995
+ "loss": 0.0148,
996
+ "step": 13100
997
+ },
998
+ {
999
+ "epoch": 7.4576271186440675,
1000
+ "grad_norm": 0.0017996145179495215,
1001
+ "learning_rate": 3.135593220338983e-05,
1002
+ "loss": 0.0132,
1003
+ "step": 13200
1004
+ },
1005
+ {
1006
+ "epoch": 7.5141242937853105,
1007
+ "grad_norm": 0.008758709765970707,
1008
+ "learning_rate": 3.1214689265536726e-05,
1009
+ "loss": 0.0152,
1010
+ "step": 13300
1011
+ },
1012
+ {
1013
+ "epoch": 7.570621468926554,
1014
+ "grad_norm": 0.0038798090536147356,
1015
+ "learning_rate": 3.107344632768362e-05,
1016
+ "loss": 0.0106,
1017
+ "step": 13400
1018
+ },
1019
+ {
1020
+ "epoch": 7.627118644067797,
1021
+ "grad_norm": 0.005076746456325054,
1022
+ "learning_rate": 3.093220338983051e-05,
1023
+ "loss": 0.0158,
1024
+ "step": 13500
1025
+ },
1026
+ {
1027
+ "epoch": 7.68361581920904,
1028
+ "grad_norm": 0.003670661011710763,
1029
+ "learning_rate": 3.0790960451977405e-05,
1030
+ "loss": 0.0093,
1031
+ "step": 13600
1032
+ },
1033
+ {
1034
+ "epoch": 7.740112994350282,
1035
+ "grad_norm": 0.003522429848089814,
1036
+ "learning_rate": 3.064971751412429e-05,
1037
+ "loss": 0.0183,
1038
+ "step": 13700
1039
+ },
1040
+ {
1041
+ "epoch": 7.796610169491525,
1042
+ "grad_norm": 0.06700780242681503,
1043
+ "learning_rate": 3.050847457627119e-05,
1044
+ "loss": 0.0398,
1045
+ "step": 13800
1046
+ },
1047
+ {
1048
+ "epoch": 7.853107344632768,
1049
+ "grad_norm": 0.01462018396705389,
1050
+ "learning_rate": 3.036723163841808e-05,
1051
+ "loss": 0.0286,
1052
+ "step": 13900
1053
+ },
1054
+ {
1055
+ "epoch": 7.909604519774011,
1056
+ "grad_norm": 0.025290269404649734,
1057
+ "learning_rate": 3.022598870056497e-05,
1058
+ "loss": 0.037,
1059
+ "step": 14000
1060
+ },
1061
+ {
1062
+ "epoch": 7.966101694915254,
1063
+ "grad_norm": 0.011192042380571365,
1064
+ "learning_rate": 3.0084745762711864e-05,
1065
+ "loss": 0.0147,
1066
+ "step": 14100
1067
+ },
1068
+ {
1069
+ "epoch": 8.0,
1070
+ "eval_accuracy": 0.9636678200692042,
1071
+ "eval_f1": 0.9635820582698403,
1072
+ "eval_loss": 0.24865780770778656,
1073
+ "eval_runtime": 0.9484,
1074
+ "eval_samples_per_second": 4266.086,
1075
+ "eval_steps_per_second": 67.481,
1076
+ "step": 14160
1077
+ }
1078
+ ],
1079
+ "logging_steps": 100,
1080
+ "max_steps": 35400,
1081
+ "num_input_tokens_seen": 0,
1082
+ "num_train_epochs": 20,
1083
+ "save_steps": 500,
1084
+ "stateful_callbacks": {
1085
+ "EarlyStoppingCallback": {
1086
+ "args": {
1087
+ "early_stopping_patience": 3,
1088
+ "early_stopping_threshold": 0.0
1089
+ },
1090
+ "attributes": {
1091
+ "early_stopping_patience_counter": 0
1092
+ }
1093
+ },
1094
+ "TrainerControl": {
1095
+ "args": {
1096
+ "should_epoch_stop": false,
1097
+ "should_evaluate": false,
1098
+ "should_log": false,
1099
+ "should_save": true,
1100
+ "should_training_stop": false
1101
+ },
1102
+ "attributes": {}
1103
+ }
1104
+ },
1105
+ "total_flos": 1172336478105600.0,
1106
+ "train_batch_size": 16,
1107
+ "trial_name": null,
1108
+ "trial_params": null
1109
+ }
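
Note: trainer_state.json records 8 completed epochs (14,160 of a scheduled 35,400 steps at train batch size 16 over 20 epochs), per-epoch evaluation, and a best checkpoint selected at epoch 8 with eval_f1 ≈ 0.9636; EarlyStoppingCallback is armed with patience 3. A hedged reconstruction of a TrainingArguments setup consistent with this state; values not visible in the state, such as the learning rate, are inferred or assumed:

from transformers import EarlyStoppingCallback, TrainingArguments

# Settings mirrored from trainer_state.json: 20 epochs, batch size 16,
# logging every 100 steps, per-epoch eval/save, best model tracked on F1.
args = TrainingArguments(
    output_dir="./distilbert_multilingual_cased_greek_latin_classifiergreek",  # from best_model_checkpoint
    num_train_epochs=20,
    per_device_train_batch_size=16,
    learning_rate=5e-5,          # inferred from the linear decay in log_history
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # best_metric above equals eval_f1 at epoch 8
)
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
# These would be passed to Trainer(..., args=args, callbacks=[early_stopping]);
# trainer.train() then writes a trainer_state.json like the one above, and
# trainer.train(resume_from_checkpoint=True) would pick up from optimizer.pt,
# scheduler.pt, and rng_state.pth in the latest checkpoint.
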
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a911aa07d9ca1f76618367adcad9ebf5a951be0ecd3ebd50dbcf89c173eadc77
+ size 5304
vocab.txt ADDED
The diff for this file is too large to render. See raw diff