wangyichen25 committed on
Commit
d1d885d
·
verified ·
1 Parent(s): 849f7d7

Training in progress, step 160, checkpoint

Browse files
checkpoint-160/adapter_config.json CHANGED
@@ -28,16 +28,16 @@
28
  "rank_pattern": {},
29
  "revision": null,
30
  "target_modules": [
31
- "fc1",
32
- "up_proj",
33
  "out_proj",
 
34
  "v_proj",
 
35
  "k_proj",
36
- "fc2",
37
- "o_proj",
38
  "gate_proj",
39
- "q_proj",
40
- "down_proj"
 
 
41
  ],
42
  "task_type": "CAUSAL_LM",
43
  "trainable_token_indices": null,
 
28
  "rank_pattern": {},
29
  "revision": null,
30
  "target_modules": [
 
 
31
  "out_proj",
32
+ "fc1",
33
  "v_proj",
34
+ "q_proj",
35
  "k_proj",
 
 
36
  "gate_proj",
37
+ "fc2",
38
+ "down_proj",
39
+ "up_proj",
40
+ "o_proj"
41
  ],
42
  "task_type": "CAUSAL_LM",
43
  "trainable_token_indices": null,
checkpoint-160/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49f822bd36c40f1f8d48f95730c252d06deb2b2d95d3f38005b8d3cf7c9a7818
3
  size 2839126480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a3201385808c47b65331ee939f23761a31b98b0bd57c121757f75a17849fd3a
3
  size 2839126480
checkpoint-160/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ed54e204775d4e28774400b23c0961258f8ecc95be90762a3104c7ebe7a5e1c
3
  size 5678690152
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c348f1c6a6a13c0e66551b7ac9b6ba26965e10ca620d5190054c149bf370f8b
3
  size 5678690152
checkpoint-160/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90b27d4306df35327ffa5e43da414e159713ff473950399f75ec7dea4aa95a62
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5ff978651d34714c2b8e82ce8fa64bf490387f31df2437808fa264e674cd0a5
3
  size 14244
checkpoint-160/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05d4908124e5c54ee16a9f1b4328ce9261238072041619ee6183bc5ca771da85
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be6316eb66d099c20b37cd5778a0d24974b075045152f1bd07af7071e41bfcfb
3
  size 1064
checkpoint-160/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7692307692307693,
6
  "eval_steps": 10,
7
  "global_step": 160,
8
  "is_hyper_param_search": false,
@@ -10,314 +10,314 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.04807692307692308,
14
- "grad_norm": 19.535831451416016,
15
- "learning_rate": 0.00019800995024875625,
16
- "loss": 15.9815,
17
- "mean_token_accuracy": 0.6259999990463256,
18
  "num_tokens": 49120.0,
19
  "step": 10
20
  },
21
  {
22
- "epoch": 0.04807692307692308,
23
- "eval_loss": 1.2629932165145874,
24
- "eval_mean_token_accuracy": 0.8946153659086961,
25
  "eval_num_tokens": 49120.0,
26
- "eval_runtime": 17.4778,
27
- "eval_samples_per_second": 2.861,
28
- "eval_steps_per_second": 0.744,
29
  "step": 10
30
  },
31
  {
32
- "epoch": 0.09615384615384616,
33
- "grad_norm": 7.8091559410095215,
34
- "learning_rate": 0.00018805970149253734,
35
- "loss": 2.5529,
36
- "mean_token_accuracy": 0.9347499996423722,
37
  "num_tokens": 98240.0,
38
  "step": 20
39
  },
40
  {
41
- "epoch": 0.09615384615384616,
42
- "eval_loss": 0.14167079329490662,
43
- "eval_mean_token_accuracy": 0.9773077139487634,
44
  "eval_num_tokens": 98240.0,
45
- "eval_runtime": 17.4221,
46
- "eval_samples_per_second": 2.87,
47
- "eval_steps_per_second": 0.746,
48
  "step": 20
49
  },
50
  {
51
- "epoch": 0.14423076923076922,
52
- "grad_norm": 3.069196939468384,
53
- "learning_rate": 0.00017810945273631843,
54
- "loss": 0.3788,
55
- "mean_token_accuracy": 0.9778750196099282,
56
  "num_tokens": 147360.0,
57
  "step": 30
58
  },
59
  {
60
- "epoch": 0.14423076923076922,
61
- "eval_loss": 0.03845745697617531,
62
- "eval_mean_token_accuracy": 0.9788461740200336,
63
  "eval_num_tokens": 147360.0,
64
- "eval_runtime": 17.5754,
65
- "eval_samples_per_second": 2.845,
66
- "eval_steps_per_second": 0.74,
67
  "step": 30
68
  },
69
  {
70
- "epoch": 0.19230769230769232,
71
- "grad_norm": 6.14934778213501,
72
- "learning_rate": 0.0001681592039800995,
73
- "loss": 0.1621,
74
- "mean_token_accuracy": 0.9760000124573708,
75
- "num_tokens": 196480.0,
76
  "step": 40
77
  },
78
  {
79
- "epoch": 0.19230769230769232,
80
- "eval_loss": 0.03434291481971741,
81
- "eval_mean_token_accuracy": 0.9788461740200336,
82
- "eval_num_tokens": 196480.0,
83
- "eval_runtime": 17.4843,
84
- "eval_samples_per_second": 2.86,
85
- "eval_steps_per_second": 0.744,
86
  "step": 40
87
  },
88
  {
89
- "epoch": 0.2403846153846154,
90
- "grad_norm": 7.388150691986084,
91
- "learning_rate": 0.00015820895522388059,
92
- "loss": 0.1563,
93
- "mean_token_accuracy": 0.9780000209808349,
94
- "num_tokens": 245600.0,
95
  "step": 50
96
  },
97
  {
98
- "epoch": 0.2403846153846154,
99
- "eval_loss": 0.05149823799729347,
100
- "eval_mean_token_accuracy": 0.9788461740200336,
101
- "eval_num_tokens": 245600.0,
102
- "eval_runtime": 17.5208,
103
- "eval_samples_per_second": 2.854,
104
- "eval_steps_per_second": 0.742,
105
  "step": 50
106
  },
107
  {
108
- "epoch": 0.28846153846153844,
109
- "grad_norm": 5.478137969970703,
110
- "learning_rate": 0.00014825870646766168,
111
- "loss": 0.177,
112
- "mean_token_accuracy": 0.9782500207424164,
113
- "num_tokens": 294720.0,
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.28846153846153844,
118
- "eval_loss": 0.03306853771209717,
119
- "eval_mean_token_accuracy": 0.9788461740200336,
120
- "eval_num_tokens": 294720.0,
121
- "eval_runtime": 17.4723,
122
  "eval_samples_per_second": 2.862,
123
  "eval_steps_per_second": 0.744,
124
  "step": 60
125
  },
126
  {
127
- "epoch": 0.33653846153846156,
128
- "grad_norm": 4.636706352233887,
129
- "learning_rate": 0.0001383084577114428,
130
- "loss": 0.1614,
131
- "mean_token_accuracy": 0.9776250213384629,
132
- "num_tokens": 343840.0,
133
  "step": 70
134
  },
135
  {
136
- "epoch": 0.33653846153846156,
137
- "eval_loss": 0.041035111993551254,
138
- "eval_mean_token_accuracy": 0.9788461740200336,
139
- "eval_num_tokens": 343840.0,
140
- "eval_runtime": 17.567,
141
- "eval_samples_per_second": 2.846,
142
- "eval_steps_per_second": 0.74,
143
  "step": 70
144
  },
145
  {
146
- "epoch": 0.38461538461538464,
147
- "grad_norm": 1.8013979196548462,
148
- "learning_rate": 0.00012835820895522389,
149
- "loss": 0.1449,
150
- "mean_token_accuracy": 0.9785000205039978,
151
- "num_tokens": 392960.0,
152
  "step": 80
153
  },
154
  {
155
- "epoch": 0.38461538461538464,
156
- "eval_loss": 0.033909473568201065,
157
- "eval_mean_token_accuracy": 0.9788461740200336,
158
- "eval_num_tokens": 392960.0,
159
- "eval_runtime": 17.4599,
160
- "eval_samples_per_second": 2.864,
161
- "eval_steps_per_second": 0.745,
162
  "step": 80
163
  },
164
  {
165
- "epoch": 0.4326923076923077,
166
- "grad_norm": 0.7925957441329956,
167
- "learning_rate": 0.00011840796019900498,
168
- "loss": 0.1343,
169
- "mean_token_accuracy": 0.979250019788742,
170
- "num_tokens": 442080.0,
171
  "step": 90
172
  },
173
  {
174
- "epoch": 0.4326923076923077,
175
- "eval_loss": 0.03350621089339256,
176
- "eval_mean_token_accuracy": 0.9788461740200336,
177
- "eval_num_tokens": 442080.0,
178
- "eval_runtime": 17.5312,
179
- "eval_samples_per_second": 2.852,
180
- "eval_steps_per_second": 0.742,
181
  "step": 90
182
  },
183
  {
184
- "epoch": 0.4807692307692308,
185
- "grad_norm": 3.3666160106658936,
186
- "learning_rate": 0.00010845771144278607,
187
- "loss": 0.1484,
188
- "mean_token_accuracy": 0.9778750211000442,
189
- "num_tokens": 491200.0,
190
  "step": 100
191
  },
192
  {
193
- "epoch": 0.4807692307692308,
194
- "eval_loss": 0.03422728180885315,
195
- "eval_mean_token_accuracy": 0.9788461740200336,
196
- "eval_num_tokens": 491200.0,
197
- "eval_runtime": 17.4416,
198
- "eval_samples_per_second": 2.867,
199
- "eval_steps_per_second": 0.745,
200
  "step": 100
201
  },
202
  {
203
- "epoch": 0.5288461538461539,
204
- "grad_norm": 1.1528574228286743,
205
- "learning_rate": 9.850746268656717e-05,
206
- "loss": 0.1372,
207
- "mean_token_accuracy": 0.9783750206232071,
208
- "num_tokens": 540320.0,
209
  "step": 110
210
  },
211
  {
212
- "epoch": 0.5288461538461539,
213
- "eval_loss": 0.03267410025000572,
214
- "eval_mean_token_accuracy": 0.9788461740200336,
215
- "eval_num_tokens": 540320.0,
216
- "eval_runtime": 17.6379,
217
- "eval_samples_per_second": 2.835,
218
- "eval_steps_per_second": 0.737,
219
  "step": 110
220
  },
221
  {
222
- "epoch": 0.5769230769230769,
223
- "grad_norm": 1.7770888805389404,
224
- "learning_rate": 8.855721393034826e-05,
225
- "loss": 0.1499,
226
- "mean_token_accuracy": 0.9773750215768814,
227
- "num_tokens": 589440.0,
228
  "step": 120
229
  },
230
  {
231
- "epoch": 0.5769230769230769,
232
- "eval_loss": 0.03415742516517639,
233
- "eval_mean_token_accuracy": 0.9788461740200336,
234
- "eval_num_tokens": 589440.0,
235
- "eval_runtime": 17.4268,
236
- "eval_samples_per_second": 2.869,
237
- "eval_steps_per_second": 0.746,
238
  "step": 120
239
  },
240
  {
241
- "epoch": 0.625,
242
- "grad_norm": 0.9424811601638794,
243
- "learning_rate": 7.860696517412935e-05,
244
- "loss": 0.1354,
245
- "mean_token_accuracy": 0.9777500197291374,
246
- "num_tokens": 638560.0,
247
  "step": 130
248
  },
249
  {
250
- "epoch": 0.625,
251
- "eval_loss": 0.03265358507633209,
252
- "eval_mean_token_accuracy": 0.9788461740200336,
253
- "eval_num_tokens": 638560.0,
254
- "eval_runtime": 17.6065,
255
- "eval_samples_per_second": 2.84,
256
- "eval_steps_per_second": 0.738,
257
  "step": 130
258
  },
259
  {
260
- "epoch": 0.6730769230769231,
261
- "grad_norm": 2.114447593688965,
262
- "learning_rate": 6.865671641791044e-05,
263
- "loss": 0.1343,
264
- "mean_token_accuracy": 0.97887502014637,
265
- "num_tokens": 687680.0,
266
  "step": 140
267
  },
268
  {
269
- "epoch": 0.6730769230769231,
270
- "eval_loss": 0.033645644783973694,
271
- "eval_mean_token_accuracy": 0.9788461740200336,
272
- "eval_num_tokens": 687680.0,
273
- "eval_runtime": 17.4491,
274
- "eval_samples_per_second": 2.865,
275
- "eval_steps_per_second": 0.745,
276
  "step": 140
277
  },
278
  {
279
- "epoch": 0.7211538461538461,
280
- "grad_norm": 1.5852668285369873,
281
- "learning_rate": 5.870646766169154e-05,
282
- "loss": 0.142,
283
- "mean_token_accuracy": 0.9767500206828117,
284
- "num_tokens": 736800.0,
285
  "step": 150
286
  },
287
  {
288
- "epoch": 0.7211538461538461,
289
- "eval_loss": 0.03248896449804306,
290
- "eval_mean_token_accuracy": 0.9788461740200336,
291
- "eval_num_tokens": 736800.0,
292
- "eval_runtime": 17.6031,
293
- "eval_samples_per_second": 2.84,
294
- "eval_steps_per_second": 0.739,
295
  "step": 150
296
  },
297
  {
298
- "epoch": 0.7692307692307693,
299
- "grad_norm": 0.6951805949211121,
300
- "learning_rate": 4.875621890547264e-05,
301
- "loss": 0.1486,
302
- "mean_token_accuracy": 0.9771250188350677,
303
- "num_tokens": 785920.0,
304
  "step": 160
305
  },
306
  {
307
- "epoch": 0.7692307692307693,
308
- "eval_loss": 0.033385761082172394,
309
- "eval_mean_token_accuracy": 0.9788461740200336,
310
- "eval_num_tokens": 785920.0,
311
- "eval_runtime": 17.4166,
312
- "eval_samples_per_second": 2.871,
313
- "eval_steps_per_second": 0.746,
314
  "step": 160
315
  }
316
  ],
317
  "logging_steps": 10,
318
- "max_steps": 208,
319
  "num_input_tokens_seen": 0,
320
- "num_train_epochs": 1,
321
  "save_steps": 20,
322
  "stateful_callbacks": {
323
  "TrainerControl": {
@@ -331,7 +331,7 @@
331
  "attributes": {}
332
  }
333
  },
334
- "total_flos": 2.043639724498944e+16,
335
  "train_batch_size": 4,
336
  "trial_name": null,
337
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.104575163398692,
6
  "eval_steps": 10,
7
  "global_step": 160,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.26143790849673204,
14
+ "grad_norm": 80.0328369140625,
15
+ "learning_rate": 1.9682539682539684e-05,
16
+ "loss": 22.6724,
17
+ "mean_token_accuracy": 0.49049999862909316,
18
  "num_tokens": 49120.0,
19
  "step": 10
20
  },
21
  {
22
+ "epoch": 0.26143790849673204,
23
+ "eval_loss": 4.981190204620361,
24
+ "eval_mean_token_accuracy": 0.5223076802033645,
25
  "eval_num_tokens": 49120.0,
26
+ "eval_runtime": 17.5192,
27
+ "eval_samples_per_second": 2.854,
28
+ "eval_steps_per_second": 0.742,
29
  "step": 10
30
  },
31
  {
32
+ "epoch": 0.5228758169934641,
33
+ "grad_norm": 57.1182746887207,
34
+ "learning_rate": 1.8624338624338625e-05,
35
+ "loss": 17.5658,
36
+ "mean_token_accuracy": 0.5591249987483025,
37
  "num_tokens": 98240.0,
38
  "step": 20
39
  },
40
  {
41
+ "epoch": 0.5228758169934641,
42
+ "eval_loss": 3.730682134628296,
43
+ "eval_mean_token_accuracy": 0.5742307580434359,
44
  "eval_num_tokens": 98240.0,
45
+ "eval_runtime": 17.4511,
46
+ "eval_samples_per_second": 2.865,
47
+ "eval_steps_per_second": 0.745,
48
  "step": 20
49
  },
50
  {
51
+ "epoch": 0.7843137254901961,
52
+ "grad_norm": 58.24352264404297,
53
+ "learning_rate": 1.7566137566137566e-05,
54
+ "loss": 13.113,
55
+ "mean_token_accuracy": 0.6388749912381172,
56
  "num_tokens": 147360.0,
57
  "step": 30
58
  },
59
  {
60
+ "epoch": 0.7843137254901961,
61
+ "eval_loss": 2.7529046535491943,
62
+ "eval_mean_token_accuracy": 0.7115384431985708,
63
  "eval_num_tokens": 147360.0,
64
+ "eval_runtime": 17.6786,
65
+ "eval_samples_per_second": 2.828,
66
+ "eval_steps_per_second": 0.735,
67
  "step": 30
68
  },
69
  {
70
+ "epoch": 1.026143790849673,
71
+ "grad_norm": 30.73020362854004,
72
+ "learning_rate": 1.6507936507936507e-05,
73
+ "loss": 8.7369,
74
+ "mean_token_accuracy": 0.7555405417004147,
75
+ "num_tokens": 192796.0,
76
  "step": 40
77
  },
78
  {
79
+ "epoch": 1.026143790849673,
80
+ "eval_loss": 1.836286187171936,
81
+ "eval_mean_token_accuracy": 0.8476923199800345,
82
+ "eval_num_tokens": 192796.0,
83
+ "eval_runtime": 17.4538,
84
+ "eval_samples_per_second": 2.865,
85
+ "eval_steps_per_second": 0.745,
86
  "step": 40
87
  },
88
  {
89
+ "epoch": 1.287581699346405,
90
+ "grad_norm": 21.440105438232422,
91
+ "learning_rate": 1.544973544973545e-05,
92
+ "loss": 6.2989,
93
+ "mean_token_accuracy": 0.8789999932050705,
94
+ "num_tokens": 241916.0,
95
  "step": 50
96
  },
97
  {
98
+ "epoch": 1.287581699346405,
99
+ "eval_loss": 1.3245856761932373,
100
+ "eval_mean_token_accuracy": 0.8942307508908786,
101
+ "eval_num_tokens": 241916.0,
102
+ "eval_runtime": 17.6384,
103
+ "eval_samples_per_second": 2.835,
104
+ "eval_steps_per_second": 0.737,
105
  "step": 50
106
  },
107
  {
108
+ "epoch": 1.5490196078431373,
109
+ "grad_norm": 10.908184051513672,
110
+ "learning_rate": 1.4391534391534392e-05,
111
+ "loss": 4.5427,
112
+ "mean_token_accuracy": 0.9057500049471855,
113
+ "num_tokens": 291036.0,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 1.5490196078431373,
118
+ "eval_loss": 0.9362902045249939,
119
+ "eval_mean_token_accuracy": 0.9126923221808213,
120
+ "eval_num_tokens": 291036.0,
121
+ "eval_runtime": 17.472,
122
  "eval_samples_per_second": 2.862,
123
  "eval_steps_per_second": 0.744,
124
  "step": 60
125
  },
126
  {
127
+ "epoch": 1.8104575163398693,
128
+ "grad_norm": 11.274917602539062,
129
+ "learning_rate": 1.3333333333333333e-05,
130
+ "loss": 3.4538,
131
+ "mean_token_accuracy": 0.9126250177621842,
132
+ "num_tokens": 340156.0,
133
  "step": 70
134
  },
135
  {
136
+ "epoch": 1.8104575163398693,
137
+ "eval_loss": 0.7567419409751892,
138
+ "eval_mean_token_accuracy": 0.9103846183189979,
139
+ "eval_num_tokens": 340156.0,
140
+ "eval_runtime": 17.5544,
141
+ "eval_samples_per_second": 2.848,
142
+ "eval_steps_per_second": 0.741,
143
  "step": 70
144
  },
145
  {
146
+ "epoch": 2.052287581699346,
147
+ "grad_norm": 13.183298110961914,
148
+ "learning_rate": 1.2275132275132276e-05,
149
+ "loss": 2.5315,
150
+ "mean_token_accuracy": 0.9293243321212562,
151
+ "num_tokens": 385592.0,
152
  "step": 80
153
  },
154
  {
155
+ "epoch": 2.052287581699346,
156
+ "eval_loss": 0.6147329807281494,
157
+ "eval_mean_token_accuracy": 0.9292307771169223,
158
+ "eval_num_tokens": 385592.0,
159
+ "eval_runtime": 17.582,
160
+ "eval_samples_per_second": 2.844,
161
+ "eval_steps_per_second": 0.739,
162
  "step": 80
163
  },
164
  {
165
+ "epoch": 2.313725490196078,
166
+ "grad_norm": 12.541313171386719,
167
+ "learning_rate": 1.1216931216931217e-05,
168
+ "loss": 2.2552,
169
+ "mean_token_accuracy": 0.9353750020265579,
170
+ "num_tokens": 434712.0,
171
  "step": 90
172
  },
173
  {
174
+ "epoch": 2.313725490196078,
175
+ "eval_loss": 0.5065863132476807,
176
+ "eval_mean_token_accuracy": 0.9465384529187129,
177
+ "eval_num_tokens": 434712.0,
178
+ "eval_runtime": 17.6818,
179
+ "eval_samples_per_second": 2.828,
180
+ "eval_steps_per_second": 0.735,
181
  "step": 90
182
  },
183
  {
184
+ "epoch": 2.57516339869281,
185
+ "grad_norm": 7.007925033569336,
186
+ "learning_rate": 1.015873015873016e-05,
187
+ "loss": 1.9547,
188
+ "mean_token_accuracy": 0.9474999904632568,
189
+ "num_tokens": 483832.0,
190
  "step": 100
191
  },
192
  {
193
+ "epoch": 2.57516339869281,
194
+ "eval_loss": 0.4519544243812561,
195
+ "eval_mean_token_accuracy": 0.9511538331325238,
196
+ "eval_num_tokens": 483832.0,
197
+ "eval_runtime": 17.5399,
198
+ "eval_samples_per_second": 2.851,
199
+ "eval_steps_per_second": 0.741,
200
  "step": 100
201
  },
202
  {
203
+ "epoch": 2.8366013071895426,
204
+ "grad_norm": 6.267516613006592,
205
+ "learning_rate": 9.1005291005291e-06,
206
+ "loss": 1.7199,
207
+ "mean_token_accuracy": 0.9503749877214431,
208
+ "num_tokens": 532952.0,
209
  "step": 110
210
  },
211
  {
212
+ "epoch": 2.8366013071895426,
213
+ "eval_loss": 0.396070659160614,
214
+ "eval_mean_token_accuracy": 0.9519230631681589,
215
+ "eval_num_tokens": 532952.0,
216
+ "eval_runtime": 17.6107,
217
+ "eval_samples_per_second": 2.839,
218
+ "eval_steps_per_second": 0.738,
219
  "step": 110
220
  },
221
  {
222
+ "epoch": 3.0784313725490198,
223
+ "grad_norm": 6.098326683044434,
224
+ "learning_rate": 8.042328042328043e-06,
225
+ "loss": 1.3242,
226
+ "mean_token_accuracy": 0.9683783892038706,
227
+ "num_tokens": 578388.0,
228
  "step": 120
229
  },
230
  {
231
+ "epoch": 3.0784313725490198,
232
+ "eval_loss": 0.3188876807689667,
233
+ "eval_mean_token_accuracy": 0.9742307938062228,
234
+ "eval_num_tokens": 578388.0,
235
+ "eval_runtime": 17.8412,
236
+ "eval_samples_per_second": 2.802,
237
+ "eval_steps_per_second": 0.729,
238
  "step": 120
239
  },
240
  {
241
+ "epoch": 3.3398692810457518,
242
+ "grad_norm": 9.010005950927734,
243
+ "learning_rate": 6.984126984126984e-06,
244
+ "loss": 1.1048,
245
+ "mean_token_accuracy": 0.9745000153779984,
246
+ "num_tokens": 627508.0,
247
  "step": 130
248
  },
249
  {
250
+ "epoch": 3.3398692810457518,
251
+ "eval_loss": 0.2367120385169983,
252
+ "eval_mean_token_accuracy": 0.9719230899443994,
253
+ "eval_num_tokens": 627508.0,
254
+ "eval_runtime": 17.8266,
255
+ "eval_samples_per_second": 2.805,
256
+ "eval_steps_per_second": 0.729,
257
  "step": 130
258
  },
259
  {
260
+ "epoch": 3.6013071895424837,
261
+ "grad_norm": 8.258376121520996,
262
+ "learning_rate": 5.925925925925926e-06,
263
+ "loss": 0.7666,
264
+ "mean_token_accuracy": 0.9731250181794167,
265
+ "num_tokens": 676628.0,
266
  "step": 140
267
  },
268
  {
269
+ "epoch": 3.6013071895424837,
270
+ "eval_loss": 0.14354808628559113,
271
+ "eval_mean_token_accuracy": 0.9723077049622169,
272
+ "eval_num_tokens": 676628.0,
273
+ "eval_runtime": 17.4766,
274
+ "eval_samples_per_second": 2.861,
275
+ "eval_steps_per_second": 0.744,
276
  "step": 140
277
  },
278
  {
279
+ "epoch": 3.8627450980392157,
280
+ "grad_norm": 7.381052494049072,
281
+ "learning_rate": 4.867724867724868e-06,
282
+ "loss": 0.496,
283
+ "mean_token_accuracy": 0.9710000097751618,
284
+ "num_tokens": 725748.0,
285
  "step": 150
286
  },
287
  {
288
+ "epoch": 3.8627450980392157,
289
+ "eval_loss": 0.1069604679942131,
290
+ "eval_mean_token_accuracy": 0.975769253877493,
291
+ "eval_num_tokens": 725748.0,
292
+ "eval_runtime": 17.6354,
293
+ "eval_samples_per_second": 2.835,
294
+ "eval_steps_per_second": 0.737,
295
  "step": 150
296
  },
297
  {
298
+ "epoch": 4.104575163398692,
299
+ "grad_norm": 4.855284690856934,
300
+ "learning_rate": 3.80952380952381e-06,
301
+ "loss": 0.3519,
302
+ "mean_token_accuracy": 0.9745946172121409,
303
+ "num_tokens": 771184.0,
304
  "step": 160
305
  },
306
  {
307
+ "epoch": 4.104575163398692,
308
+ "eval_loss": 0.08544553071260452,
309
+ "eval_mean_token_accuracy": 0.975769253877493,
310
+ "eval_num_tokens": 771184.0,
311
+ "eval_runtime": 17.4807,
312
+ "eval_samples_per_second": 2.86,
313
+ "eval_steps_per_second": 0.744,
314
  "step": 160
315
  }
316
  ],
317
  "logging_steps": 10,
318
+ "max_steps": 195,
319
  "num_input_tokens_seen": 0,
320
+ "num_train_epochs": 5,
321
  "save_steps": 20,
322
  "stateful_callbacks": {
323
  "TrainerControl": {
 
331
  "attributes": {}
332
  }
333
  },
334
+ "total_flos": 2.005321479664589e+16,
335
  "train_batch_size": 4,
336
  "trial_name": null,
337
  "trial_params": null
checkpoint-160/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09a9baff00283695ce70ca75d7d996c90cc5734afa7d9ed3f9a6113fde47b709
3
  size 5880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fa3299c8cd6826c610bec43e932e8377d63e21789add24e4c875f4d75cb544d
3
  size 5880