wangyichen25 committed on
Commit
bb0f5b0
·
verified ·
1 Parent(s): b4d4730

Training in progress, step 400, checkpoint

Browse files
checkpoint-400/adapter_config.json CHANGED
@@ -28,16 +28,16 @@
28
  "rank_pattern": {},
29
  "revision": null,
30
  "target_modules": [
31
- "fc1",
32
- "fc2",
33
  "o_proj",
 
34
  "gate_proj",
35
  "up_proj",
36
- "k_proj",
 
37
  "down_proj",
 
38
  "out_proj",
39
- "q_proj",
40
- "v_proj"
41
  ],
42
  "task_type": "CAUSAL_LM",
43
  "trainable_token_indices": null,
 
28
  "rank_pattern": {},
29
  "revision": null,
30
  "target_modules": [
 
 
31
  "o_proj",
32
+ "fc1",
33
  "gate_proj",
34
  "up_proj",
35
+ "q_proj",
36
+ "v_proj",
37
  "down_proj",
38
+ "k_proj",
39
  "out_proj",
40
+ "fc2"
 
41
  ],
42
  "task_type": "CAUSAL_LM",
43
  "trainable_token_indices": null,
checkpoint-400/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc79ea31c9f05159694a8268df9027be66e17bbfcb7f85c120751790564a14ef
3
  size 2839126480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36c2d2df5bbc160d2377615ced3af4094c4fc562df7296192de4abd2153f1941
3
  size 2839126480
checkpoint-400/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16cb7d77761b46bcbfc6a8b625ffd9ff283ea167c846c6b04f27c95c3674056c
3
  size 5678690152
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c04ac2cb09d32c0fff71c15b417d3b7fdefdcaa2f4380033bc13fd468fa77130
3
  size 5678690152
checkpoint-400/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ed6d8d369760e08757480bee1347f5f9a123bd537895c933990fed168f6608a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2123d2cdefe4b1df1b665235b0196d13a18a0218fcb3a07b661248d939b3ceaf
3
  size 1064
checkpoint-400/trainer_state.json CHANGED
@@ -11,382 +11,382 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.04162330905306972,
14
- "grad_norm": 20.466537475585938,
15
- "learning_rate": 4.957081545064378e-05,
16
- "loss": 8.8593,
17
- "mean_token_accuracy": 0.6828776024281978,
18
  "num_tokens": 161548.0,
19
  "step": 20
20
  },
21
  {
22
  "epoch": 0.04162330905306972,
23
- "eval_loss": 1.1315333843231201,
24
- "eval_mean_token_accuracy": 0.8237503719329834,
25
  "eval_num_tokens": 161548.0,
26
- "eval_runtime": 34.7122,
27
- "eval_samples_per_second": 2.881,
28
- "eval_steps_per_second": 0.72,
29
  "step": 20
30
  },
31
  {
32
  "epoch": 0.08324661810613944,
33
- "grad_norm": 2.744436025619507,
34
- "learning_rate": 4.742489270386266e-05,
35
- "loss": 1.8553,
36
- "mean_token_accuracy": 0.9392111219465733,
37
  "num_tokens": 323168.0,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.08324661810613944,
42
- "eval_loss": 0.16864576935768127,
43
- "eval_mean_token_accuracy": 0.9810383248329163,
44
  "eval_num_tokens": 323168.0,
45
- "eval_runtime": 34.3084,
46
- "eval_samples_per_second": 2.915,
47
- "eval_steps_per_second": 0.729,
48
  "step": 40
49
  },
50
  {
51
  "epoch": 0.12486992715920915,
52
- "grad_norm": 1.6022675037384033,
53
- "learning_rate": 4.527896995708155e-05,
54
- "loss": 0.3903,
55
- "mean_token_accuracy": 0.9888123281300067,
56
  "num_tokens": 484780.0,
57
  "step": 60
58
  },
59
  {
60
  "epoch": 0.12486992715920915,
61
- "eval_loss": 0.03530249744653702,
62
- "eval_mean_token_accuracy": 0.9926152086257934,
63
  "eval_num_tokens": 484780.0,
64
- "eval_runtime": 34.7758,
65
- "eval_samples_per_second": 2.876,
66
- "eval_steps_per_second": 0.719,
67
  "step": 60
68
  },
69
  {
70
  "epoch": 0.16649323621227888,
71
- "grad_norm": 1.2044650316238403,
72
- "learning_rate": 4.313304721030043e-05,
73
- "loss": 0.0664,
74
- "mean_token_accuracy": 0.9934882044792175,
75
  "num_tokens": 646431.0,
76
  "step": 80
77
  },
78
  {
79
  "epoch": 0.16649323621227888,
80
- "eval_loss": 0.012011010199785233,
81
- "eval_mean_token_accuracy": 0.9936227130889893,
82
  "eval_num_tokens": 646431.0,
83
- "eval_runtime": 34.3329,
84
- "eval_samples_per_second": 2.913,
85
- "eval_steps_per_second": 0.728,
86
  "step": 80
87
  },
88
  {
89
  "epoch": 0.2081165452653486,
90
- "grad_norm": 0.4138229191303253,
91
- "learning_rate": 4.098712446351932e-05,
92
- "loss": 0.0513,
93
- "mean_token_accuracy": 0.994041533768177,
94
  "num_tokens": 808054.0,
95
  "step": 100
96
  },
97
  {
98
  "epoch": 0.2081165452653486,
99
- "eval_loss": 0.011702906340360641,
100
- "eval_mean_token_accuracy": 0.9942703366279602,
101
  "eval_num_tokens": 808054.0,
102
- "eval_runtime": 34.3225,
103
- "eval_samples_per_second": 2.914,
104
  "eval_steps_per_second": 0.728,
105
  "step": 100
106
  },
107
  {
108
  "epoch": 0.2497398543184183,
109
- "grad_norm": 0.8489187955856323,
110
- "learning_rate": 3.88412017167382e-05,
111
- "loss": 0.0416,
112
- "mean_token_accuracy": 0.9943524189293385,
113
  "num_tokens": 969623.0,
114
  "step": 120
115
  },
116
  {
117
  "epoch": 0.2497398543184183,
118
- "eval_loss": 0.010666043497622013,
119
- "eval_mean_token_accuracy": 0.9943506979942321,
120
  "eval_num_tokens": 969623.0,
121
- "eval_runtime": 34.3761,
122
- "eval_samples_per_second": 2.909,
123
  "eval_steps_per_second": 0.727,
124
  "step": 120
125
  },
126
  {
127
  "epoch": 0.29136316337148804,
128
- "grad_norm": 0.49626249074935913,
129
- "learning_rate": 3.669527896995708e-05,
130
- "loss": 0.0384,
131
- "mean_token_accuracy": 0.9943184182047844,
132
  "num_tokens": 1131222.0,
133
  "step": 140
134
  },
135
  {
136
  "epoch": 0.29136316337148804,
137
- "eval_loss": 0.009807135909795761,
138
- "eval_mean_token_accuracy": 0.9946736145019531,
139
  "eval_num_tokens": 1131222.0,
140
- "eval_runtime": 34.34,
141
- "eval_samples_per_second": 2.912,
142
  "eval_steps_per_second": 0.728,
143
  "step": 140
144
  },
145
  {
146
  "epoch": 0.33298647242455776,
147
- "grad_norm": 0.5716305375099182,
148
- "learning_rate": 3.454935622317597e-05,
149
- "loss": 0.0384,
150
- "mean_token_accuracy": 0.9946457795798779,
151
  "num_tokens": 1292839.0,
152
  "step": 160
153
  },
154
  {
155
  "epoch": 0.33298647242455776,
156
- "eval_loss": 0.009297176264226437,
157
- "eval_mean_token_accuracy": 0.9946328043937683,
158
  "eval_num_tokens": 1292839.0,
159
- "eval_runtime": 34.557,
160
- "eval_samples_per_second": 2.894,
161
- "eval_steps_per_second": 0.723,
162
  "step": 160
163
  },
164
  {
165
  "epoch": 0.37460978147762747,
166
- "grad_norm": 0.4264802932739258,
167
- "learning_rate": 3.240343347639485e-05,
168
- "loss": 0.0373,
169
- "mean_token_accuracy": 0.9946053452789784,
170
  "num_tokens": 1454417.0,
171
  "step": 180
172
  },
173
  {
174
  "epoch": 0.37460978147762747,
175
- "eval_loss": 0.009250417351722717,
176
- "eval_mean_token_accuracy": 0.9949553918838501,
177
  "eval_num_tokens": 1454417.0,
178
- "eval_runtime": 34.3425,
179
- "eval_samples_per_second": 2.912,
180
- "eval_steps_per_second": 0.728,
181
  "step": 180
182
  },
183
  {
184
  "epoch": 0.4162330905306972,
185
- "grad_norm": 0.3315845727920532,
186
- "learning_rate": 3.0257510729613737e-05,
187
- "loss": 0.0326,
188
- "mean_token_accuracy": 0.9949214711785317,
189
  "num_tokens": 1615982.0,
190
  "step": 200
191
  },
192
  {
193
  "epoch": 0.4162330905306972,
194
- "eval_loss": 0.00862209778279066,
195
- "eval_mean_token_accuracy": 0.994794466495514,
196
  "eval_num_tokens": 1615982.0,
197
- "eval_runtime": 34.317,
198
- "eval_samples_per_second": 2.914,
199
- "eval_steps_per_second": 0.729,
200
  "step": 200
201
  },
202
  {
203
  "epoch": 0.4578563995837669,
204
- "grad_norm": 0.3415542542934418,
205
- "learning_rate": 2.811158798283262e-05,
206
- "loss": 0.0328,
207
- "mean_token_accuracy": 0.9948225237429142,
208
  "num_tokens": 1777586.0,
209
  "step": 220
210
  },
211
  {
212
  "epoch": 0.4578563995837669,
213
- "eval_loss": 0.008552273735404015,
214
- "eval_mean_token_accuracy": 0.9952386736869812,
215
  "eval_num_tokens": 1777586.0,
216
- "eval_runtime": 34.357,
217
- "eval_samples_per_second": 2.911,
218
- "eval_steps_per_second": 0.728,
219
  "step": 220
220
  },
221
  {
222
  "epoch": 0.4994797086368366,
223
- "grad_norm": 0.5801106691360474,
224
- "learning_rate": 2.59656652360515e-05,
225
- "loss": 0.0317,
226
- "mean_token_accuracy": 0.9949369013309479,
227
  "num_tokens": 1939204.0,
228
  "step": 240
229
  },
230
  {
231
  "epoch": 0.4994797086368366,
232
- "eval_loss": 0.008411003276705742,
233
- "eval_mean_token_accuracy": 0.9951176619529725,
234
  "eval_num_tokens": 1939204.0,
235
- "eval_runtime": 34.3383,
236
- "eval_samples_per_second": 2.912,
237
- "eval_steps_per_second": 0.728,
238
  "step": 240
239
  },
240
  {
241
  "epoch": 0.5411030176899063,
242
- "grad_norm": 0.3876211941242218,
243
- "learning_rate": 2.3819742489270388e-05,
244
- "loss": 0.0322,
245
- "mean_token_accuracy": 0.9952259331941604,
246
  "num_tokens": 2100821.0,
247
  "step": 260
248
  },
249
  {
250
  "epoch": 0.5411030176899063,
251
- "eval_loss": 0.008060808293521404,
252
- "eval_mean_token_accuracy": 0.9952384281158447,
253
  "eval_num_tokens": 2100821.0,
254
- "eval_runtime": 34.5381,
255
- "eval_samples_per_second": 2.895,
256
- "eval_steps_per_second": 0.724,
257
  "step": 260
258
  },
259
  {
260
  "epoch": 0.5827263267429761,
261
- "grad_norm": 0.3304857015609741,
262
- "learning_rate": 2.1673819742489272e-05,
263
- "loss": 0.0326,
264
- "mean_token_accuracy": 0.9948949187994003,
265
  "num_tokens": 2262372.0,
266
  "step": 280
267
  },
268
  {
269
  "epoch": 0.5827263267429761,
270
- "eval_loss": 0.008030685596168041,
271
- "eval_mean_token_accuracy": 0.9953594398498535,
272
  "eval_num_tokens": 2262372.0,
273
- "eval_runtime": 34.2589,
274
- "eval_samples_per_second": 2.919,
275
- "eval_steps_per_second": 0.73,
276
  "step": 280
277
  },
278
  {
279
  "epoch": 0.6243496357960457,
280
- "grad_norm": 0.262820839881897,
281
- "learning_rate": 1.9527896995708157e-05,
282
- "loss": 0.0295,
283
- "mean_token_accuracy": 0.9954247616231442,
284
  "num_tokens": 2423931.0,
285
  "step": 300
286
  },
287
  {
288
  "epoch": 0.6243496357960457,
289
- "eval_loss": 0.0076558589935302734,
290
- "eval_mean_token_accuracy": 0.9952384281158447,
291
  "eval_num_tokens": 2423931.0,
292
- "eval_runtime": 34.2994,
293
- "eval_samples_per_second": 2.915,
294
- "eval_steps_per_second": 0.729,
295
  "step": 300
296
  },
297
  {
298
  "epoch": 0.6659729448491155,
299
- "grad_norm": 0.4573795199394226,
300
- "learning_rate": 1.7381974248927038e-05,
301
- "loss": 0.0308,
302
- "mean_token_accuracy": 0.9953766994178295,
303
  "num_tokens": 2585532.0,
304
  "step": 320
305
  },
306
  {
307
  "epoch": 0.6659729448491155,
308
- "eval_loss": 0.007896814495325089,
309
- "eval_mean_token_accuracy": 0.9952785062789917,
310
  "eval_num_tokens": 2585532.0,
311
- "eval_runtime": 34.2979,
312
- "eval_samples_per_second": 2.916,
313
- "eval_steps_per_second": 0.729,
314
  "step": 320
315
  },
316
  {
317
  "epoch": 0.7075962539021852,
318
- "grad_norm": 0.3572976887226105,
319
- "learning_rate": 1.5236051502145923e-05,
320
- "loss": 0.0311,
321
- "mean_token_accuracy": 0.9952577523887157,
322
  "num_tokens": 2747068.0,
323
  "step": 340
324
  },
325
  {
326
  "epoch": 0.7075962539021852,
327
- "eval_loss": 0.007795471698045731,
328
- "eval_mean_token_accuracy": 0.9952789568901061,
329
  "eval_num_tokens": 2747068.0,
330
- "eval_runtime": 34.3024,
331
  "eval_samples_per_second": 2.915,
332
  "eval_steps_per_second": 0.729,
333
  "step": 340
334
  },
335
  {
336
  "epoch": 0.7492195629552549,
337
- "grad_norm": 0.25351160764694214,
338
- "learning_rate": 1.3090128755364809e-05,
339
- "loss": 0.0302,
340
- "mean_token_accuracy": 0.9953413404524326,
341
  "num_tokens": 2908697.0,
342
  "step": 360
343
  },
344
  {
345
  "epoch": 0.7492195629552549,
346
- "eval_loss": 0.007526129484176636,
347
- "eval_mean_token_accuracy": 0.9951572942733765,
348
  "eval_num_tokens": 2908697.0,
349
- "eval_runtime": 34.4802,
350
- "eval_samples_per_second": 2.9,
351
- "eval_steps_per_second": 0.725,
352
  "step": 360
353
  },
354
  {
355
  "epoch": 0.7908428720083247,
356
- "grad_norm": 0.45476171374320984,
357
- "learning_rate": 1.0944206008583692e-05,
358
- "loss": 0.0301,
359
- "mean_token_accuracy": 0.9954766884446145,
360
  "num_tokens": 3070284.0,
361
  "step": 380
362
  },
363
  {
364
  "epoch": 0.7908428720083247,
365
- "eval_loss": 0.007462680339813232,
366
- "eval_mean_token_accuracy": 0.9952787923812866,
367
  "eval_num_tokens": 3070284.0,
368
- "eval_runtime": 34.2958,
369
- "eval_samples_per_second": 2.916,
370
- "eval_steps_per_second": 0.729,
371
  "step": 380
372
  },
373
  {
374
  "epoch": 0.8324661810613944,
375
- "grad_norm": 0.37078818678855896,
376
- "learning_rate": 8.798283261802575e-06,
377
- "loss": 0.0298,
378
- "mean_token_accuracy": 0.9952507764101028,
379
  "num_tokens": 3231892.0,
380
  "step": 400
381
  },
382
  {
383
  "epoch": 0.8324661810613944,
384
- "eval_loss": 0.0073690456338226795,
385
- "eval_mean_token_accuracy": 0.9953188729286194,
386
  "eval_num_tokens": 3231892.0,
387
- "eval_runtime": 34.3243,
388
- "eval_samples_per_second": 2.913,
389
- "eval_steps_per_second": 0.728,
390
  "step": 400
391
  }
392
  ],
@@ -394,7 +394,7 @@
394
  "max_steps": 481,
395
  "num_input_tokens_seen": 0,
396
  "num_train_epochs": 1,
397
- "save_steps": 50,
398
  "stateful_callbacks": {
399
  "TrainerControl": {
400
  "args": {
 
11
  "log_history": [
12
  {
13
  "epoch": 0.04162330905306972,
14
+ "grad_norm": 2.6779890060424805,
15
+ "learning_rate": 0.00019828326180257511,
16
+ "loss": 5.368,
17
+ "mean_token_accuracy": 0.8072595901787281,
18
  "num_tokens": 161548.0,
19
  "step": 20
20
  },
21
  {
22
  "epoch": 0.04162330905306972,
23
+ "eval_loss": 0.12305498868227005,
24
+ "eval_mean_token_accuracy": 0.9847841203212738,
25
  "eval_num_tokens": 161548.0,
26
+ "eval_runtime": 68.9772,
27
+ "eval_samples_per_second": 2.9,
28
+ "eval_steps_per_second": 0.725,
29
  "step": 20
30
  },
31
  {
32
  "epoch": 0.08324661810613944,
33
+ "grad_norm": 0.5366652011871338,
34
+ "learning_rate": 0.00018969957081545064,
35
+ "loss": 0.1516,
36
+ "mean_token_accuracy": 0.9917096085846424,
37
  "num_tokens": 323168.0,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.08324661810613944,
42
+ "eval_loss": 0.013321125879883766,
43
+ "eval_mean_token_accuracy": 0.9938336944580078,
44
  "eval_num_tokens": 323168.0,
45
+ "eval_runtime": 68.8811,
46
+ "eval_samples_per_second": 2.904,
47
+ "eval_steps_per_second": 0.726,
48
  "step": 40
49
  },
50
  {
51
  "epoch": 0.12486992715920915,
52
+ "grad_norm": 0.4724605977535248,
53
+ "learning_rate": 0.0001811158798283262,
54
+ "loss": 0.0462,
55
+ "mean_token_accuracy": 0.9939016968011856,
56
  "num_tokens": 484780.0,
57
  "step": 60
58
  },
59
  {
60
  "epoch": 0.12486992715920915,
61
+ "eval_loss": 0.010577572509646416,
62
+ "eval_mean_token_accuracy": 0.99473925948143,
63
  "eval_num_tokens": 484780.0,
64
+ "eval_runtime": 68.7899,
65
+ "eval_samples_per_second": 2.907,
66
+ "eval_steps_per_second": 0.727,
67
  "step": 60
68
  },
69
  {
70
  "epoch": 0.16649323621227888,
71
+ "grad_norm": 0.33190304040908813,
72
+ "learning_rate": 0.00017253218884120172,
73
+ "loss": 0.0388,
74
+ "mean_token_accuracy": 0.9948750860989094,
75
  "num_tokens": 646431.0,
76
  "step": 80
77
  },
78
  {
79
  "epoch": 0.16649323621227888,
80
+ "eval_loss": 0.00940256379544735,
81
+ "eval_mean_token_accuracy": 0.9951227140426636,
82
  "eval_num_tokens": 646431.0,
83
+ "eval_runtime": 69.1687,
84
+ "eval_samples_per_second": 2.891,
85
+ "eval_steps_per_second": 0.723,
86
  "step": 80
87
  },
88
  {
89
  "epoch": 0.2081165452653486,
90
+ "grad_norm": 0.20264093577861786,
91
+ "learning_rate": 0.00016394849785407727,
92
+ "loss": 0.0367,
93
+ "mean_token_accuracy": 0.9950500458478928,
94
  "num_tokens": 808054.0,
95
  "step": 100
96
  },
97
  {
98
  "epoch": 0.2081165452653486,
99
+ "eval_loss": 0.008246215991675854,
100
+ "eval_mean_token_accuracy": 0.9951834440231323,
101
  "eval_num_tokens": 808054.0,
102
+ "eval_runtime": 68.7059,
103
+ "eval_samples_per_second": 2.911,
104
  "eval_steps_per_second": 0.728,
105
  "step": 100
106
  },
107
  {
108
  "epoch": 0.2497398543184183,
109
+ "grad_norm": 0.30675262212753296,
110
+ "learning_rate": 0.0001553648068669528,
111
+ "loss": 0.0341,
112
+ "mean_token_accuracy": 0.994870014488697,
113
  "num_tokens": 969623.0,
114
  "step": 120
115
  },
116
  {
117
  "epoch": 0.2497398543184183,
118
+ "eval_loss": 0.00862042885273695,
119
+ "eval_mean_token_accuracy": 0.994941633939743,
120
  "eval_num_tokens": 969623.0,
121
+ "eval_runtime": 68.8073,
122
+ "eval_samples_per_second": 2.907,
123
  "eval_steps_per_second": 0.727,
124
  "step": 120
125
  },
126
  {
127
  "epoch": 0.29136316337148804,
128
+ "grad_norm": 0.1850423365831375,
129
+ "learning_rate": 0.00014678111587982832,
130
+ "loss": 0.034,
131
+ "mean_token_accuracy": 0.9949115067720413,
132
  "num_tokens": 1131222.0,
133
  "step": 140
134
  },
135
  {
136
  "epoch": 0.29136316337148804,
137
+ "eval_loss": 0.007929541170597076,
138
+ "eval_mean_token_accuracy": 0.9951228404045105,
139
  "eval_num_tokens": 1131222.0,
140
+ "eval_runtime": 68.7004,
141
+ "eval_samples_per_second": 2.911,
142
  "eval_steps_per_second": 0.728,
143
  "step": 140
144
  },
145
  {
146
  "epoch": 0.33298647242455776,
147
+ "grad_norm": 0.3047815263271332,
148
+ "learning_rate": 0.00013819742489270387,
149
+ "loss": 0.0328,
150
+ "mean_token_accuracy": 0.9950883395969867,
151
  "num_tokens": 1292839.0,
152
  "step": 160
153
  },
154
  {
155
  "epoch": 0.33298647242455776,
156
+ "eval_loss": 0.007993862964212894,
157
+ "eval_mean_token_accuracy": 0.9951430022716522,
158
  "eval_num_tokens": 1292839.0,
159
+ "eval_runtime": 68.1668,
160
+ "eval_samples_per_second": 2.934,
161
+ "eval_steps_per_second": 0.733,
162
  "step": 160
163
  },
164
  {
165
  "epoch": 0.37460978147762747,
166
+ "grad_norm": 0.13929370045661926,
167
+ "learning_rate": 0.0001296137339055794,
168
+ "loss": 0.0331,
169
+ "mean_token_accuracy": 0.9951471641659737,
170
  "num_tokens": 1454417.0,
171
  "step": 180
172
  },
173
  {
174
  "epoch": 0.37460978147762747,
175
+ "eval_loss": 0.007801079656928778,
176
+ "eval_mean_token_accuracy": 0.9954658925533295,
177
  "eval_num_tokens": 1454417.0,
178
+ "eval_runtime": 68.0889,
179
+ "eval_samples_per_second": 2.937,
180
+ "eval_steps_per_second": 0.734,
181
  "step": 180
182
  },
183
  {
184
  "epoch": 0.4162330905306972,
185
+ "grad_norm": 0.29847878217697144,
186
+ "learning_rate": 0.00012103004291845495,
187
+ "loss": 0.0308,
188
+ "mean_token_accuracy": 0.9953480623662472,
189
  "num_tokens": 1615982.0,
190
  "step": 200
191
  },
192
  {
193
  "epoch": 0.4162330905306972,
194
+ "eval_loss": 0.007313677109777927,
195
+ "eval_mean_token_accuracy": 0.9954455089569092,
196
  "eval_num_tokens": 1615982.0,
197
+ "eval_runtime": 68.4814,
198
+ "eval_samples_per_second": 2.921,
199
+ "eval_steps_per_second": 0.73,
200
  "step": 200
201
  },
202
  {
203
  "epoch": 0.4578563995837669,
204
+ "grad_norm": 0.14924530684947968,
205
+ "learning_rate": 0.00011244635193133049,
206
+ "loss": 0.0298,
207
+ "mean_token_accuracy": 0.9952255949378014,
208
  "num_tokens": 1777586.0,
209
  "step": 220
210
  },
211
  {
212
  "epoch": 0.4578563995837669,
213
+ "eval_loss": 0.0077740405686199665,
214
+ "eval_mean_token_accuracy": 0.9953651452064514,
215
  "eval_num_tokens": 1777586.0,
216
+ "eval_runtime": 68.0902,
217
+ "eval_samples_per_second": 2.937,
218
+ "eval_steps_per_second": 0.734,
219
  "step": 220
220
  },
221
  {
222
  "epoch": 0.4994797086368366,
223
+ "grad_norm": 0.31947973370552063,
224
+ "learning_rate": 0.000103862660944206,
225
+ "loss": 0.0299,
226
+ "mean_token_accuracy": 0.9952647179365158,
227
  "num_tokens": 1939204.0,
228
  "step": 240
229
  },
230
  {
231
  "epoch": 0.4994797086368366,
232
+ "eval_loss": 0.007334359921514988,
233
+ "eval_mean_token_accuracy": 0.9954258131980896,
234
  "eval_num_tokens": 1939204.0,
235
+ "eval_runtime": 68.026,
236
+ "eval_samples_per_second": 2.94,
237
+ "eval_steps_per_second": 0.735,
238
  "step": 240
239
  },
240
  {
241
  "epoch": 0.5411030176899063,
242
+ "grad_norm": 0.15298807621002197,
243
+ "learning_rate": 9.527896995708155e-05,
244
+ "loss": 0.0305,
245
+ "mean_token_accuracy": 0.9951752610504627,
246
  "num_tokens": 2100821.0,
247
  "step": 260
248
  },
249
  {
250
  "epoch": 0.5411030176899063,
251
+ "eval_loss": 0.007329077925533056,
252
+ "eval_mean_token_accuracy": 0.9955062341690063,
253
  "eval_num_tokens": 2100821.0,
254
+ "eval_runtime": 68.3167,
255
+ "eval_samples_per_second": 2.928,
256
+ "eval_steps_per_second": 0.732,
257
  "step": 260
258
  },
259
  {
260
  "epoch": 0.5827263267429761,
261
+ "grad_norm": 0.1927741914987564,
262
+ "learning_rate": 8.669527896995709e-05,
263
+ "loss": 0.0301,
264
+ "mean_token_accuracy": 0.9952726632356643,
265
  "num_tokens": 2262372.0,
266
  "step": 280
267
  },
268
  {
269
  "epoch": 0.5827263267429761,
270
+ "eval_loss": 0.007559146732091904,
271
+ "eval_mean_token_accuracy": 0.9953450250625611,
272
  "eval_num_tokens": 2262372.0,
273
+ "eval_runtime": 68.1437,
274
+ "eval_samples_per_second": 2.935,
275
+ "eval_steps_per_second": 0.734,
276
  "step": 280
277
  },
278
  {
279
  "epoch": 0.6243496357960457,
280
+ "grad_norm": 0.15931323170661926,
281
+ "learning_rate": 7.811158798283263e-05,
282
+ "loss": 0.028,
283
+ "mean_token_accuracy": 0.9956002615392208,
284
  "num_tokens": 2423931.0,
285
  "step": 300
286
  },
287
  {
288
  "epoch": 0.6243496357960457,
289
+ "eval_loss": 0.007200200576335192,
290
+ "eval_mean_token_accuracy": 0.9955260717868805,
291
  "eval_num_tokens": 2423931.0,
292
+ "eval_runtime": 68.2059,
293
+ "eval_samples_per_second": 2.932,
294
+ "eval_steps_per_second": 0.733,
295
  "step": 300
296
  },
297
  {
298
  "epoch": 0.6659729448491155,
299
+ "grad_norm": 0.2115289717912674,
300
+ "learning_rate": 6.952789699570815e-05,
301
+ "loss": 0.0274,
302
+ "mean_token_accuracy": 0.9953135840594769,
303
  "num_tokens": 2585532.0,
304
  "step": 320
305
  },
306
  {
307
  "epoch": 0.6659729448491155,
308
+ "eval_loss": 0.00743951927870512,
309
+ "eval_mean_token_accuracy": 0.9954253661632538,
310
  "eval_num_tokens": 2585532.0,
311
+ "eval_runtime": 67.978,
312
+ "eval_samples_per_second": 2.942,
313
+ "eval_steps_per_second": 0.736,
314
  "step": 320
315
  },
316
  {
317
  "epoch": 0.7075962539021852,
318
+ "grad_norm": 0.19469623267650604,
319
+ "learning_rate": 6.094420600858369e-05,
320
+ "loss": 0.0291,
321
+ "mean_token_accuracy": 0.9953210979700089,
322
  "num_tokens": 2747068.0,
323
  "step": 340
324
  },
325
  {
326
  "epoch": 0.7075962539021852,
327
+ "eval_loss": 0.0072942511178553104,
328
+ "eval_mean_token_accuracy": 0.9953449034690857,
329
  "eval_num_tokens": 2747068.0,
330
+ "eval_runtime": 68.6035,
331
  "eval_samples_per_second": 2.915,
332
  "eval_steps_per_second": 0.729,
333
  "step": 340
334
  },
335
  {
336
  "epoch": 0.7492195629552549,
337
+ "grad_norm": 0.14993388950824738,
338
+ "learning_rate": 5.2360515021459236e-05,
339
+ "loss": 0.029,
340
+ "mean_token_accuracy": 0.9953904427587986,
341
  "num_tokens": 2908697.0,
342
  "step": 360
343
  },
344
  {
345
  "epoch": 0.7492195629552549,
346
+ "eval_loss": 0.0072441427037119865,
347
+ "eval_mean_token_accuracy": 0.9954857683181763,
348
  "eval_num_tokens": 2908697.0,
349
+ "eval_runtime": 68.0534,
350
+ "eval_samples_per_second": 2.939,
351
+ "eval_steps_per_second": 0.735,
352
  "step": 360
353
  },
354
  {
355
  "epoch": 0.7908428720083247,
356
+ "grad_norm": 0.19807656109333038,
357
+ "learning_rate": 4.377682403433477e-05,
358
+ "loss": 0.0289,
359
+ "mean_token_accuracy": 0.9953253343701363,
360
  "num_tokens": 3070284.0,
361
  "step": 380
362
  },
363
  {
364
  "epoch": 0.7908428720083247,
365
+ "eval_loss": 0.00713132182136178,
366
+ "eval_mean_token_accuracy": 0.9954454469680786,
367
  "eval_num_tokens": 3070284.0,
368
+ "eval_runtime": 68.0742,
369
+ "eval_samples_per_second": 2.938,
370
+ "eval_steps_per_second": 0.734,
371
  "step": 380
372
  },
373
  {
374
  "epoch": 0.8324661810613944,
375
+ "grad_norm": 0.124539814889431,
376
+ "learning_rate": 3.51931330472103e-05,
377
+ "loss": 0.0282,
378
+ "mean_token_accuracy": 0.9952500656247139,
379
  "num_tokens": 3231892.0,
380
  "step": 400
381
  },
382
  {
383
  "epoch": 0.8324661810613944,
384
+ "eval_loss": 0.007116459775716066,
385
+ "eval_mean_token_accuracy": 0.995526316165924,
386
  "eval_num_tokens": 3231892.0,
387
+ "eval_runtime": 68.1093,
388
+ "eval_samples_per_second": 2.936,
389
+ "eval_steps_per_second": 0.734,
390
  "step": 400
391
  }
392
  ],
 
394
  "max_steps": 481,
395
  "num_input_tokens_seen": 0,
396
  "num_train_epochs": 1,
397
+ "save_steps": 40,
398
  "stateful_callbacks": {
399
  "TrainerControl": {
400
  "args": {
checkpoint-400/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd8015c9bbe5c0dbff1905684d15785ccb3cb07ead997cd50301f35dcf48b143
3
- size 5880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a99c92db5718c8e3fa68a50a104ce7f740a033660d2ea251fbb6febbc7e4942
3
+ size 5816