yeok commited on
Commit
049da22
·
verified ·
1 Parent(s): 1691d17

Model save

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +486 -262
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yeokch/stream-of-search-train/runs/q51rp2gs)
31
 
32
 
33
  This model was trained with SFT.
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yeokch/stream-of-search-train/runs/avxmo5b9)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 7.11765772950569e+16,
3
- "train_loss": 0.6465841082908846,
4
- "train_runtime": 5579.7567,
5
- "train_samples": 6400,
6
- "train_samples_per_second": 0.711,
7
- "train_steps_per_second": 0.044
8
  }
 
1
  {
2
+ "total_flos": 1.112161775475753e+17,
3
+ "train_loss": 0.24216800009339584,
4
+ "train_runtime": 6297.1418,
5
+ "train_samples": 10000,
6
+ "train_samples_per_second": 0.983,
7
+ "train_steps_per_second": 0.061
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 7.11765772950569e+16,
3
- "train_loss": 0.6465841082908846,
4
- "train_runtime": 5579.7567,
5
- "train_samples": 6400,
6
- "train_samples_per_second": 0.711,
7
- "train_steps_per_second": 0.044
8
  }
 
1
  {
2
+ "total_flos": 1.112161775475753e+17,
3
+ "train_loss": 0.24216800009339584,
4
+ "train_runtime": 6297.1418,
5
+ "train_samples": 10000,
6
+ "train_samples_per_second": 0.983,
7
+ "train_steps_per_second": 0.061
8
  }
trainer_state.json CHANGED
@@ -2,426 +2,650 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9964699949571356,
6
  "eval_steps": 500,
7
- "global_step": 247,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.004034291477559254,
14
- "grad_norm": 0.31632325053215027,
15
- "learning_rate": 8.000000000000001e-06,
16
- "loss": 1.0252,
17
- "mean_token_accuracy": 0.7187690734863281,
18
  "step": 1
19
  },
20
  {
21
- "epoch": 0.020171457387796268,
22
- "grad_norm": 0.3391473889350891,
23
- "learning_rate": 4e-05,
24
- "loss": 1.0786,
25
- "mean_token_accuracy": 0.7031630147248507,
26
  "step": 5
27
  },
28
  {
29
- "epoch": 0.040342914775592535,
30
- "grad_norm": 0.2010446935892105,
31
- "learning_rate": 8e-05,
32
- "loss": 1.0307,
33
- "mean_token_accuracy": 0.7161946006119251,
34
  "step": 10
35
  },
36
  {
37
- "epoch": 0.060514372163388806,
38
- "grad_norm": 0.26254045963287354,
39
- "learning_rate": 0.00012,
40
- "loss": 0.9875,
41
- "mean_token_accuracy": 0.7219432845711709,
42
  "step": 15
43
  },
44
  {
45
- "epoch": 0.08068582955118507,
46
- "grad_norm": 0.2110530585050583,
47
- "learning_rate": 0.00016,
48
- "loss": 0.971,
49
- "mean_token_accuracy": 0.7234235741198063,
50
  "step": 20
51
  },
52
  {
53
- "epoch": 0.10085728693898134,
54
- "grad_norm": 0.17029768228530884,
55
- "learning_rate": 0.0002,
56
- "loss": 0.8975,
57
- "mean_token_accuracy": 0.7393156312406063,
58
  "step": 25
59
  },
60
  {
61
- "epoch": 0.12102874432677761,
62
- "grad_norm": 0.1730598360300064,
63
- "learning_rate": 0.00019974977965945,
64
- "loss": 0.8406,
65
- "mean_token_accuracy": 0.7519014693796635,
66
  "step": 30
67
  },
68
  {
69
- "epoch": 0.14120020171457387,
70
- "grad_norm": 0.146613210439682,
71
- "learning_rate": 0.00019900037084217637,
72
- "loss": 0.7982,
73
- "mean_token_accuracy": 0.7588404878973961,
74
  "step": 35
75
  },
76
  {
77
- "epoch": 0.16137165910237014,
78
- "grad_norm": 0.11783485114574432,
79
- "learning_rate": 0.00019775552389476864,
80
- "loss": 0.7373,
81
- "mean_token_accuracy": 0.7751434445381165,
82
  "step": 40
83
  },
84
  {
85
- "epoch": 0.1815431164901664,
86
- "grad_norm": 0.13487069308757782,
87
- "learning_rate": 0.00019602146853776894,
88
- "loss": 0.7216,
89
- "mean_token_accuracy": 0.7767107143998147,
90
  "step": 45
91
  },
92
  {
93
- "epoch": 0.20171457387796268,
94
- "grad_norm": 0.13654637336730957,
95
- "learning_rate": 0.0001938068826896166,
96
- "loss": 0.6875,
97
- "mean_token_accuracy": 0.7851818971335888,
98
  "step": 50
99
  },
100
  {
101
- "epoch": 0.22188603126575895,
102
- "grad_norm": 0.1778300255537033,
103
- "learning_rate": 0.0001911228490388136,
104
- "loss": 0.6764,
105
- "mean_token_accuracy": 0.7869231440126896,
106
  "step": 55
107
  },
108
  {
109
- "epoch": 0.24205748865355523,
110
- "grad_norm": 0.19565220177173615,
111
- "learning_rate": 0.00018798279958164295,
112
- "loss": 0.667,
113
- "mean_token_accuracy": 0.7879578113555908,
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.2622289460413515,
118
- "grad_norm": 0.17145097255706787,
119
- "learning_rate": 0.00018440244840299506,
120
- "loss": 0.6592,
121
- "mean_token_accuracy": 0.7885651886463165,
122
  "step": 65
123
  },
124
  {
125
- "epoch": 0.28240040342914774,
126
- "grad_norm": 0.21791349351406097,
127
- "learning_rate": 0.00018039971303669407,
128
- "loss": 0.6444,
129
- "mean_token_accuracy": 0.7919591583311558,
130
  "step": 70
131
  },
132
  {
133
- "epoch": 0.30257186081694404,
134
- "grad_norm": 0.17937114834785461,
135
- "learning_rate": 0.00017599462479886974,
136
- "loss": 0.6393,
137
- "mean_token_accuracy": 0.7939399912953377,
138
  "step": 75
139
  },
140
  {
141
- "epoch": 0.3227433182047403,
142
- "grad_norm": 0.19707709550857544,
143
- "learning_rate": 0.00017120922854310257,
144
- "loss": 0.6143,
145
- "mean_token_accuracy": 0.8009873569011688,
146
  "step": 80
147
  },
148
  {
149
- "epoch": 0.3429147755925366,
150
- "grad_norm": 0.15954312682151794,
151
- "learning_rate": 0.00016606747233900815,
152
- "loss": 0.6236,
153
- "mean_token_accuracy": 0.7977978855371475,
154
  "step": 85
155
  },
156
  {
157
- "epoch": 0.3630862329803328,
158
- "grad_norm": 0.21868781745433807,
159
- "learning_rate": 0.00016059508762635482,
160
- "loss": 0.6133,
161
- "mean_token_accuracy": 0.7997257344424724,
162
  "step": 90
163
  },
164
  {
165
- "epoch": 0.3832576903681291,
166
- "grad_norm": 0.16990062594413757,
167
- "learning_rate": 0.00015481946044447099,
168
- "loss": 0.6019,
169
- "mean_token_accuracy": 0.8031925238668919,
170
  "step": 95
171
  },
172
  {
173
- "epoch": 0.40342914775592537,
174
- "grad_norm": 0.21849067509174347,
175
- "learning_rate": 0.00014876949438136347,
176
- "loss": 0.6088,
177
- "mean_token_accuracy": 0.8004456080496312,
178
  "step": 100
179
  },
180
  {
181
- "epoch": 0.4236006051437216,
182
- "grad_norm": 0.19513824582099915,
183
- "learning_rate": 0.0001424754659284048,
184
- "loss": 0.5942,
185
- "mean_token_accuracy": 0.804381325095892,
186
  "step": 105
187
  },
188
  {
189
- "epoch": 0.4437720625315179,
190
- "grad_norm": 0.1967114359140396,
191
- "learning_rate": 0.0001359688729644536,
192
- "loss": 0.6032,
193
- "mean_token_accuracy": 0.8013887144625187,
194
  "step": 110
195
  },
196
  {
197
- "epoch": 0.46394351991931415,
198
- "grad_norm": 0.23586668074131012,
199
- "learning_rate": 0.00012928227712765504,
200
- "loss": 0.5964,
201
- "mean_token_accuracy": 0.8034794270992279,
202
  "step": 115
203
  },
204
  {
205
- "epoch": 0.48411497730711045,
206
- "grad_norm": 0.2300032526254654,
207
- "learning_rate": 0.00012244914086375724,
208
- "loss": 0.6001,
209
- "mean_token_accuracy": 0.8025347903370857,
210
  "step": 120
211
  },
212
  {
213
- "epoch": 0.5042864346949067,
214
- "grad_norm": 0.20042847096920013,
215
- "learning_rate": 0.00011550365996641979,
216
- "loss": 0.5815,
217
- "mean_token_accuracy": 0.8074655055999755,
218
  "step": 125
219
  },
220
  {
221
- "epoch": 0.524457892082703,
222
- "grad_norm": 0.17820023000240326,
223
- "learning_rate": 0.00010848059244755093,
224
- "loss": 0.583,
225
- "mean_token_accuracy": 0.8069817453622818,
226
  "step": 130
227
  },
228
  {
229
- "epoch": 0.5446293494704992,
230
- "grad_norm": 0.18053625524044037,
231
- "learning_rate": 0.00010141508459407623,
232
- "loss": 0.5736,
233
- "mean_token_accuracy": 0.8103940255939961,
234
  "step": 135
235
  },
236
  {
237
- "epoch": 0.5648008068582955,
238
- "grad_norm": 0.18805579841136932,
239
- "learning_rate": 9.434249508162076e-05,
240
- "loss": 0.5867,
241
- "mean_token_accuracy": 0.8052298232913018,
242
  "step": 140
243
  },
244
  {
245
- "epoch": 0.5849722642460918,
246
- "grad_norm": 0.20117288827896118,
247
- "learning_rate": 8.729821802531212e-05,
248
- "loss": 0.5855,
249
- "mean_token_accuracy": 0.8056235462427139,
250
  "step": 145
251
  },
252
  {
253
- "epoch": 0.6051437216338881,
254
- "grad_norm": 0.1941426694393158,
255
- "learning_rate": 8.031750585322947e-05,
256
- "loss": 0.5769,
257
- "mean_token_accuracy": 0.8083704620599746,
258
  "step": 150
259
  },
260
  {
261
- "epoch": 0.6253151790216843,
262
- "grad_norm": 0.22273395955562592,
263
- "learning_rate": 7.343529288891239e-05,
264
- "loss": 0.5673,
265
- "mean_token_accuracy": 0.8112946927547455,
266
  "step": 155
267
  },
268
  {
269
- "epoch": 0.6454866364094806,
270
- "grad_norm": 0.1648331880569458,
271
- "learning_rate": 6.668602052579424e-05,
272
- "loss": 0.5638,
273
- "mean_token_accuracy": 0.8121352635324002,
274
  "step": 160
275
  },
276
  {
277
- "epoch": 0.6656580937972768,
278
- "grad_norm": 0.16016638278961182,
279
- "learning_rate": 6.010346486845837e-05,
280
- "loss": 0.5612,
281
- "mean_token_accuracy": 0.8127746880054474,
282
  "step": 165
283
  },
284
  {
285
- "epoch": 0.6858295511850732,
286
- "grad_norm": 0.20431025326251984,
287
- "learning_rate": 5.372056770327013e-05,
288
- "loss": 0.5696,
289
- "mean_token_accuracy": 0.8099606201052666,
290
  "step": 170
291
  },
292
  {
293
- "epoch": 0.7060010085728694,
294
- "grad_norm": 0.15511010587215424,
295
- "learning_rate": 4.756927164427685e-05,
296
- "loss": 0.5678,
297
- "mean_token_accuracy": 0.8108936175704002,
298
  "step": 175
299
  },
300
  {
301
- "epoch": 0.7261724659606656,
302
- "grad_norm": 0.16541433334350586,
303
- "learning_rate": 4.168036027937267e-05,
304
- "loss": 0.5678,
305
- "mean_token_accuracy": 0.8105023756623269,
306
  "step": 180
307
  },
308
  {
309
- "epoch": 0.7463439233484619,
310
- "grad_norm": 0.17240563035011292,
311
- "learning_rate": 3.6083304116701535e-05,
312
- "loss": 0.5657,
313
- "mean_token_accuracy": 0.811833106726408,
314
  "step": 185
315
  },
316
  {
317
- "epoch": 0.7665153807362582,
318
- "grad_norm": 0.2840154767036438,
319
- "learning_rate": 3.080611310224539e-05,
320
- "loss": 0.5645,
321
- "mean_token_accuracy": 0.8111509509384632,
322
  "step": 190
323
  },
324
  {
325
- "epoch": 0.7866868381240545,
326
- "grad_norm": 0.17428910732269287,
327
- "learning_rate": 2.587519644666001e-05,
328
- "loss": 0.5676,
329
- "mean_token_accuracy": 0.8105710506439209,
330
  "step": 195
331
  },
332
  {
333
- "epoch": 0.8068582955118507,
334
- "grad_norm": 0.16991418600082397,
335
- "learning_rate": 2.1315230462840985e-05,
336
- "loss": 0.5688,
337
- "mean_token_accuracy": 0.8101178124547005,
338
  "step": 200
339
  },
340
  {
341
- "epoch": 0.827029752899647,
342
- "grad_norm": 0.16515938937664032,
343
- "learning_rate": 1.7149035075615794e-05,
344
- "loss": 0.5755,
345
- "mean_token_accuracy": 0.8074880324304103,
346
  "step": 205
347
  },
348
  {
349
- "epoch": 0.8472012102874432,
350
- "grad_norm": 0.1514715701341629,
351
- "learning_rate": 1.339745962155613e-05,
352
- "loss": 0.5487,
353
- "mean_token_accuracy": 0.8160267353057862,
354
  "step": 210
355
  },
356
  {
357
- "epoch": 0.8673726676752396,
358
- "grad_norm": 0.16116391122341156,
359
- "learning_rate": 1.0079278510416313e-05,
360
- "loss": 0.5584,
361
- "mean_token_accuracy": 0.8127884238958358,
362
  "step": 215
363
  },
364
  {
365
- "epoch": 0.8875441250630358,
366
- "grad_norm": 0.14942748844623566,
367
- "learning_rate": 7.211097270349066e-06,
368
- "loss": 0.5703,
369
- "mean_token_accuracy": 0.810148586332798,
370
  "step": 220
371
  },
372
  {
373
- "epoch": 0.9077155824508321,
374
- "grad_norm": 0.13097812235355377,
375
- "learning_rate": 4.807269447087348e-06,
376
- "loss": 0.5695,
377
- "mean_token_accuracy": 0.8091624893248082,
378
  "step": 225
379
  },
380
  {
381
- "epoch": 0.9278870398386283,
382
- "grad_norm": 0.146128311753273,
383
- "learning_rate": 2.8798247729623806e-06,
384
- "loss": 0.562,
385
- "mean_token_accuracy": 0.81240995824337,
386
  "step": 230
387
  },
388
  {
389
- "epoch": 0.9480584972264247,
390
- "grad_norm": 0.12836919724941254,
391
- "learning_rate": 1.4384089652291543e-06,
392
- "loss": 0.5572,
393
- "mean_token_accuracy": 0.8139100790023803,
394
  "step": 235
395
  },
396
  {
397
- "epoch": 0.9682299546142209,
398
- "grad_norm": 0.13767513632774353,
399
- "learning_rate": 4.902354549733978e-07,
400
- "loss": 0.5612,
401
- "mean_token_accuracy": 0.812609875202179,
402
  "step": 240
403
  },
404
  {
405
- "epoch": 0.9884014120020171,
406
- "grad_norm": 0.13750818371772766,
407
- "learning_rate": 4.0049288167842705e-08,
408
- "loss": 0.5753,
409
- "mean_token_accuracy": 0.8083017885684967,
410
  "step": 245
411
  },
412
  {
413
- "epoch": 0.9964699949571356,
414
- "mean_token_accuracy": 0.8086390513926744,
415
- "step": 247,
416
- "total_flos": 7.11765772950569e+16,
417
- "train_loss": 0.6465841082908846,
418
- "train_runtime": 5579.7567,
419
- "train_samples_per_second": 0.711,
420
- "train_steps_per_second": 0.044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  }
422
  ],
423
  "logging_steps": 5,
424
- "max_steps": 247,
425
  "num_input_tokens_seen": 0,
426
  "num_train_epochs": 1,
427
  "save_steps": 100,
@@ -437,7 +661,7 @@
437
  "attributes": {}
438
  }
439
  },
440
- "total_flos": 7.11765772950569e+16,
441
  "train_batch_size": 1,
442
  "trial_name": null,
443
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9978994991113266,
6
  "eval_steps": 500,
7
+ "global_step": 386,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.0025852318629827112,
14
+ "grad_norm": 0.19946137070655823,
15
+ "learning_rate": 5.128205128205128e-06,
16
+ "loss": 0.4855,
17
+ "mean_token_accuracy": 0.8692543655633926,
18
  "step": 1
19
  },
20
  {
21
+ "epoch": 0.012926159314913557,
22
+ "grad_norm": 0.18048785626888275,
23
+ "learning_rate": 2.564102564102564e-05,
24
+ "loss": 0.5257,
25
+ "mean_token_accuracy": 0.8587683830410242,
26
  "step": 5
27
  },
28
  {
29
+ "epoch": 0.025852318629827113,
30
+ "grad_norm": 0.18388661742210388,
31
+ "learning_rate": 5.128205128205128e-05,
32
+ "loss": 0.5986,
33
+ "mean_token_accuracy": 0.838620126247406,
34
  "step": 10
35
  },
36
  {
37
+ "epoch": 0.038778477944740666,
38
+ "grad_norm": 0.13041158020496368,
39
+ "learning_rate": 7.692307692307693e-05,
40
+ "loss": 0.5464,
41
+ "mean_token_accuracy": 0.8496200017631054,
42
  "step": 15
43
  },
44
  {
45
+ "epoch": 0.051704637259654226,
46
+ "grad_norm": 0.10899022966623306,
47
+ "learning_rate": 0.00010256410256410256,
48
+ "loss": 0.4321,
49
+ "mean_token_accuracy": 0.8813652157783508,
50
  "step": 20
51
  },
52
  {
53
+ "epoch": 0.06463079657456779,
54
+ "grad_norm": 0.12952959537506104,
55
+ "learning_rate": 0.00012820512820512823,
56
+ "loss": 0.4429,
57
+ "mean_token_accuracy": 0.8760499283671379,
58
  "step": 25
59
  },
60
  {
61
+ "epoch": 0.07755695588948133,
62
+ "grad_norm": 0.11217343807220459,
63
+ "learning_rate": 0.00015384615384615385,
64
+ "loss": 0.4426,
65
+ "mean_token_accuracy": 0.8728497698903084,
66
  "step": 30
67
  },
68
  {
69
+ "epoch": 0.09048311520439489,
70
+ "grad_norm": 0.11570142209529877,
71
+ "learning_rate": 0.0001794871794871795,
72
+ "loss": 0.4326,
73
+ "mean_token_accuracy": 0.8739698991179466,
74
  "step": 35
75
  },
76
  {
77
+ "epoch": 0.10340927451930845,
78
+ "grad_norm": 0.11375421285629272,
79
+ "learning_rate": 0.00019999590166142655,
80
+ "loss": 0.4398,
81
+ "mean_token_accuracy": 0.8683982357382775,
82
  "step": 40
83
  },
84
  {
85
+ "epoch": 0.11633543383422201,
86
+ "grad_norm": 0.09867997467517853,
87
+ "learning_rate": 0.00019985249508066755,
88
+ "loss": 0.3217,
89
+ "mean_token_accuracy": 0.9007599778473377,
90
  "step": 45
91
  },
92
  {
93
+ "epoch": 0.12926159314913557,
94
+ "grad_norm": 0.10740496963262558,
95
+ "learning_rate": 0.00019950450737506824,
96
+ "loss": 0.3468,
97
+ "mean_token_accuracy": 0.8914886720478534,
98
  "step": 50
99
  },
100
  {
101
+ "epoch": 0.14218775246404913,
102
+ "grad_norm": 0.10870860517024994,
103
+ "learning_rate": 0.00019895265151345518,
104
+ "loss": 0.297,
105
+ "mean_token_accuracy": 0.9054083719849586,
106
  "step": 55
107
  },
108
  {
109
+ "epoch": 0.15511391177896267,
110
+ "grad_norm": 0.1147221028804779,
111
+ "learning_rate": 0.00019819805815653768,
112
+ "loss": 0.2912,
113
+ "mean_token_accuracy": 0.9063926823437214,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 0.16804007109387623,
118
+ "grad_norm": 0.1086113452911377,
119
+ "learning_rate": 0.00019724227334037256,
120
+ "loss": 0.2709,
121
+ "mean_token_accuracy": 0.9122237786650658,
122
  "step": 65
123
  },
124
  {
125
+ "epoch": 0.18096623040878979,
126
+ "grad_norm": 0.11263403296470642,
127
+ "learning_rate": 0.00019608725530879375,
128
+ "loss": 0.2775,
129
+ "mean_token_accuracy": 0.9088832184672355,
130
  "step": 70
131
  },
132
  {
133
+ "epoch": 0.19389238972370335,
134
+ "grad_norm": 0.11787361651659012,
135
+ "learning_rate": 0.00019473537050129704,
136
+ "loss": 0.2758,
137
+ "mean_token_accuracy": 0.909870583564043,
138
  "step": 75
139
  },
140
  {
141
+ "epoch": 0.2068185490386169,
142
+ "grad_norm": 0.0966123640537262,
143
+ "learning_rate": 0.00019318938870459984,
144
+ "loss": 0.2113,
145
+ "mean_token_accuracy": 0.9297552116215229,
146
  "step": 80
147
  },
148
  {
149
+ "epoch": 0.21974470835353047,
150
+ "grad_norm": 0.09169968962669373,
151
+ "learning_rate": 0.00019145247737780961,
152
+ "loss": 0.2097,
153
+ "mean_token_accuracy": 0.9294927291572094,
154
  "step": 85
155
  },
156
  {
157
+ "epoch": 0.23267086766844403,
158
+ "grad_norm": 0.10205108672380447,
159
+ "learning_rate": 0.0001895281951628281,
160
+ "loss": 0.2354,
161
+ "mean_token_accuracy": 0.9218288294970989,
162
  "step": 90
163
  },
164
  {
165
+ "epoch": 0.24559702698335756,
166
+ "grad_norm": 0.11714070290327072,
167
+ "learning_rate": 0.00018742048459328682,
168
+ "loss": 0.2286,
169
+ "mean_token_accuracy": 0.9224011048674583,
170
  "step": 95
171
  },
172
  {
173
+ "epoch": 0.25852318629827115,
174
+ "grad_norm": 0.08743036538362503,
175
+ "learning_rate": 0.00018513366401695276,
176
+ "loss": 0.2524,
177
+ "mean_token_accuracy": 0.9145891763269901,
178
  "step": 100
179
  },
180
  {
181
+ "epoch": 0.2714493456131847,
182
+ "grad_norm": 0.12086658179759979,
183
+ "learning_rate": 0.00018267241874815314,
184
+ "loss": 0.2355,
185
+ "mean_token_accuracy": 0.9212763957679272,
186
  "step": 105
187
  },
188
  {
189
+ "epoch": 0.28437550492809827,
190
+ "grad_norm": 0.10271570086479187,
191
+ "learning_rate": 0.0001800417914683471,
192
+ "loss": 0.2033,
193
+ "mean_token_accuracy": 0.9312141306698323,
194
  "step": 110
195
  },
196
  {
197
+ "epoch": 0.2973016642430118,
198
+ "grad_norm": 0.11228019744157791,
199
+ "learning_rate": 0.0001772471718945119,
200
+ "loss": 0.1578,
201
+ "mean_token_accuracy": 0.946045958250761,
202
  "step": 115
203
  },
204
  {
205
+ "epoch": 0.31022782355792533,
206
+ "grad_norm": 0.1075127124786377,
207
+ "learning_rate": 0.00017429428573651024,
208
+ "loss": 0.2109,
209
+ "mean_token_accuracy": 0.928734278678894,
210
  "step": 120
211
  },
212
  {
213
+ "epoch": 0.3231539828728389,
214
+ "grad_norm": 0.16245290637016296,
215
+ "learning_rate": 0.00017118918296606537,
216
+ "loss": 0.2544,
217
+ "mean_token_accuracy": 0.9145586542785168,
218
  "step": 125
219
  },
220
  {
221
+ "epoch": 0.33608014218775245,
222
+ "grad_norm": 0.0984274297952652,
223
+ "learning_rate": 0.0001679382254213768,
224
+ "loss": 0.2398,
225
+ "mean_token_accuracy": 0.9193581290543079,
226
  "step": 130
227
  },
228
  {
229
+ "epoch": 0.34900630150266604,
230
+ "grad_norm": 0.14143706858158112,
231
+ "learning_rate": 0.00016454807377277398,
232
+ "loss": 0.2187,
233
+ "mean_token_accuracy": 0.9259690448641777,
234
  "step": 135
235
  },
236
  {
237
+ "epoch": 0.36193246081757957,
238
+ "grad_norm": 0.10752697288990021,
239
+ "learning_rate": 0.0001610256738761125,
240
+ "loss": 0.2205,
241
+ "mean_token_accuracy": 0.9248595975339413,
242
  "step": 140
243
  },
244
  {
245
+ "epoch": 0.37485862013249316,
246
+ "grad_norm": 0.15562467277050018,
247
+ "learning_rate": 0.00015737824254187275,
248
+ "loss": 0.2399,
249
+ "mean_token_accuracy": 0.9187141321599483,
250
  "step": 145
251
  },
252
  {
253
+ "epoch": 0.3877847794474067,
254
+ "grad_norm": 0.10149288177490234,
255
+ "learning_rate": 0.00015361325274911779,
256
+ "loss": 0.1906,
257
+ "mean_token_accuracy": 0.9351025439798832,
258
  "step": 150
259
  },
260
  {
261
+ "epoch": 0.4007109387623202,
262
+ "grad_norm": 0.13106182217597961,
263
+ "learning_rate": 0.00014973841833460457,
264
+ "loss": 0.2366,
265
+ "mean_token_accuracy": 0.9191612683236599,
266
  "step": 155
267
  },
268
  {
269
+ "epoch": 0.4136370980772338,
270
+ "grad_norm": 0.12485964596271515,
271
+ "learning_rate": 0.0001457616781884173,
272
+ "loss": 0.2532,
273
+ "mean_token_accuracy": 0.9145220316946506,
274
  "step": 160
275
  },
276
  {
277
+ "epoch": 0.42656325739214734,
278
+ "grad_norm": 0.12415427714586258,
279
+ "learning_rate": 0.0001416911799885049,
280
+ "loss": 0.1772,
281
+ "mean_token_accuracy": 0.9388307243585586,
282
  "step": 165
283
  },
284
  {
285
+ "epoch": 0.43948941670706093,
286
+ "grad_norm": 0.12156961858272552,
287
+ "learning_rate": 0.0001375352635074461,
288
+ "loss": 0.1974,
289
+ "mean_token_accuracy": 0.9322396464645862,
290
  "step": 170
291
  },
292
  {
293
+ "epoch": 0.45241557602197446,
294
+ "grad_norm": 0.08770665526390076,
295
+ "learning_rate": 0.00013330244352564527,
296
+ "loss": 0.2159,
297
+ "mean_token_accuracy": 0.9263734519481659,
298
  "step": 175
299
  },
300
  {
301
+ "epoch": 0.46534173533688805,
302
+ "grad_norm": 0.11970090866088867,
303
+ "learning_rate": 0.00012900139238596598,
304
+ "loss": 0.1788,
305
+ "mean_token_accuracy": 0.9383759558200836,
306
  "step": 180
307
  },
308
  {
309
+ "epoch": 0.4782678946518016,
310
+ "grad_norm": 0.0777999758720398,
311
+ "learning_rate": 0.00012464092222554552,
312
+ "loss": 0.1921,
313
+ "mean_token_accuracy": 0.9340388782322406,
314
  "step": 185
315
  },
316
  {
317
+ "epoch": 0.4911940539667151,
318
+ "grad_norm": 0.1088053435087204,
319
+ "learning_rate": 0.00012022996692119424,
320
+ "loss": 0.2428,
321
+ "mean_token_accuracy": 0.9170919217169284,
322
  "step": 190
323
  },
324
  {
325
+ "epoch": 0.5041202132816287,
326
+ "grad_norm": 0.12958504259586334,
327
+ "learning_rate": 0.00011577756378537033,
328
+ "loss": 0.1805,
329
+ "mean_token_accuracy": 0.9373964861035347,
330
  "step": 195
331
  },
332
  {
333
+ "epoch": 0.5170463725965423,
334
+ "grad_norm": 0.08942475914955139,
335
+ "learning_rate": 0.00011129283505023274,
336
+ "loss": 0.1757,
337
+ "mean_token_accuracy": 0.9391221977770329,
338
  "step": 200
339
  },
340
  {
341
+ "epoch": 0.5299725319114558,
342
+ "grad_norm": 0.12055882811546326,
343
+ "learning_rate": 0.00010678496917770719,
344
+ "loss": 0.2274,
345
+ "mean_token_accuracy": 0.9221370957791806,
346
  "step": 205
347
  },
348
  {
349
+ "epoch": 0.5428986912263694,
350
+ "grad_norm": 0.09946688264608383,
351
+ "learning_rate": 0.00010226320203385878,
352
+ "loss": 0.2235,
353
+ "mean_token_accuracy": 0.9239851593971252,
354
  "step": 210
355
  },
356
  {
357
+ "epoch": 0.5558248505412829,
358
+ "grad_norm": 0.08823594450950623,
359
+ "learning_rate": 9.773679796614124e-05,
360
+ "loss": 0.1757,
361
+ "mean_token_accuracy": 0.9392839625477791,
362
  "step": 215
363
  },
364
  {
365
+ "epoch": 0.5687510098561965,
366
+ "grad_norm": 0.09366550296545029,
367
+ "learning_rate": 9.321503082229282e-05,
368
+ "loss": 0.215,
369
+ "mean_token_accuracy": 0.9264558620750905,
370
  "step": 220
371
  },
372
  {
373
+ "epoch": 0.58167716917111,
374
+ "grad_norm": 0.1250247210264206,
375
+ "learning_rate": 8.87071649497673e-05,
376
+ "loss": 0.203,
377
+ "mean_token_accuracy": 0.9299200311303139,
378
  "step": 225
379
  },
380
  {
381
+ "epoch": 0.5946033284860236,
382
+ "grad_norm": 0.07920438051223755,
383
+ "learning_rate": 8.422243621462969e-05,
384
+ "loss": 0.1777,
385
+ "mean_token_accuracy": 0.93870100826025,
386
  "step": 230
387
  },
388
  {
389
+ "epoch": 0.6075294878009372,
390
+ "grad_norm": 0.09220347553491592,
391
+ "learning_rate": 7.97700330788058e-05,
392
+ "loss": 0.2382,
393
+ "mean_token_accuracy": 0.9188789471983909,
394
  "step": 235
395
  },
396
  {
397
+ "epoch": 0.6204556471158507,
398
+ "grad_norm": 0.09347136318683624,
399
+ "learning_rate": 7.535907777445449e-05,
400
+ "loss": 0.1768,
401
+ "mean_token_accuracy": 0.9390825219452381,
402
  "step": 240
403
  },
404
  {
405
+ "epoch": 0.6333818064307642,
406
+ "grad_norm": 0.06848734617233276,
407
+ "learning_rate": 7.099860761403403e-05,
408
+ "loss": 0.1909,
409
+ "mean_token_accuracy": 0.9346477761864662,
410
  "step": 245
411
  },
412
  {
413
+ "epoch": 0.6463079657456778,
414
+ "grad_norm": 0.08231879770755768,
415
+ "learning_rate": 6.669755647435474e-05,
416
+ "loss": 0.2094,
417
+ "mean_token_accuracy": 0.9281139463186264,
418
+ "step": 250
419
+ },
420
+ {
421
+ "epoch": 0.6592341250605914,
422
+ "grad_norm": 0.14479972422122955,
423
+ "learning_rate": 6.24647364925539e-05,
424
+ "loss": 0.1897,
425
+ "mean_token_accuracy": 0.9346370972692967,
426
+ "step": 255
427
+ },
428
+ {
429
+ "epoch": 0.6721602843755049,
430
+ "grad_norm": 0.10493209958076477,
431
+ "learning_rate": 5.830882001149517e-05,
432
+ "loss": 0.1977,
433
+ "mean_token_accuracy": 0.9318774163722991,
434
+ "step": 260
435
+ },
436
+ {
437
+ "epoch": 0.6850864436904185,
438
+ "grad_norm": 0.08989942073822021,
439
+ "learning_rate": 5.423832181158274e-05,
440
+ "loss": 0.1786,
441
+ "mean_token_accuracy": 0.9378982990980148,
442
+ "step": 265
443
+ },
444
+ {
445
+ "epoch": 0.6980126030053321,
446
+ "grad_norm": 0.0723404809832573,
447
+ "learning_rate": 5.0261581665395475e-05,
448
+ "loss": 0.1854,
449
+ "mean_token_accuracy": 0.9353665545582771,
450
+ "step": 270
451
+ },
452
+ {
453
+ "epoch": 0.7109387623202456,
454
+ "grad_norm": 0.08017778396606445,
455
+ "learning_rate": 4.6386747250882224e-05,
456
+ "loss": 0.2185,
457
+ "mean_token_accuracy": 0.9242644280195236,
458
+ "step": 275
459
+ },
460
+ {
461
+ "epoch": 0.7238649216351591,
462
+ "grad_norm": 0.09016852080821991,
463
+ "learning_rate": 4.2621757458127285e-05,
464
+ "loss": 0.1912,
465
+ "mean_token_accuracy": 0.9338557474315167,
466
+ "step": 280
467
+ },
468
+ {
469
+ "epoch": 0.7367910809500727,
470
+ "grad_norm": 0.0839182510972023,
471
+ "learning_rate": 3.8974326123887515e-05,
472
+ "loss": 0.1764,
473
+ "mean_token_accuracy": 0.9384186826646328,
474
+ "step": 285
475
+ },
476
+ {
477
+ "epoch": 0.7497172402649863,
478
+ "grad_norm": 0.08571181446313858,
479
+ "learning_rate": 3.5451926227225997e-05,
480
+ "loss": 0.1984,
481
+ "mean_token_accuracy": 0.931569704413414,
482
+ "step": 290
483
+ },
484
+ {
485
+ "epoch": 0.7626433995798998,
486
+ "grad_norm": 0.09723369032144547,
487
+ "learning_rate": 3.20617745786232e-05,
488
+ "loss": 0.2095,
489
+ "mean_token_accuracy": 0.9279029227793216,
490
+ "step": 295
491
+ },
492
+ {
493
+ "epoch": 0.7755695588948134,
494
+ "grad_norm": 0.08322826772928238,
495
+ "learning_rate": 2.8810817033934656e-05,
496
+ "loss": 0.214,
497
+ "mean_token_accuracy": 0.926049928367138,
498
+ "step": 300
499
+ },
500
+ {
501
+ "epoch": 0.788495718209727,
502
+ "grad_norm": 0.0943559929728508,
503
+ "learning_rate": 2.5705714263489776e-05,
504
+ "loss": 0.2025,
505
+ "mean_token_accuracy": 0.9304434671998024,
506
+ "step": 305
507
+ },
508
+ {
509
+ "epoch": 0.8014218775246404,
510
+ "grad_norm": 0.12426702678203583,
511
+ "learning_rate": 2.275282810548811e-05,
512
+ "loss": 0.2255,
513
+ "mean_token_accuracy": 0.92267579510808,
514
+ "step": 310
515
+ },
516
+ {
517
+ "epoch": 0.814348036839554,
518
+ "grad_norm": 0.07339806854724884,
519
+ "learning_rate": 1.9958208531652877e-05,
520
+ "loss": 0.2119,
521
+ "mean_token_accuracy": 0.9270739153027534,
522
+ "step": 315
523
+ },
524
+ {
525
+ "epoch": 0.8272741961544676,
526
+ "grad_norm": 0.07293356955051422,
527
+ "learning_rate": 1.73275812518469e-05,
528
+ "loss": 0.1728,
529
+ "mean_token_accuracy": 0.9397997766733169,
530
+ "step": 320
531
+ },
532
+ {
533
+ "epoch": 0.8402003554693812,
534
+ "grad_norm": 0.06353917717933655,
535
+ "learning_rate": 1.4866335983047264e-05,
536
+ "loss": 0.2043,
537
+ "mean_token_accuracy": 0.9285617105662822,
538
+ "step": 325
539
+ },
540
+ {
541
+ "epoch": 0.8531265147842947,
542
+ "grad_norm": 0.0852632075548172,
543
+ "learning_rate": 1.2579515406713193e-05,
544
+ "loss": 0.1894,
545
+ "mean_token_accuracy": 0.9348805241286755,
546
+ "step": 330
547
+ },
548
+ {
549
+ "epoch": 0.8660526740992083,
550
+ "grad_norm": 0.10367190837860107,
551
+ "learning_rate": 1.0471804837171916e-05,
552
+ "loss": 0.2155,
553
+ "mean_token_accuracy": 0.9256119452416897,
554
+ "step": 335
555
+ },
556
+ {
557
+ "epoch": 0.8789788334141219,
558
+ "grad_norm": 0.07647697627544403,
559
+ "learning_rate": 8.547522622190385e-06,
560
+ "loss": 0.191,
561
+ "mean_token_accuracy": 0.9336939886212349,
562
+ "step": 340
563
+ },
564
+ {
565
+ "epoch": 0.8919049927290353,
566
+ "grad_norm": 0.09329604357481003,
567
+ "learning_rate": 6.810611295400171e-06,
568
+ "loss": 0.162,
569
+ "mean_token_accuracy": 0.9432654656469822,
570
+ "step": 345
571
+ },
572
+ {
573
+ "epoch": 0.9048311520439489,
574
+ "grad_norm": 0.10007894039154053,
575
+ "learning_rate": 5.264629498702967e-06,
576
+ "loss": 0.2196,
577
+ "mean_token_accuracy": 0.9224743604660034,
578
+ "step": 350
579
+ },
580
+ {
581
+ "epoch": 0.9177573113588625,
582
+ "grad_norm": 0.09568954259157181,
583
+ "learning_rate": 3.91274469120626e-06,
584
+ "loss": 0.2326,
585
+ "mean_token_accuracy": 0.9203760787844658,
586
+ "step": 355
587
+ },
588
+ {
589
+ "epoch": 0.9306834706737761,
590
+ "grad_norm": 0.09242628514766693,
591
+ "learning_rate": 2.7577266596274576e-06,
592
+ "loss": 0.217,
593
+ "mean_token_accuracy": 0.9255997397005558,
594
+ "step": 360
595
+ },
596
+ {
597
+ "epoch": 0.9436096299886896,
598
+ "grad_norm": 0.08556357026100159,
599
+ "learning_rate": 1.8019418434623404e-06,
600
+ "loss": 0.1704,
601
+ "mean_token_accuracy": 0.9403167776763439,
602
+ "step": 365
603
+ },
604
+ {
605
+ "epoch": 0.9565357893036032,
606
+ "grad_norm": 0.08201264590024948,
607
+ "learning_rate": 1.0473484865448525e-06,
608
+ "loss": 0.1877,
609
+ "mean_token_accuracy": 0.9344814352691173,
610
+ "step": 370
611
+ },
612
+ {
613
+ "epoch": 0.9694619486185168,
614
+ "grad_norm": 0.08764708787202835,
615
+ "learning_rate": 4.954926249317815e-07,
616
+ "loss": 0.1614,
617
+ "mean_token_accuracy": 0.943453174829483,
618
+ "step": 375
619
+ },
620
+ {
621
+ "epoch": 0.9823881079334302,
622
+ "grad_norm": 0.09879707545042038,
623
+ "learning_rate": 1.4750491933247512e-07,
624
+ "loss": 0.1984,
625
+ "mean_token_accuracy": 0.9314216762781143,
626
+ "step": 380
627
+ },
628
+ {
629
+ "epoch": 0.9953142672483438,
630
+ "grad_norm": 0.0833079144358635,
631
+ "learning_rate": 4.0983385734660875e-09,
632
+ "loss": 0.1962,
633
+ "mean_token_accuracy": 0.9319176472723484,
634
+ "step": 385
635
+ },
636
+ {
637
+ "epoch": 0.9978994991113266,
638
+ "mean_token_accuracy": 0.9182563126087189,
639
+ "step": 386,
640
+ "total_flos": 1.112161775475753e+17,
641
+ "train_loss": 0.24216800009339584,
642
+ "train_runtime": 6297.1418,
643
+ "train_samples_per_second": 0.983,
644
+ "train_steps_per_second": 0.061
645
  }
646
  ],
647
  "logging_steps": 5,
648
+ "max_steps": 386,
649
  "num_input_tokens_seen": 0,
650
  "num_train_epochs": 1,
651
  "save_steps": 100,
 
661
  "attributes": {}
662
  }
663
  },
664
+ "total_flos": 1.112161775475753e+17,
665
  "train_batch_size": 1,
666
  "trial_name": null,
667
  "trial_params": null