File size: 12,837 Bytes
4d86664
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9978976874562018,
  "eval_steps": 500,
  "global_step": 267,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01868722261153936,
      "grad_norm": 3.5830482905634797,
      "learning_rate": 1.785714285714286e-05,
      "loss": 0.8152,
      "mean_token_accuracy": 0.7747266553342342,
      "step": 5
    },
    {
      "epoch": 0.03737444522307872,
      "grad_norm": 1.6625011834876648,
      "learning_rate": 3.571428571428572e-05,
      "loss": 0.7025,
      "mean_token_accuracy": 0.7917468748986721,
      "step": 10
    },
    {
      "epoch": 0.05606166783461808,
      "grad_norm": 1.5899441749058187,
      "learning_rate": 4.9998265374824964e-05,
      "loss": 0.629,
      "mean_token_accuracy": 0.8071001760661602,
      "step": 15
    },
    {
      "epoch": 0.07474889044615744,
      "grad_norm": 1.5979853248536116,
      "learning_rate": 4.993758157237536e-05,
      "loss": 0.5972,
      "mean_token_accuracy": 0.8143526442348957,
      "step": 20
    },
    {
      "epoch": 0.09343611305769679,
      "grad_norm": 1.2674965103641067,
      "learning_rate": 4.979043378581744e-05,
      "loss": 0.5727,
      "mean_token_accuracy": 0.8199357651174068,
      "step": 25
    },
    {
      "epoch": 0.11212333566923616,
      "grad_norm": 1.3685254142570102,
      "learning_rate": 4.9557389054153965e-05,
      "loss": 0.5555,
      "mean_token_accuracy": 0.8241050921380519,
      "step": 30
    },
    {
      "epoch": 0.1308105582807755,
      "grad_norm": 0.9457691254500147,
      "learning_rate": 4.923934542318854e-05,
      "loss": 0.5409,
      "mean_token_accuracy": 0.8273717932403087,
      "step": 35
    },
    {
      "epoch": 0.14949778089231489,
      "grad_norm": 0.9189569415750541,
      "learning_rate": 4.883752848487571e-05,
      "loss": 0.5272,
      "mean_token_accuracy": 0.8308477029204369,
      "step": 40
    },
    {
      "epoch": 0.16818500350385424,
      "grad_norm": 0.7752642460093544,
      "learning_rate": 4.835348665446049e-05,
      "loss": 0.5213,
      "mean_token_accuracy": 0.8322811633348465,
      "step": 45
    },
    {
      "epoch": 0.18687222611539359,
      "grad_norm": 0.8083941553455664,
      "learning_rate": 4.7789085203607664e-05,
      "loss": 0.5118,
      "mean_token_accuracy": 0.8346851594746113,
      "step": 50
    },
    {
      "epoch": 0.20555944872693296,
      "grad_norm": 0.9161612194082712,
      "learning_rate": 4.714649907251388e-05,
      "loss": 0.5088,
      "mean_token_accuracy": 0.8352511122822761,
      "step": 55
    },
    {
      "epoch": 0.22424667133847231,
      "grad_norm": 0.6723034421711911,
      "learning_rate": 4.6428204488701576e-05,
      "loss": 0.5018,
      "mean_token_accuracy": 0.837028643488884,
      "step": 60
    },
    {
      "epoch": 0.2429338939500117,
      "grad_norm": 0.7871127694104725,
      "learning_rate": 4.563696942479205e-05,
      "loss": 0.5061,
      "mean_token_accuracy": 0.8351837247610092,
      "step": 65
    },
    {
      "epoch": 0.261621116561551,
      "grad_norm": 0.761272463434446,
      "learning_rate": 4.477584293202868e-05,
      "loss": 0.4939,
      "mean_token_accuracy": 0.8390413090586663,
      "step": 70
    },
    {
      "epoch": 0.2803083391730904,
      "grad_norm": 0.5083377211370889,
      "learning_rate": 4.384814339065424e-05,
      "loss": 0.4914,
      "mean_token_accuracy": 0.8395063504576683,
      "step": 75
    },
    {
      "epoch": 0.29899556178462977,
      "grad_norm": 0.908648503032802,
      "learning_rate": 4.285744572241972e-05,
      "loss": 0.4972,
      "mean_token_accuracy": 0.8376469679176808,
      "step": 80
    },
    {
      "epoch": 0.3176827843961691,
      "grad_norm": 0.5938541774391267,
      "learning_rate": 4.180756761450171e-05,
      "loss": 0.4816,
      "mean_token_accuracy": 0.8424232035875321,
      "step": 85
    },
    {
      "epoch": 0.33637000700770847,
      "grad_norm": 0.4849394515949123,
      "learning_rate": 4.070255480791492e-05,
      "loss": 0.491,
      "mean_token_accuracy": 0.8394017495214939,
      "step": 90
    },
    {
      "epoch": 0.35505722961924785,
      "grad_norm": 0.5718927262308711,
      "learning_rate": 3.954666550711159e-05,
      "loss": 0.4851,
      "mean_token_accuracy": 0.8409382797777653,
      "step": 95
    },
    {
      "epoch": 0.37374445223078717,
      "grad_norm": 0.5859959003534665,
      "learning_rate": 3.8344353970845606e-05,
      "loss": 0.4862,
      "mean_token_accuracy": 0.8411718301475049,
      "step": 100
    },
    {
      "epoch": 0.39243167484232655,
      "grad_norm": 0.3782337735273932,
      "learning_rate": 3.710025334753495e-05,
      "loss": 0.4834,
      "mean_token_accuracy": 0.8412449143826961,
      "step": 105
    },
    {
      "epoch": 0.41111889745386593,
      "grad_norm": 0.41111283052381054,
      "learning_rate": 3.581915782126652e-05,
      "loss": 0.476,
      "mean_token_accuracy": 0.8432458408176899,
      "step": 110
    },
    {
      "epoch": 0.4298061200654053,
      "grad_norm": 0.3851706170856147,
      "learning_rate": 3.4506004137244676e-05,
      "loss": 0.4851,
      "mean_token_accuracy": 0.8405788190662861,
      "step": 115
    },
    {
      "epoch": 0.44849334267694463,
      "grad_norm": 0.5062426249148336,
      "learning_rate": 3.3165852577875546e-05,
      "loss": 0.4785,
      "mean_token_accuracy": 0.8426314078271389,
      "step": 120
    },
    {
      "epoch": 0.467180565288484,
      "grad_norm": 0.40499027788768055,
      "learning_rate": 3.180386746279663e-05,
      "loss": 0.4747,
      "mean_token_accuracy": 0.843725374341011,
      "step": 125
    },
    {
      "epoch": 0.4858677879000234,
      "grad_norm": 0.47044740848341193,
      "learning_rate": 3.04252972479953e-05,
      "loss": 0.472,
      "mean_token_accuracy": 0.8443724811077118,
      "step": 130
    },
    {
      "epoch": 0.5045550105115627,
      "grad_norm": 0.4272272972180237,
      "learning_rate": 2.90354543007051e-05,
      "loss": 0.4725,
      "mean_token_accuracy": 0.8439367160201072,
      "step": 135
    },
    {
      "epoch": 0.523242233123102,
      "grad_norm": 0.4584455682048115,
      "learning_rate": 2.7639694428017792e-05,
      "loss": 0.4777,
      "mean_token_accuracy": 0.842538620531559,
      "step": 140
    },
    {
      "epoch": 0.5419294557346415,
      "grad_norm": 0.4589522708203329,
      "learning_rate": 2.6243396238098518e-05,
      "loss": 0.4693,
      "mean_token_accuracy": 0.8448904320597649,
      "step": 145
    },
    {
      "epoch": 0.5606166783461808,
      "grad_norm": 0.5239037802900407,
      "learning_rate": 2.4851940413536174e-05,
      "loss": 0.4697,
      "mean_token_accuracy": 0.8447436839342117,
      "step": 150
    },
    {
      "epoch": 0.5793039009577201,
      "grad_norm": 0.4866344062046232,
      "learning_rate": 2.347068897669999e-05,
      "loss": 0.4687,
      "mean_token_accuracy": 0.8448469452559948,
      "step": 155
    },
    {
      "epoch": 0.5979911235692595,
      "grad_norm": 0.3163374085877721,
      "learning_rate": 2.2104964627003848e-05,
      "loss": 0.4629,
      "mean_token_accuracy": 0.846843034029007,
      "step": 160
    },
    {
      "epoch": 0.6166783461807989,
      "grad_norm": 0.2747336073106856,
      "learning_rate": 2.0760030229702972e-05,
      "loss": 0.4612,
      "mean_token_accuracy": 0.8469885870814323,
      "step": 165
    },
    {
      "epoch": 0.6353655687923382,
      "grad_norm": 0.24722982457005138,
      "learning_rate": 1.9441068535263564e-05,
      "loss": 0.4596,
      "mean_token_accuracy": 0.8476050347089767,
      "step": 170
    },
    {
      "epoch": 0.6540527914038776,
      "grad_norm": 0.23852151209010736,
      "learning_rate": 1.815316220745756e-05,
      "loss": 0.4636,
      "mean_token_accuracy": 0.8460029393434525,
      "step": 175
    },
    {
      "epoch": 0.6727400140154169,
      "grad_norm": 0.2597676847867842,
      "learning_rate": 1.6901274237144782e-05,
      "loss": 0.4669,
      "mean_token_accuracy": 0.8451244607567787,
      "step": 180
    },
    {
      "epoch": 0.6914272366269563,
      "grad_norm": 0.8605153163863832,
      "learning_rate": 1.5690228817218815e-05,
      "loss": 0.4668,
      "mean_token_accuracy": 0.8468827910721302,
      "step": 185
    },
    {
      "epoch": 0.7101144592384957,
      "grad_norm": 0.24915841679008277,
      "learning_rate": 1.4524692752415493e-05,
      "loss": 0.4591,
      "mean_token_accuracy": 0.8473225735127926,
      "step": 190
    },
    {
      "epoch": 0.728801681850035,
      "grad_norm": 0.29798920762515824,
      "learning_rate": 1.3409157475622094e-05,
      "loss": 0.4576,
      "mean_token_accuracy": 0.847739252448082,
      "step": 195
    },
    {
      "epoch": 0.7474889044615743,
      "grad_norm": 0.26110601129292016,
      "learning_rate": 1.2347921739987815e-05,
      "loss": 0.4611,
      "mean_token_accuracy": 0.8468295410275459,
      "step": 200
    },
    {
      "epoch": 0.7661761270731138,
      "grad_norm": 0.2785413649007615,
      "learning_rate": 1.1345075053532287e-05,
      "loss": 0.4615,
      "mean_token_accuracy": 0.846584790199995,
      "step": 205
    },
    {
      "epoch": 0.7848633496846531,
      "grad_norm": 0.281588896295376,
      "learning_rate": 1.0404481920087206e-05,
      "loss": 0.4532,
      "mean_token_accuracy": 0.8491566374897956,
      "step": 210
    },
    {
      "epoch": 0.8035505722961925,
      "grad_norm": 0.26784943568942043,
      "learning_rate": 9.529766947299371e-06,
      "loss": 0.4555,
      "mean_token_accuracy": 0.8485622465610504,
      "step": 215
    },
    {
      "epoch": 0.8222377949077319,
      "grad_norm": 0.22085700610769077,
      "learning_rate": 8.724300879081718e-06,
      "loss": 0.4584,
      "mean_token_accuracy": 0.8476461283862591,
      "step": 220
    },
    {
      "epoch": 0.8409250175192712,
      "grad_norm": 0.1833370072353519,
      "learning_rate": 7.991187606337009e-06,
      "loss": 0.452,
      "mean_token_accuracy": 0.8494263976812363,
      "step": 225
    },
    {
      "epoch": 0.8596122401308106,
      "grad_norm": 0.20272051001866034,
      "learning_rate": 7.333252206008559e-06,
      "loss": 0.4538,
      "mean_token_accuracy": 0.8487676382064819,
      "step": 230
    },
    {
      "epoch": 0.8782994627423499,
      "grad_norm": 0.19595652872474512,
      "learning_rate": 6.753030054550158e-06,
      "loss": 0.4506,
      "mean_token_accuracy": 0.8496683083474637,
      "step": 235
    },
    {
      "epoch": 0.8969866853538893,
      "grad_norm": 0.19319039281954486,
      "learning_rate": 6.25275705776658e-06,
      "loss": 0.4519,
      "mean_token_accuracy": 0.8493411011993885,
      "step": 240
    },
    {
      "epoch": 0.9156739079654287,
      "grad_norm": 0.20035376740680347,
      "learning_rate": 5.834361034674521e-06,
      "loss": 0.4557,
      "mean_token_accuracy": 0.8482660032808781,
      "step": 245
    },
    {
      "epoch": 0.934361130576968,
      "grad_norm": 0.2010469617138832,
      "learning_rate": 5.499454288586379e-06,
      "loss": 0.453,
      "mean_token_accuracy": 0.8490205124020577,
      "step": 250
    },
    {
      "epoch": 0.9530483531885073,
      "grad_norm": 0.2001591137777418,
      "learning_rate": 5.24932739404462e-06,
      "loss": 0.4488,
      "mean_token_accuracy": 0.8502446681261062,
      "step": 255
    },
    {
      "epoch": 0.9717355758000468,
      "grad_norm": 0.18321355893669744,
      "learning_rate": 5.08494422354882e-06,
      "loss": 0.4518,
      "mean_token_accuracy": 0.849391470849514,
      "step": 260
    },
    {
      "epoch": 0.9904227984115861,
      "grad_norm": 0.18951844466307075,
      "learning_rate": 5.006938233240212e-06,
      "loss": 0.4554,
      "mean_token_accuracy": 0.8482832841575145,
      "step": 265
    },
    {
      "epoch": 0.9978976874562018,
      "mean_token_accuracy": 0.8498711809515953,
      "step": 267,
      "total_flos": 2841831180075008.0,
      "train_loss": 0.4941176689519418,
      "train_runtime": 26749.4727,
      "train_samples_per_second": 1.28,
      "train_steps_per_second": 0.01
    }
  ],
  "logging_steps": 5,
  "max_steps": 267,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2841831180075008.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}