sanjaypn14 commited on
Commit
fc4a80f
·
verified ·
1 Parent(s): 83bdc89

Uploading fine-tuned LLaMA-EatFit-2-7b model

Browse files
Files changed (1) hide show
  1. trainer_state.json +1017 -0
trainer_state.json ADDED
@@ -0,0 +1,1017 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3092,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.008085381630012937,
13
+ "grad_norm": 0.3009789288043976,
14
+ "learning_rate": 0.00013440860215053763,
15
+ "loss": 1.7048,
16
+ "mean_token_accuracy": 0.661682380437851,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.016170763260025874,
21
+ "grad_norm": 0.4460005462169647,
22
+ "learning_rate": 0.00026881720430107527,
23
+ "loss": 1.2718,
24
+ "mean_token_accuracy": 0.713244981765747,
25
+ "step": 50
26
+ },
27
+ {
28
+ "epoch": 0.02425614489003881,
29
+ "grad_norm": 0.25601527094841003,
30
+ "learning_rate": 0.0004032258064516129,
31
+ "loss": 0.9995,
32
+ "mean_token_accuracy": 0.7553433787822723,
33
+ "step": 75
34
+ },
35
+ {
36
+ "epoch": 0.03234152652005175,
37
+ "grad_norm": 0.2627596855163574,
38
+ "learning_rate": 0.0004999932787358948,
39
+ "loss": 0.9036,
40
+ "mean_token_accuracy": 0.7764883422851563,
41
+ "step": 100
42
+ },
43
+ {
44
+ "epoch": 0.04042690815006468,
45
+ "grad_norm": 0.4143337905406952,
46
+ "learning_rate": 0.0004998595518201121,
47
+ "loss": 0.8985,
48
+ "mean_token_accuracy": 0.7727250349521637,
49
+ "step": 125
50
+ },
51
+ {
52
+ "epoch": 0.04851228978007762,
53
+ "grad_norm": 0.23922297358512878,
54
+ "learning_rate": 0.00049955446943686,
55
+ "loss": 0.7909,
56
+ "mean_token_accuracy": 0.7968144822120666,
57
+ "step": 150
58
+ },
59
+ {
60
+ "epoch": 0.056597671410090554,
61
+ "grad_norm": 0.2834464907646179,
62
+ "learning_rate": 0.0004990782408138185,
63
+ "loss": 0.822,
64
+ "mean_token_accuracy": 0.7941457486152649,
65
+ "step": 175
66
+ },
67
+ {
68
+ "epoch": 0.0646830530401035,
69
+ "grad_norm": 0.29430675506591797,
70
+ "learning_rate": 0.000498431192551983,
71
+ "loss": 0.8546,
72
+ "mean_token_accuracy": 0.7836022305488587,
73
+ "step": 200
74
+ },
75
+ {
76
+ "epoch": 0.07276843467011643,
77
+ "grad_norm": 0.23746679723262787,
78
+ "learning_rate": 0.0004976137684016788,
79
+ "loss": 0.7788,
80
+ "mean_token_accuracy": 0.8025749707221985,
81
+ "step": 225
82
+ },
83
+ {
84
+ "epoch": 0.08085381630012936,
85
+ "grad_norm": 0.24229322373867035,
86
+ "learning_rate": 0.0004966265289582338,
87
+ "loss": 0.8583,
88
+ "mean_token_accuracy": 0.7800547671318054,
89
+ "step": 250
90
+ },
91
+ {
92
+ "epoch": 0.08893919793014231,
93
+ "grad_norm": 0.24889522790908813,
94
+ "learning_rate": 0.0004954701512775184,
95
+ "loss": 0.685,
96
+ "mean_token_accuracy": 0.8196275687217712,
97
+ "step": 275
98
+ },
99
+ {
100
+ "epoch": 0.09702457956015524,
101
+ "grad_norm": 0.22737586498260498,
102
+ "learning_rate": 0.0004941454284116157,
103
+ "loss": 0.8068,
104
+ "mean_token_accuracy": 0.7950991153717041,
105
+ "step": 300
106
+ },
107
+ {
108
+ "epoch": 0.10510996119016817,
109
+ "grad_norm": 0.2224387228488922,
110
+ "learning_rate": 0.0004926532688649407,
111
+ "loss": 0.8054,
112
+ "mean_token_accuracy": 0.7932506108283996,
113
+ "step": 325
114
+ },
115
+ {
116
+ "epoch": 0.11319534282018111,
117
+ "grad_norm": 0.2293468415737152,
118
+ "learning_rate": 0.0004909946959711816,
119
+ "loss": 0.7816,
120
+ "mean_token_accuracy": 0.7965369141101837,
121
+ "step": 350
122
+ },
123
+ {
124
+ "epoch": 0.12128072445019406,
125
+ "grad_norm": 0.2561907470226288,
126
+ "learning_rate": 0.0004891708471914897,
127
+ "loss": 0.7487,
128
+ "mean_token_accuracy": 0.8107043826580047,
129
+ "step": 375
130
+ },
131
+ {
132
+ "epoch": 0.129366106080207,
133
+ "grad_norm": 0.2583022117614746,
134
+ "learning_rate": 0.0004871829733344012,
135
+ "loss": 0.7786,
136
+ "mean_token_accuracy": 0.7999367988109589,
137
+ "step": 400
138
+ },
139
+ {
140
+ "epoch": 0.13745148771021992,
141
+ "grad_norm": 0.2550680339336395,
142
+ "learning_rate": 0.00048503243769802327,
143
+ "loss": 0.7576,
144
+ "mean_token_accuracy": 0.8000009500980377,
145
+ "step": 425
146
+ },
147
+ {
148
+ "epoch": 0.14553686934023286,
149
+ "grad_norm": 0.24982689321041107,
150
+ "learning_rate": 0.0004827207151350745,
151
+ "loss": 0.7897,
152
+ "mean_token_accuracy": 0.7945795893669129,
153
+ "step": 450
154
+ },
155
+ {
156
+ "epoch": 0.1536222509702458,
157
+ "grad_norm": 0.2784912884235382,
158
+ "learning_rate": 0.0004802493910414205,
159
+ "loss": 0.8013,
160
+ "mean_token_accuracy": 0.792938197851181,
161
+ "step": 475
162
+ },
163
+ {
164
+ "epoch": 0.16170763260025872,
165
+ "grad_norm": 0.2220473736524582,
166
+ "learning_rate": 0.00047762016026879807,
167
+ "loss": 0.711,
168
+ "mean_token_accuracy": 0.8124627935886383,
169
+ "step": 500
170
+ },
171
+ {
172
+ "epoch": 0.16979301423027165,
173
+ "grad_norm": 0.2841341197490692,
174
+ "learning_rate": 0.00047483482596247353,
175
+ "loss": 0.7382,
176
+ "mean_token_accuracy": 0.8129073297977447,
177
+ "step": 525
178
+ },
179
+ {
180
+ "epoch": 0.17787839586028462,
181
+ "grad_norm": 0.23232664167881012,
182
+ "learning_rate": 0.00047189529832463296,
183
+ "loss": 0.7588,
184
+ "mean_token_accuracy": 0.8059421420097351,
185
+ "step": 550
186
+ },
187
+ {
188
+ "epoch": 0.18596377749029755,
189
+ "grad_norm": 0.22178468108177185,
190
+ "learning_rate": 0.00046880359330435216,
191
+ "loss": 0.8146,
192
+ "mean_token_accuracy": 0.7915572142601013,
193
+ "step": 575
194
+ },
195
+ {
196
+ "epoch": 0.19404915912031048,
197
+ "grad_norm": 0.23006677627563477,
198
+ "learning_rate": 0.0004655618312150437,
199
+ "loss": 0.7822,
200
+ "mean_token_accuracy": 0.7942324769496918,
201
+ "step": 600
202
+ },
203
+ {
204
+ "epoch": 0.20213454075032342,
205
+ "grad_norm": 0.32772719860076904,
206
+ "learning_rate": 0.00046217223528033146,
207
+ "loss": 0.7632,
208
+ "mean_token_accuracy": 0.8043809008598327,
209
+ "step": 625
210
+ },
211
+ {
212
+ "epoch": 0.21021992238033635,
213
+ "grad_norm": 0.18335266411304474,
214
+ "learning_rate": 0.0004586371301093476,
215
+ "loss": 0.7218,
216
+ "mean_token_accuracy": 0.816128705739975,
217
+ "step": 650
218
+ },
219
+ {
220
+ "epoch": 0.21830530401034928,
221
+ "grad_norm": 0.22064125537872314,
222
+ "learning_rate": 0.00045495894010249915,
223
+ "loss": 0.7364,
224
+ "mean_token_accuracy": 0.8087958478927613,
225
+ "step": 675
226
+ },
227
+ {
228
+ "epoch": 0.22639068564036222,
229
+ "grad_norm": 0.21018439531326294,
230
+ "learning_rate": 0.0004511401877887967,
231
+ "loss": 0.738,
232
+ "mean_token_accuracy": 0.8069515633583069,
233
+ "step": 700
234
+ },
235
+ {
236
+ "epoch": 0.23447606727037515,
237
+ "grad_norm": 0.21225683391094208,
238
+ "learning_rate": 0.0004471834920958864,
239
+ "loss": 0.7397,
240
+ "mean_token_accuracy": 0.8051351046562195,
241
+ "step": 725
242
+ },
243
+ {
244
+ "epoch": 0.2425614489003881,
245
+ "grad_norm": 0.22577480971813202,
246
+ "learning_rate": 0.00044309156655397003,
247
+ "loss": 0.7872,
248
+ "mean_token_accuracy": 0.7951467871665955,
249
+ "step": 750
250
+ },
251
+ {
252
+ "epoch": 0.25064683053040104,
253
+ "grad_norm": 0.19751884043216705,
254
+ "learning_rate": 0.000438867217434847,
255
+ "loss": 0.7147,
256
+ "mean_token_accuracy": 0.8134795188903808,
257
+ "step": 775
258
+ },
259
+ {
260
+ "epoch": 0.258732212160414,
261
+ "grad_norm": 0.2011658251285553,
262
+ "learning_rate": 0.0004345133418273529,
263
+ "loss": 0.7923,
264
+ "mean_token_accuracy": 0.7959077858924866,
265
+ "step": 800
266
+ },
267
+ {
268
+ "epoch": 0.2668175937904269,
269
+ "grad_norm": 0.2174764722585678,
270
+ "learning_rate": 0.00043003292565051544,
271
+ "loss": 0.7576,
272
+ "mean_token_accuracy": 0.8044932758808137,
273
+ "step": 825
274
+ },
275
+ {
276
+ "epoch": 0.27490297542043984,
277
+ "grad_norm": 0.20990514755249023,
278
+ "learning_rate": 0.0004254290416057898,
279
+ "loss": 0.739,
280
+ "mean_token_accuracy": 0.8073045027256012,
281
+ "step": 850
282
+ },
283
+ {
284
+ "epoch": 0.2829883570504528,
285
+ "grad_norm": 0.1976221799850464,
286
+ "learning_rate": 0.0004207048470697777,
287
+ "loss": 0.6717,
288
+ "mean_token_accuracy": 0.824974125623703,
289
+ "step": 875
290
+ },
291
+ {
292
+ "epoch": 0.2910737386804657,
293
+ "grad_norm": 0.2552309036254883,
294
+ "learning_rate": 0.0004158635819288762,
295
+ "loss": 0.7311,
296
+ "mean_token_accuracy": 0.8078971183300019,
297
+ "step": 900
298
+ },
299
+ {
300
+ "epoch": 0.29915912031047864,
301
+ "grad_norm": 0.26244303584098816,
302
+ "learning_rate": 0.00041090856635734067,
303
+ "loss": 0.7264,
304
+ "mean_token_accuracy": 0.8127052938938141,
305
+ "step": 925
306
+ },
307
+ {
308
+ "epoch": 0.3072445019404916,
309
+ "grad_norm": 0.24333028495311737,
310
+ "learning_rate": 0.000405843198540285,
311
+ "loss": 0.7184,
312
+ "mean_token_accuracy": 0.8114526355266571,
313
+ "step": 950
314
+ },
315
+ {
316
+ "epoch": 0.3153298835705045,
317
+ "grad_norm": 0.19133129715919495,
318
+ "learning_rate": 0.0004006709523431822,
319
+ "loss": 0.7538,
320
+ "mean_token_accuracy": 0.8016650295257568,
321
+ "step": 975
322
+ },
323
+ {
324
+ "epoch": 0.32341526520051744,
325
+ "grad_norm": 0.25047338008880615,
326
+ "learning_rate": 0.00039539537492946285,
327
+ "loss": 0.8019,
328
+ "mean_token_accuracy": 0.7935136258602142,
329
+ "step": 1000
330
+ },
331
+ {
332
+ "epoch": 0.3315006468305304,
333
+ "grad_norm": 0.19724124670028687,
334
+ "learning_rate": 0.0003900200843278449,
335
+ "loss": 0.6892,
336
+ "mean_token_accuracy": 0.8166925406455994,
337
+ "step": 1025
338
+ },
339
+ {
340
+ "epoch": 0.3395860284605433,
341
+ "grad_norm": 0.21111617982387543,
342
+ "learning_rate": 0.0003845487669510631,
343
+ "loss": 0.7281,
344
+ "mean_token_accuracy": 0.8110716784000397,
345
+ "step": 1050
346
+ },
347
+ {
348
+ "epoch": 0.3476714100905563,
349
+ "grad_norm": 0.2487187534570694,
350
+ "learning_rate": 0.00037898517506770196,
351
+ "loss": 0.7962,
352
+ "mean_token_accuracy": 0.7921491277217865,
353
+ "step": 1075
354
+ },
355
+ {
356
+ "epoch": 0.35575679172056923,
357
+ "grad_norm": 0.2757723927497864,
358
+ "learning_rate": 0.0003733331242288622,
359
+ "loss": 0.7533,
360
+ "mean_token_accuracy": 0.8056223785877228,
361
+ "step": 1100
362
+ },
363
+ {
364
+ "epoch": 0.36384217335058217,
365
+ "grad_norm": 0.27630847692489624,
366
+ "learning_rate": 0.0003675964906514289,
367
+ "loss": 0.7885,
368
+ "mean_token_accuracy": 0.7973137283325196,
369
+ "step": 1125
370
+ },
371
+ {
372
+ "epoch": 0.3719275549805951,
373
+ "grad_norm": 0.2365068644285202,
374
+ "learning_rate": 0.00036177920855973405,
375
+ "loss": 0.7275,
376
+ "mean_token_accuracy": 0.8077067303657531,
377
+ "step": 1150
378
+ },
379
+ {
380
+ "epoch": 0.38001293661060803,
381
+ "grad_norm": 0.2511255443096161,
382
+ "learning_rate": 0.00035588526748743754,
383
+ "loss": 0.8052,
384
+ "mean_token_accuracy": 0.7931141972541809,
385
+ "step": 1175
386
+ },
387
+ {
388
+ "epoch": 0.38809831824062097,
389
+ "grad_norm": 0.2489156723022461,
390
+ "learning_rate": 0.0003499187095414763,
391
+ "loss": 0.7369,
392
+ "mean_token_accuracy": 0.8076127851009369,
393
+ "step": 1200
394
+ },
395
+ {
396
+ "epoch": 0.3961836998706339,
397
+ "grad_norm": 0.23067978024482727,
398
+ "learning_rate": 0.00034388362662995855,
399
+ "loss": 0.7393,
400
+ "mean_token_accuracy": 0.8086310243606567,
401
+ "step": 1225
402
+ },
403
+ {
404
+ "epoch": 0.40426908150064683,
405
+ "grad_norm": 0.2202920764684677,
406
+ "learning_rate": 0.000337784157655904,
407
+ "loss": 0.7578,
408
+ "mean_token_accuracy": 0.8064273929595948,
409
+ "step": 1250
410
+ },
411
+ {
412
+ "epoch": 0.41235446313065977,
413
+ "grad_norm": 0.21269731223583221,
414
+ "learning_rate": 0.0003316244856787544,
415
+ "loss": 0.7937,
416
+ "mean_token_accuracy": 0.795601452589035,
417
+ "step": 1275
418
+ },
419
+ {
420
+ "epoch": 0.4204398447606727,
421
+ "grad_norm": 0.24461720883846283,
422
+ "learning_rate": 0.0003254088350456017,
423
+ "loss": 0.6656,
424
+ "mean_token_accuracy": 0.8226878666877746,
425
+ "step": 1300
426
+ },
427
+ {
428
+ "epoch": 0.42852522639068563,
429
+ "grad_norm": 0.24032443761825562,
430
+ "learning_rate": 0.0003191414684941003,
431
+ "loss": 0.7778,
432
+ "mean_token_accuracy": 0.7960509729385375,
433
+ "step": 1325
434
+ },
435
+ {
436
+ "epoch": 0.43661060802069857,
437
+ "grad_norm": 0.25362804532051086,
438
+ "learning_rate": 0.0003128266842290513,
439
+ "loss": 0.6967,
440
+ "mean_token_accuracy": 0.8132575881481171,
441
+ "step": 1350
442
+ },
443
+ {
444
+ "epoch": 0.4446959896507115,
445
+ "grad_norm": 0.2388894110918045,
446
+ "learning_rate": 0.0003064688129746629,
447
+ "loss": 0.716,
448
+ "mean_token_accuracy": 0.8112483811378479,
449
+ "step": 1375
450
+ },
451
+ {
452
+ "epoch": 0.45278137128072443,
453
+ "grad_norm": 0.2121650129556656,
454
+ "learning_rate": 0.0003000722150045085,
455
+ "loss": 0.6942,
456
+ "mean_token_accuracy": 0.8156200110912323,
457
+ "step": 1400
458
+ },
459
+ {
460
+ "epoch": 0.46086675291073737,
461
+ "grad_norm": 0.2317550778388977,
462
+ "learning_rate": 0.0002936412771512206,
463
+ "loss": 0.7493,
464
+ "mean_token_accuracy": 0.8051575112342835,
465
+ "step": 1425
466
+ },
467
+ {
468
+ "epoch": 0.4689521345407503,
469
+ "grad_norm": 0.23476050794124603,
470
+ "learning_rate": 0.0002871804097979687,
471
+ "loss": 0.7136,
472
+ "mean_token_accuracy": 0.8104170382022857,
473
+ "step": 1450
474
+ },
475
+ {
476
+ "epoch": 0.4770375161707633,
477
+ "grad_norm": 0.2009560465812683,
478
+ "learning_rate": 0.00028069404385378736,
479
+ "loss": 0.7117,
480
+ "mean_token_accuracy": 0.8178813803195953,
481
+ "step": 1475
482
+ },
483
+ {
484
+ "epoch": 0.4851228978007762,
485
+ "grad_norm": 0.21633079648017883,
486
+ "learning_rate": 0.0002741866277148276,
487
+ "loss": 0.7392,
488
+ "mean_token_accuracy": 0.8080459308624267,
489
+ "step": 1500
490
+ },
491
+ {
492
+ "epoch": 0.49320827943078915,
493
+ "grad_norm": 0.2863864004611969,
494
+ "learning_rate": 0.00026766262421361407,
495
+ "loss": 0.7429,
496
+ "mean_token_accuracy": 0.8051086151599884,
497
+ "step": 1525
498
+ },
499
+ {
500
+ "epoch": 0.5012936610608021,
501
+ "grad_norm": 0.2746196389198303,
502
+ "learning_rate": 0.0002611265075584034,
503
+ "loss": 0.7378,
504
+ "mean_token_accuracy": 0.8071331679821014,
505
+ "step": 1550
506
+ },
507
+ {
508
+ "epoch": 0.509379042690815,
509
+ "grad_norm": 0.1745605319738388,
510
+ "learning_rate": 0.0002545827602647397,
511
+ "loss": 0.7329,
512
+ "mean_token_accuracy": 0.8071370398998261,
513
+ "step": 1575
514
+ },
515
+ {
516
+ "epoch": 0.517464424320828,
517
+ "grad_norm": 0.2692930996417999,
518
+ "learning_rate": 0.0002480358700813135,
519
+ "loss": 0.6469,
520
+ "mean_token_accuracy": 0.8244569575786591,
521
+ "step": 1600
522
+ },
523
+ {
524
+ "epoch": 0.5255498059508409,
525
+ "grad_norm": 0.2551010549068451,
526
+ "learning_rate": 0.00024149032691223173,
527
+ "loss": 0.7093,
528
+ "mean_token_accuracy": 0.8107299065589905,
529
+ "step": 1625
530
+ },
531
+ {
532
+ "epoch": 0.5336351875808538,
533
+ "grad_norm": 0.19366180896759033,
534
+ "learning_rate": 0.0002349506197378092,
535
+ "loss": 0.7518,
536
+ "mean_token_accuracy": 0.8024619662761688,
537
+ "step": 1650
538
+ },
539
+ {
540
+ "epoch": 0.5417205692108668,
541
+ "grad_norm": 0.22391283512115479,
542
+ "learning_rate": 0.00022842123353599369,
543
+ "loss": 0.6933,
544
+ "mean_token_accuracy": 0.8174584257602692,
545
+ "step": 1675
546
+ },
547
+ {
548
+ "epoch": 0.5498059508408797,
549
+ "grad_norm": 0.24723011255264282,
550
+ "learning_rate": 0.0002219066462065364,
551
+ "loss": 0.669,
552
+ "mean_token_accuracy": 0.822363510131836,
553
+ "step": 1700
554
+ },
555
+ {
556
+ "epoch": 0.5578913324708926,
557
+ "grad_norm": 0.20229819416999817,
558
+ "learning_rate": 0.00021541132550001584,
559
+ "loss": 0.6508,
560
+ "mean_token_accuracy": 0.8255820453166962,
561
+ "step": 1725
562
+ },
563
+ {
564
+ "epoch": 0.5659767141009056,
565
+ "grad_norm": 0.2818906605243683,
566
+ "learning_rate": 0.00020893972595382274,
567
+ "loss": 0.6417,
568
+ "mean_token_accuracy": 0.830688863992691,
569
+ "step": 1750
570
+ },
571
+ {
572
+ "epoch": 0.5740620957309185,
573
+ "grad_norm": 0.24256223440170288,
574
+ "learning_rate": 0.00020249628583720672,
575
+ "loss": 0.7353,
576
+ "mean_token_accuracy": 0.8104202616214752,
577
+ "step": 1775
578
+ },
579
+ {
580
+ "epoch": 0.5821474773609314,
581
+ "grad_norm": 0.24385611712932587,
582
+ "learning_rate": 0.00019608542410747888,
583
+ "loss": 0.6876,
584
+ "mean_token_accuracy": 0.819042581319809,
585
+ "step": 1800
586
+ },
587
+ {
588
+ "epoch": 0.5902328589909444,
589
+ "grad_norm": 0.15913242101669312,
590
+ "learning_rate": 0.00018971153737945968,
591
+ "loss": 0.646,
592
+ "mean_token_accuracy": 0.8284247839450836,
593
+ "step": 1825
594
+ },
595
+ {
596
+ "epoch": 0.5983182406209573,
597
+ "grad_norm": 0.17542122304439545,
598
+ "learning_rate": 0.00018337899691024914,
599
+ "loss": 0.6216,
600
+ "mean_token_accuracy": 0.83616614818573,
601
+ "step": 1850
602
+ },
603
+ {
604
+ "epoch": 0.6064036222509702,
605
+ "grad_norm": 0.22214658558368683,
606
+ "learning_rate": 0.0001770921456013872,
607
+ "loss": 0.6947,
608
+ "mean_token_accuracy": 0.8170740747451782,
609
+ "step": 1875
610
+ },
611
+ {
612
+ "epoch": 0.6144890038809832,
613
+ "grad_norm": 0.22132755815982819,
614
+ "learning_rate": 0.00017085529502046073,
615
+ "loss": 0.6788,
616
+ "mean_token_accuracy": 0.820680763721466,
617
+ "step": 1900
618
+ },
619
+ {
620
+ "epoch": 0.6225743855109961,
621
+ "grad_norm": 0.26897531747817993,
622
+ "learning_rate": 0.00016467272244420029,
623
+ "loss": 0.6833,
624
+ "mean_token_accuracy": 0.822064242362976,
625
+ "step": 1925
626
+ },
627
+ {
628
+ "epoch": 0.630659767141009,
629
+ "grad_norm": 0.25688934326171875,
630
+ "learning_rate": 0.0001585486679250922,
631
+ "loss": 0.6945,
632
+ "mean_token_accuracy": 0.8143122482299805,
633
+ "step": 1950
634
+ },
635
+ {
636
+ "epoch": 0.638745148771022,
637
+ "grad_norm": 0.21207553148269653,
638
+ "learning_rate": 0.0001524873313835208,
639
+ "loss": 0.6596,
640
+ "mean_token_accuracy": 0.8273860597610474,
641
+ "step": 1975
642
+ },
643
+ {
644
+ "epoch": 0.6468305304010349,
645
+ "grad_norm": 0.281393438577652,
646
+ "learning_rate": 0.00014649286972743319,
647
+ "loss": 0.6767,
648
+ "mean_token_accuracy": 0.8178416419029236,
649
+ "step": 2000
650
+ },
651
+ {
652
+ "epoch": 0.6549159120310478,
653
+ "grad_norm": 0.27408191561698914,
654
+ "learning_rate": 0.00014056939400150143,
655
+ "loss": 0.6974,
656
+ "mean_token_accuracy": 0.8189209842681885,
657
+ "step": 2025
658
+ },
659
+ {
660
+ "epoch": 0.6630012936610608,
661
+ "grad_norm": 0.26886942982673645,
662
+ "learning_rate": 0.00013472096656773913,
663
+ "loss": 0.6497,
664
+ "mean_token_accuracy": 0.8288757252693176,
665
+ "step": 2050
666
+ },
667
+ {
668
+ "epoch": 0.6710866752910737,
669
+ "grad_norm": 0.21919454634189606,
670
+ "learning_rate": 0.00012895159831950462,
671
+ "loss": 0.693,
672
+ "mean_token_accuracy": 0.8163833570480347,
673
+ "step": 2075
674
+ },
675
+ {
676
+ "epoch": 0.6791720569210866,
677
+ "grad_norm": 0.21283280849456787,
678
+ "learning_rate": 0.0001232652459308012,
679
+ "loss": 0.7117,
680
+ "mean_token_accuracy": 0.808628898859024,
681
+ "step": 2100
682
+ },
683
+ {
684
+ "epoch": 0.6872574385510997,
685
+ "grad_norm": 0.229765385389328,
686
+ "learning_rate": 0.00011766580914276209,
687
+ "loss": 0.7317,
688
+ "mean_token_accuracy": 0.8035627353191376,
689
+ "step": 2125
690
+ },
691
+ {
692
+ "epoch": 0.6953428201811126,
693
+ "grad_norm": 0.21411098539829254,
694
+ "learning_rate": 0.00011215712808918003,
695
+ "loss": 0.6469,
696
+ "mean_token_accuracy": 0.8277445828914642,
697
+ "step": 2150
698
+ },
699
+ {
700
+ "epoch": 0.7034282018111255,
701
+ "grad_norm": 0.2254790961742401,
702
+ "learning_rate": 0.00010674298066291601,
703
+ "loss": 0.6976,
704
+ "mean_token_accuracy": 0.8171502375602722,
705
+ "step": 2175
706
+ },
707
+ {
708
+ "epoch": 0.7115135834411385,
709
+ "grad_norm": 0.27148380875587463,
710
+ "learning_rate": 0.0001014270799249933,
711
+ "loss": 0.717,
712
+ "mean_token_accuracy": 0.8086051964759826,
713
+ "step": 2200
714
+ },
715
+ {
716
+ "epoch": 0.7195989650711514,
717
+ "grad_norm": 0.2047407031059265,
718
+ "learning_rate": 9.621307155815398e-05,
719
+ "loss": 0.718,
720
+ "mean_token_accuracy": 0.8121638822555542,
721
+ "step": 2225
722
+ },
723
+ {
724
+ "epoch": 0.7276843467011643,
725
+ "grad_norm": 0.22144050896167755,
726
+ "learning_rate": 9.11045313666231e-05,
727
+ "loss": 0.6623,
728
+ "mean_token_accuracy": 0.8254709720611573,
729
+ "step": 2250
730
+ },
731
+ {
732
+ "epoch": 0.7357697283311773,
733
+ "grad_norm": 0.27873218059539795,
734
+ "learning_rate": 8.610496282379687e-05,
735
+ "loss": 0.7073,
736
+ "mean_token_accuracy": 0.8168034076690673,
737
+ "step": 2275
738
+ },
739
+ {
740
+ "epoch": 0.7438551099611902,
741
+ "grad_norm": 0.25058674812316895,
742
+ "learning_rate": 8.121779466953572e-05,
743
+ "loss": 0.7961,
744
+ "mean_token_accuracy": 0.7905523943901062,
745
+ "step": 2300
746
+ },
747
+ {
748
+ "epoch": 0.7519404915912031,
749
+ "grad_norm": 0.2540716826915741,
750
+ "learning_rate": 7.644637855870959e-05,
751
+ "loss": 0.7561,
752
+ "mean_token_accuracy": 0.8036962306499481,
753
+ "step": 2325
754
+ },
755
+ {
756
+ "epoch": 0.7600258732212161,
757
+ "grad_norm": 0.2069474756717682,
758
+ "learning_rate": 7.179398676260923e-05,
759
+ "loss": 0.7163,
760
+ "mean_token_accuracy": 0.8117474913597107,
761
+ "step": 2350
762
+ },
763
+ {
764
+ "epoch": 0.768111254851229,
765
+ "grad_norm": 0.23127734661102295,
766
+ "learning_rate": 6.726380992479941e-05,
767
+ "loss": 0.6983,
768
+ "mean_token_accuracy": 0.8151715826988221,
769
+ "step": 2375
770
+ },
771
+ {
772
+ "epoch": 0.7761966364812419,
773
+ "grad_norm": 0.19075877964496613,
774
+ "learning_rate": 6.285895487295229e-05,
775
+ "loss": 0.6644,
776
+ "mean_token_accuracy": 0.8226857626438141,
777
+ "step": 2400
778
+ },
779
+ {
780
+ "epoch": 0.7842820181112549,
781
+ "grad_norm": 0.26920101046562195,
782
+ "learning_rate": 5.858244248816302e-05,
783
+ "loss": 0.678,
784
+ "mean_token_accuracy": 0.8184169673919678,
785
+ "step": 2425
786
+ },
787
+ {
788
+ "epoch": 0.7923673997412678,
789
+ "grad_norm": 0.22957506775856018,
790
+ "learning_rate": 5.443720563320792e-05,
791
+ "loss": 0.7125,
792
+ "mean_token_accuracy": 0.8165527045726776,
793
+ "step": 2450
794
+ },
795
+ {
796
+ "epoch": 0.8004527813712807,
797
+ "grad_norm": 0.23255349695682526,
798
+ "learning_rate": 5.042608714116612e-05,
799
+ "loss": 0.6648,
800
+ "mean_token_accuracy": 0.8205063927173615,
801
+ "step": 2475
802
+ },
803
+ {
804
+ "epoch": 0.8085381630012937,
805
+ "grad_norm": 0.21010981500148773,
806
+ "learning_rate": 4.655183786578426e-05,
807
+ "loss": 0.6833,
808
+ "mean_token_accuracy": 0.8196286606788635,
809
+ "step": 2500
810
+ },
811
+ {
812
+ "epoch": 0.8166235446313066,
813
+ "grad_norm": 0.21943055093288422,
814
+ "learning_rate": 4.2817114794921677e-05,
815
+ "loss": 0.6897,
816
+ "mean_token_accuracy": 0.8164256310462952,
817
+ "step": 2525
818
+ },
819
+ {
820
+ "epoch": 0.8247089262613195,
821
+ "grad_norm": 0.21263104677200317,
822
+ "learning_rate": 3.92244792283685e-05,
823
+ "loss": 0.6553,
824
+ "mean_token_accuracy": 0.8244921159744263,
825
+ "step": 2550
826
+ },
827
+ {
828
+ "epoch": 0.8327943078913325,
829
+ "grad_norm": 0.30405542254447937,
830
+ "learning_rate": 3.577639502128843e-05,
831
+ "loss": 0.6771,
832
+ "mean_token_accuracy": 0.8220798122882843,
833
+ "step": 2575
834
+ },
835
+ {
836
+ "epoch": 0.8408796895213454,
837
+ "grad_norm": 0.2544702887535095,
838
+ "learning_rate": 3.247522689448923e-05,
839
+ "loss": 0.6825,
840
+ "mean_token_accuracy": 0.8175348448753357,
841
+ "step": 2600
842
+ },
843
+ {
844
+ "epoch": 0.8489650711513583,
845
+ "grad_norm": 0.2168809473514557,
846
+ "learning_rate": 2.9323238812679982e-05,
847
+ "loss": 0.7437,
848
+ "mean_token_accuracy": 0.8062794303894043,
849
+ "step": 2625
850
+ },
851
+ {
852
+ "epoch": 0.8570504527813713,
853
+ "grad_norm": 0.32856041193008423,
854
+ "learning_rate": 2.6322592431828136e-05,
855
+ "loss": 0.6968,
856
+ "mean_token_accuracy": 0.8184567129611969,
857
+ "step": 2650
858
+ },
859
+ {
860
+ "epoch": 0.8651358344113842,
861
+ "grad_norm": 0.2495715171098709,
862
+ "learning_rate": 2.3475345616680327e-05,
863
+ "loss": 0.6864,
864
+ "mean_token_accuracy": 0.817262338399887,
865
+ "step": 2675
866
+ },
867
+ {
868
+ "epoch": 0.8732212160413971,
869
+ "grad_norm": 0.21743454039096832,
870
+ "learning_rate": 2.0783451029463995e-05,
871
+ "loss": 0.6554,
872
+ "mean_token_accuracy": 0.8261248970031738,
873
+ "step": 2700
874
+ },
875
+ {
876
+ "epoch": 0.8813065976714101,
877
+ "grad_norm": 0.19413378834724426,
878
+ "learning_rate": 1.8248754790737733e-05,
879
+ "loss": 0.691,
880
+ "mean_token_accuracy": 0.8169219958782196,
881
+ "step": 2725
882
+ },
883
+ {
884
+ "epoch": 0.889391979301423,
885
+ "grad_norm": 0.2755376100540161,
886
+ "learning_rate": 1.5872995213308566e-05,
887
+ "loss": 0.6868,
888
+ "mean_token_accuracy": 0.8156666767597198,
889
+ "step": 2750
890
+ },
891
+ {
892
+ "epoch": 0.8974773609314359,
893
+ "grad_norm": 0.23579077422618866,
894
+ "learning_rate": 1.3657801610084563e-05,
895
+ "loss": 0.669,
896
+ "mean_token_accuracy": 0.8254081463813782,
897
+ "step": 2775
898
+ },
899
+ {
900
+ "epoch": 0.9055627425614489,
901
+ "grad_norm": 0.21246632933616638,
902
+ "learning_rate": 1.1604693176680392e-05,
903
+ "loss": 0.6519,
904
+ "mean_token_accuracy": 0.8244655966758728,
905
+ "step": 2800
906
+ },
907
+ {
908
+ "epoch": 0.9136481241914618,
909
+ "grad_norm": 0.29115888476371765,
910
+ "learning_rate": 9.715077949542184e-06,
911
+ "loss": 0.6673,
912
+ "mean_token_accuracy": 0.825350991487503,
913
+ "step": 2825
914
+ },
915
+ {
916
+ "epoch": 0.9217335058214747,
917
+ "grad_norm": 0.24020685255527496,
918
+ "learning_rate": 7.990251840305996e-06,
919
+ "loss": 0.6349,
920
+ "mean_token_accuracy": 0.8334643471240998,
921
+ "step": 2850
922
+ },
923
+ {
924
+ "epoch": 0.9298188874514877,
925
+ "grad_norm": 0.2604895532131195,
926
+ "learning_rate": 6.431397747052342e-06,
927
+ "loss": 0.6659,
928
+ "mean_token_accuracy": 0.8195065236091614,
929
+ "step": 2875
930
+ },
931
+ {
932
+ "epoch": 0.9379042690815006,
933
+ "grad_norm": 0.19995881617069244,
934
+ "learning_rate": 5.039584743066344e-06,
935
+ "loss": 0.706,
936
+ "mean_token_accuracy": 0.8151924252510071,
937
+ "step": 2900
938
+ },
939
+ {
940
+ "epoch": 0.9459896507115135,
941
+ "grad_norm": 0.22976571321487427,
942
+ "learning_rate": 3.815767343659377e-06,
943
+ "loss": 0.6477,
944
+ "mean_token_accuracy": 0.8256394731998443,
945
+ "step": 2925
946
+ },
947
+ {
948
+ "epoch": 0.9540750323415266,
949
+ "grad_norm": 0.22688381373882294,
950
+ "learning_rate": 2.760784851555953e-06,
951
+ "loss": 0.672,
952
+ "mean_token_accuracy": 0.8214276111125947,
953
+ "step": 2950
954
+ },
955
+ {
956
+ "epoch": 0.9621604139715395,
957
+ "grad_norm": 0.23771828413009644,
958
+ "learning_rate": 1.875360781293689e-06,
959
+ "loss": 0.6642,
960
+ "mean_token_accuracy": 0.8211515319347381,
961
+ "step": 2975
962
+ },
963
+ {
964
+ "epoch": 0.9702457956015524,
965
+ "grad_norm": 0.22044353187084198,
966
+ "learning_rate": 1.1601023630319064e-06,
967
+ "loss": 0.6148,
968
+ "mean_token_accuracy": 0.838649377822876,
969
+ "step": 3000
970
+ },
971
+ {
972
+ "epoch": 0.9783311772315654,
973
+ "grad_norm": 0.21916313469409943,
974
+ "learning_rate": 6.155001261089477e-07,
975
+ "loss": 0.7242,
976
+ "mean_token_accuracy": 0.8094534778594971,
977
+ "step": 3025
978
+ },
979
+ {
980
+ "epoch": 0.9864165588615783,
981
+ "grad_norm": 0.20908524096012115,
982
+ "learning_rate": 2.4192756263349826e-07,
983
+ "loss": 0.7293,
984
+ "mean_token_accuracy": 0.8055576062202454,
985
+ "step": 3050
986
+ },
987
+ {
988
+ "epoch": 0.9945019404915912,
989
+ "grad_norm": 0.2526475489139557,
990
+ "learning_rate": 3.9640871341173336e-08,
991
+ "loss": 0.6961,
992
+ "mean_token_accuracy": 0.8134376859664917,
993
+ "step": 3075
994
+ }
995
+ ],
996
+ "logging_steps": 25,
997
+ "max_steps": 3092,
998
+ "num_input_tokens_seen": 0,
999
+ "num_train_epochs": 1,
1000
+ "save_steps": 0,
1001
+ "stateful_callbacks": {
1002
+ "TrainerControl": {
1003
+ "args": {
1004
+ "should_epoch_stop": false,
1005
+ "should_evaluate": false,
1006
+ "should_log": false,
1007
+ "should_save": true,
1008
+ "should_training_stop": true
1009
+ },
1010
+ "attributes": {}
1011
+ }
1012
+ },
1013
+ "total_flos": 1.2584006622747034e+17,
1014
+ "train_batch_size": 1,
1015
+ "trial_name": null,
1016
+ "trial_params": null
1017
+ }