chansung committed on
Commit
e4e88e3
·
verified ·
1 Parent(s): 34b8b77

Model save

Browse files
Files changed (4) hide show
  1. README.md +78 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +508 -0
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: llama3.2
4
+ base_model: meta-llama/Llama-3.2-1B
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: llama3-1b-summarize-gpt4o-128k
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # llama3-1b-summarize-gpt4o-128k
20
+
21
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 2.7256
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 0.0002
43
+ - train_batch_size: 32
44
+ - eval_batch_size: 32
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 8
48
+ - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 512
50
+ - total_eval_batch_size: 256
51
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 10
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:------:|:----:|:---------------:|
60
+ | 2.1182 | 1.0 | 28 | 2.7821 |
61
+ | 1.8354 | 2.0 | 56 | 2.7378 |
62
+ | 1.7604 | 3.0 | 84 | 2.7259 |
63
+ | 1.7035 | 4.0 | 112 | 2.7250 |
64
+ | 1.6822 | 5.0 | 140 | 2.7263 |
65
+ | 1.6619 | 6.0 | 168 | 2.7254 |
66
+ | 1.6471 | 7.0 | 196 | 2.7241 |
67
+ | 1.6458 | 8.0 | 224 | 2.7253 |
68
+ | 1.641 | 9.0 | 252 | 2.7256 |
69
+ | 1.6383 | 9.6545 | 270 | 2.7256 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.15.1
75
+ - Transformers 4.50.3
76
+ - Pytorch 2.6.0+cu124
77
+ - Datasets 3.5.0
78
+ - Tokenizers 0.21.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.654545454545454,
3
+ "total_flos": 8.156088875152835e+17,
4
+ "train_loss": 1.7710220513520418,
5
+ "train_runtime": 1245.0854,
6
+ "train_samples": 129221,
7
+ "train_samples_per_second": 112.233,
8
+ "train_steps_per_second": 0.217
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.654545454545454,
3
+ "total_flos": 8.156088875152835e+17,
4
+ "train_loss": 1.7710220513520418,
5
+ "train_runtime": 1245.0854,
6
+ "train_samples": 129221,
7
+ "train_samples_per_second": 112.233,
8
+ "train_steps_per_second": 0.217
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 9.654545454545454,
6
+ "eval_steps": 500,
7
+ "global_step": 270,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03636363636363636,
14
+ "grad_norm": 3.691458225250244,
15
+ "learning_rate": 7.4074074074074075e-06,
16
+ "loss": 2.6618,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.18181818181818182,
21
+ "grad_norm": 3.446274518966675,
22
+ "learning_rate": 3.7037037037037037e-05,
23
+ "loss": 2.6408,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 0.36363636363636365,
28
+ "grad_norm": 2.032472848892212,
29
+ "learning_rate": 7.407407407407407e-05,
30
+ "loss": 2.5317,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 0.5454545454545454,
35
+ "grad_norm": 0.9463324546813965,
36
+ "learning_rate": 0.00011111111111111112,
37
+ "loss": 2.36,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 0.7272727272727273,
42
+ "grad_norm": 0.7080094218254089,
43
+ "learning_rate": 0.00014814814814814815,
44
+ "loss": 2.2174,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 0.9090909090909091,
49
+ "grad_norm": 0.5037975311279297,
50
+ "learning_rate": 0.0001851851851851852,
51
+ "loss": 2.1182,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 1.0,
56
+ "eval_loss": 2.782132625579834,
57
+ "eval_runtime": 0.8292,
58
+ "eval_samples_per_second": 12.06,
59
+ "eval_steps_per_second": 1.206,
60
+ "step": 28
61
+ },
62
+ {
63
+ "epoch": 1.0727272727272728,
64
+ "grad_norm": 0.41888633370399475,
65
+ "learning_rate": 0.00019992479525042303,
66
+ "loss": 2.0277,
67
+ "step": 30
68
+ },
69
+ {
70
+ "epoch": 1.2545454545454544,
71
+ "grad_norm": 0.27915704250335693,
72
+ "learning_rate": 0.00019946562024066014,
73
+ "loss": 1.9587,
74
+ "step": 35
75
+ },
76
+ {
77
+ "epoch": 1.4363636363636363,
78
+ "grad_norm": 0.20056034624576569,
79
+ "learning_rate": 0.00019859096633447965,
80
+ "loss": 1.9087,
81
+ "step": 40
82
+ },
83
+ {
84
+ "epoch": 1.6181818181818182,
85
+ "grad_norm": 0.16737522184848785,
86
+ "learning_rate": 0.00019730448705798239,
87
+ "loss": 1.8766,
88
+ "step": 45
89
+ },
90
+ {
91
+ "epoch": 1.8,
92
+ "grad_norm": 0.15048423409461975,
93
+ "learning_rate": 0.00019561155617738797,
94
+ "loss": 1.8481,
95
+ "step": 50
96
+ },
97
+ {
98
+ "epoch": 1.981818181818182,
99
+ "grad_norm": 0.1224176436662674,
100
+ "learning_rate": 0.000193519245252219,
101
+ "loss": 1.8354,
102
+ "step": 55
103
+ },
104
+ {
105
+ "epoch": 2.0,
106
+ "eval_loss": 2.737755537033081,
107
+ "eval_runtime": 0.829,
108
+ "eval_samples_per_second": 12.063,
109
+ "eval_steps_per_second": 1.206,
110
+ "step": 56
111
+ },
112
+ {
113
+ "epoch": 2.1454545454545455,
114
+ "grad_norm": 0.1324545294046402,
115
+ "learning_rate": 0.0001910362940966147,
116
+ "loss": 1.8118,
117
+ "step": 60
118
+ },
119
+ {
120
+ "epoch": 2.327272727272727,
121
+ "grad_norm": 0.11611360311508179,
122
+ "learning_rate": 0.0001881730742721608,
123
+ "loss": 1.7937,
124
+ "step": 65
125
+ },
126
+ {
127
+ "epoch": 2.509090909090909,
128
+ "grad_norm": 0.1148991584777832,
129
+ "learning_rate": 0.00018494154576472976,
130
+ "loss": 1.7791,
131
+ "step": 70
132
+ },
133
+ {
134
+ "epoch": 2.690909090909091,
135
+ "grad_norm": 0.11438702791929245,
136
+ "learning_rate": 0.00018135520702629675,
137
+ "loss": 1.7654,
138
+ "step": 75
139
+ },
140
+ {
141
+ "epoch": 2.8727272727272726,
142
+ "grad_norm": 0.11716635525226593,
143
+ "learning_rate": 0.00017742903859041325,
144
+ "loss": 1.7604,
145
+ "step": 80
146
+ },
147
+ {
148
+ "epoch": 3.0,
149
+ "eval_loss": 2.7259373664855957,
150
+ "eval_runtime": 0.8303,
151
+ "eval_samples_per_second": 12.044,
152
+ "eval_steps_per_second": 1.204,
153
+ "step": 84
154
+ },
155
+ {
156
+ "epoch": 3.036363636363636,
157
+ "grad_norm": 0.1302882581949234,
158
+ "learning_rate": 0.00017317944049686124,
159
+ "loss": 1.7453,
160
+ "step": 85
161
+ },
162
+ {
163
+ "epoch": 3.2181818181818183,
164
+ "grad_norm": 0.12489154189825058,
165
+ "learning_rate": 0.0001686241637868734,
166
+ "loss": 1.7396,
167
+ "step": 90
168
+ },
169
+ {
170
+ "epoch": 3.4,
171
+ "grad_norm": 0.10804688185453415,
172
+ "learning_rate": 0.0001637822363550706,
173
+ "loss": 1.7272,
174
+ "step": 95
175
+ },
176
+ {
177
+ "epoch": 3.581818181818182,
178
+ "grad_norm": 0.1448238343000412,
179
+ "learning_rate": 0.0001586738834678418,
180
+ "loss": 1.7231,
181
+ "step": 100
182
+ },
183
+ {
184
+ "epoch": 3.7636363636363637,
185
+ "grad_norm": 0.12403673678636551,
186
+ "learning_rate": 0.00015332044328016914,
187
+ "loss": 1.7101,
188
+ "step": 105
189
+ },
190
+ {
191
+ "epoch": 3.9454545454545453,
192
+ "grad_norm": 0.11520184576511383,
193
+ "learning_rate": 0.0001477442777037949,
194
+ "loss": 1.7035,
195
+ "step": 110
196
+ },
197
+ {
198
+ "epoch": 4.0,
199
+ "eval_loss": 2.724990129470825,
200
+ "eval_runtime": 0.8296,
201
+ "eval_samples_per_second": 12.053,
202
+ "eval_steps_per_second": 1.205,
203
+ "step": 112
204
+ },
205
+ {
206
+ "epoch": 4.109090909090909,
207
+ "grad_norm": 0.11850611865520477,
208
+ "learning_rate": 0.0001419686789990429,
209
+ "loss": 1.6998,
210
+ "step": 115
211
+ },
212
+ {
213
+ "epoch": 4.290909090909091,
214
+ "grad_norm": 0.141310453414917,
215
+ "learning_rate": 0.00013601777248047105,
216
+ "loss": 1.6942,
217
+ "step": 120
218
+ },
219
+ {
220
+ "epoch": 4.472727272727273,
221
+ "grad_norm": 0.14388997852802277,
222
+ "learning_rate": 0.00012991641574276418,
223
+ "loss": 1.6887,
224
+ "step": 125
225
+ },
226
+ {
227
+ "epoch": 4.654545454545454,
228
+ "grad_norm": 0.11356977373361588,
229
+ "learning_rate": 0.00012369009482781192,
230
+ "loss": 1.6845,
231
+ "step": 130
232
+ },
233
+ {
234
+ "epoch": 4.836363636363636,
235
+ "grad_norm": 0.13505423069000244,
236
+ "learning_rate": 0.00011736481776669306,
237
+ "loss": 1.6801,
238
+ "step": 135
239
+ },
240
+ {
241
+ "epoch": 5.0,
242
+ "grad_norm": 0.18071481585502625,
243
+ "learning_rate": 0.00011096700594125318,
244
+ "loss": 1.6822,
245
+ "step": 140
246
+ },
247
+ {
248
+ "epoch": 5.0,
249
+ "eval_loss": 2.7262730598449707,
250
+ "eval_runtime": 0.8327,
251
+ "eval_samples_per_second": 12.009,
252
+ "eval_steps_per_second": 1.201,
253
+ "step": 140
254
+ },
255
+ {
256
+ "epoch": 5.181818181818182,
257
+ "grad_norm": 0.12405228614807129,
258
+ "learning_rate": 0.00010452338371907064,
259
+ "loss": 1.671,
260
+ "step": 145
261
+ },
262
+ {
263
+ "epoch": 5.363636363636363,
264
+ "grad_norm": 0.15709254145622253,
265
+ "learning_rate": 9.806086682281758e-05,
266
+ "loss": 1.6697,
267
+ "step": 150
268
+ },
269
+ {
270
+ "epoch": 5.545454545454545,
271
+ "grad_norm": 0.1405353993177414,
272
+ "learning_rate": 9.160644990030931e-05,
273
+ "loss": 1.6707,
274
+ "step": 155
275
+ },
276
+ {
277
+ "epoch": 5.7272727272727275,
278
+ "grad_norm": 0.13487176597118378,
279
+ "learning_rate": 8.518709376487515e-05,
280
+ "loss": 1.6619,
281
+ "step": 160
282
+ },
283
+ {
284
+ "epoch": 5.909090909090909,
285
+ "grad_norm": 0.12394227087497711,
286
+ "learning_rate": 7.882961277705895e-05,
287
+ "loss": 1.6619,
288
+ "step": 165
289
+ },
290
+ {
291
+ "epoch": 6.0,
292
+ "eval_loss": 2.7253997325897217,
293
+ "eval_runtime": 0.8321,
294
+ "eval_samples_per_second": 12.017,
295
+ "eval_steps_per_second": 1.202,
296
+ "step": 168
297
+ },
298
+ {
299
+ "epoch": 6.072727272727272,
300
+ "grad_norm": 0.11816684156656265,
301
+ "learning_rate": 7.256056283806986e-05,
302
+ "loss": 1.6573,
303
+ "step": 170
304
+ },
305
+ {
306
+ "epoch": 6.254545454545455,
307
+ "grad_norm": 0.14117498695850372,
308
+ "learning_rate": 6.640613046284581e-05,
309
+ "loss": 1.6622,
310
+ "step": 175
311
+ },
312
+ {
313
+ "epoch": 6.4363636363636365,
314
+ "grad_norm": 0.1342514008283615,
315
+ "learning_rate": 6.039202339608432e-05,
316
+ "loss": 1.6535,
317
+ "step": 180
318
+ },
319
+ {
320
+ "epoch": 6.618181818181818,
321
+ "grad_norm": 0.13483189046382904,
322
+ "learning_rate": 5.4543363228149946e-05,
323
+ "loss": 1.6532,
324
+ "step": 185
325
+ },
326
+ {
327
+ "epoch": 6.8,
328
+ "grad_norm": 0.1636153757572174,
329
+ "learning_rate": 4.888458045941269e-05,
330
+ "loss": 1.6482,
331
+ "step": 190
332
+ },
333
+ {
334
+ "epoch": 6.9818181818181815,
335
+ "grad_norm": 0.1563912183046341,
336
+ "learning_rate": 4.343931245134616e-05,
337
+ "loss": 1.6471,
338
+ "step": 195
339
+ },
340
+ {
341
+ "epoch": 7.0,
342
+ "eval_loss": 2.7240517139434814,
343
+ "eval_runtime": 0.8312,
344
+ "eval_samples_per_second": 12.031,
345
+ "eval_steps_per_second": 1.203,
346
+ "step": 196
347
+ },
348
+ {
349
+ "epoch": 7.1454545454545455,
350
+ "grad_norm": 0.11320989578962326,
351
+ "learning_rate": 3.8230304690654304e-05,
352
+ "loss": 1.6472,
353
+ "step": 200
354
+ },
355
+ {
356
+ "epoch": 7.327272727272727,
357
+ "grad_norm": 0.111383818089962,
358
+ "learning_rate": 3.3279315778858036e-05,
359
+ "loss": 1.6488,
360
+ "step": 205
361
+ },
362
+ {
363
+ "epoch": 7.509090909090909,
364
+ "grad_norm": 0.10844731330871582,
365
+ "learning_rate": 2.8607026544210114e-05,
366
+ "loss": 1.6458,
367
+ "step": 210
368
+ },
369
+ {
370
+ "epoch": 7.690909090909091,
371
+ "grad_norm": 0.10823339223861694,
372
+ "learning_rate": 2.423295365558821e-05,
373
+ "loss": 1.6456,
374
+ "step": 215
375
+ },
376
+ {
377
+ "epoch": 7.872727272727273,
378
+ "grad_norm": 0.10790830850601196,
379
+ "learning_rate": 2.01753680992107e-05,
380
+ "loss": 1.6458,
381
+ "step": 220
382
+ },
383
+ {
384
+ "epoch": 8.0,
385
+ "eval_loss": 2.7252650260925293,
386
+ "eval_runtime": 0.8302,
387
+ "eval_samples_per_second": 12.045,
388
+ "eval_steps_per_second": 1.204,
389
+ "step": 224
390
+ },
391
+ {
392
+ "epoch": 8.036363636363637,
393
+ "grad_norm": 0.11462420970201492,
394
+ "learning_rate": 1.6451218858706374e-05,
395
+ "loss": 1.643,
396
+ "step": 225
397
+ },
398
+ {
399
+ "epoch": 8.218181818181819,
400
+ "grad_norm": 0.10164881497621536,
401
+ "learning_rate": 1.307606211733522e-05,
402
+ "loss": 1.6435,
403
+ "step": 230
404
+ },
405
+ {
406
+ "epoch": 8.4,
407
+ "grad_norm": 0.11715802550315857,
408
+ "learning_rate": 1.0063996278090704e-05,
409
+ "loss": 1.6436,
410
+ "step": 235
411
+ },
412
+ {
413
+ "epoch": 8.581818181818182,
414
+ "grad_norm": 0.1077931597828865,
415
+ "learning_rate": 7.427603073110967e-06,
416
+ "loss": 1.6437,
417
+ "step": 240
418
+ },
419
+ {
420
+ "epoch": 8.763636363636364,
421
+ "grad_norm": 0.09881118685007095,
422
+ "learning_rate": 5.177895008392353e-06,
423
+ "loss": 1.6415,
424
+ "step": 245
425
+ },
426
+ {
427
+ "epoch": 8.945454545454545,
428
+ "grad_norm": 0.0973580852150917,
429
+ "learning_rate": 3.3242693633337983e-06,
430
+ "loss": 1.641,
431
+ "step": 250
432
+ },
433
+ {
434
+ "epoch": 9.0,
435
+ "eval_loss": 2.725569009780884,
436
+ "eval_runtime": 0.8306,
437
+ "eval_samples_per_second": 12.039,
438
+ "eval_steps_per_second": 1.204,
439
+ "step": 252
440
+ },
441
+ {
442
+ "epoch": 9.10909090909091,
443
+ "grad_norm": 0.10264136642217636,
444
+ "learning_rate": 1.874468937261531e-06,
445
+ "loss": 1.6464,
446
+ "step": 255
447
+ },
448
+ {
449
+ "epoch": 9.290909090909091,
450
+ "grad_norm": 0.1021399274468422,
451
+ "learning_rate": 8.345497068998897e-07,
452
+ "loss": 1.6443,
453
+ "step": 260
454
+ },
455
+ {
456
+ "epoch": 9.472727272727273,
457
+ "grad_norm": 0.10423731058835983,
458
+ "learning_rate": 2.088555298867978e-07,
459
+ "loss": 1.6436,
460
+ "step": 265
461
+ },
462
+ {
463
+ "epoch": 9.654545454545454,
464
+ "grad_norm": 0.09860274940729141,
465
+ "learning_rate": 0.0,
466
+ "loss": 1.6383,
467
+ "step": 270
468
+ },
469
+ {
470
+ "epoch": 9.654545454545454,
471
+ "eval_loss": 2.725593328475952,
472
+ "eval_runtime": 0.8317,
473
+ "eval_samples_per_second": 12.024,
474
+ "eval_steps_per_second": 1.202,
475
+ "step": 270
476
+ },
477
+ {
478
+ "epoch": 9.654545454545454,
479
+ "step": 270,
480
+ "total_flos": 8.156088875152835e+17,
481
+ "train_loss": 1.7710220513520418,
482
+ "train_runtime": 1245.0854,
483
+ "train_samples_per_second": 112.233,
484
+ "train_steps_per_second": 0.217
485
+ }
486
+ ],
487
+ "logging_steps": 5,
488
+ "max_steps": 270,
489
+ "num_input_tokens_seen": 0,
490
+ "num_train_epochs": 10,
491
+ "save_steps": 100,
492
+ "stateful_callbacks": {
493
+ "TrainerControl": {
494
+ "args": {
495
+ "should_epoch_stop": false,
496
+ "should_evaluate": false,
497
+ "should_log": false,
498
+ "should_save": true,
499
+ "should_training_stop": true
500
+ },
501
+ "attributes": {}
502
+ }
503
+ },
504
+ "total_flos": 8.156088875152835e+17,
505
+ "train_batch_size": 32,
506
+ "trial_name": null,
507
+ "trial_params": null
508
+ }