chansung committed on
Commit
e9545d2
·
verified ·
1 Parent(s): 5679477

Model save

Browse files
Files changed (4) hide show
  1. README.md +78 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +578 -0
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: llama3.2
4
+ base_model: meta-llama/Llama-3.2-1B
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: llama3-1b-closedqa-gpt4o-100k
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # llama3-1b-closedqa-gpt4o-100k
20
+
21
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 2.5954
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 0.0002
43
+ - train_batch_size: 32
44
+ - eval_batch_size: 32
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 8
48
+ - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 512
50
+ - total_eval_batch_size: 256
51
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 10
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:-----:|:----:|:---------------:|
60
+ | 1.845 | 1.0 | 32 | 2.6849 |
61
+ | 1.6455 | 2.0 | 64 | 2.6129 |
62
+ | 1.5798 | 3.0 | 96 | 2.6014 |
63
+ | 1.546 | 4.0 | 128 | 2.5975 |
64
+ | 1.5244 | 5.0 | 160 | 2.5963 |
65
+ | 1.5072 | 6.0 | 192 | 2.5944 |
66
+ | 1.501 | 7.0 | 224 | 2.5941 |
67
+ | 1.4858 | 8.0 | 256 | 2.5944 |
68
+ | 1.4917 | 9.0 | 288 | 2.5956 |
69
+ | 1.4886 | 10.0 | 320 | 2.5954 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.15.1
75
+ - Transformers 4.50.3
76
+ - Pytorch 2.6.0+cu124
77
+ - Datasets 3.5.0
78
+ - Tokenizers 0.21.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 9.830314275137126e+17,
4
+ "train_loss": 1.5905956603586673,
5
+ "train_runtime": 1497.0202,
6
+ "train_samples": 111440,
7
+ "train_samples_per_second": 109.25,
8
+ "train_steps_per_second": 0.214
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 9.830314275137126e+17,
4
+ "train_loss": 1.5905956603586673,
5
+ "train_runtime": 1497.0202,
6
+ "train_samples": 111440,
7
+ "train_samples_per_second": 109.25,
8
+ "train_steps_per_second": 0.214
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 10.0,
6
+ "eval_steps": 500,
7
+ "global_step": 320,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03125,
14
+ "grad_norm": 2.2531771659851074,
15
+ "learning_rate": 6.25e-06,
16
+ "loss": 2.2918,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.15625,
21
+ "grad_norm": 2.1257269382476807,
22
+ "learning_rate": 3.125e-05,
23
+ "loss": 2.284,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 0.3125,
28
+ "grad_norm": 1.5896614789962769,
29
+ "learning_rate": 6.25e-05,
30
+ "loss": 2.2225,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 0.46875,
35
+ "grad_norm": 0.7286556363105774,
36
+ "learning_rate": 9.375e-05,
37
+ "loss": 2.1021,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 0.625,
42
+ "grad_norm": 0.6398385763168335,
43
+ "learning_rate": 0.000125,
44
+ "loss": 1.9996,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 0.78125,
49
+ "grad_norm": 0.4788893461227417,
50
+ "learning_rate": 0.00015625,
51
+ "loss": 1.9178,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 0.9375,
56
+ "grad_norm": 0.3934371769428253,
57
+ "learning_rate": 0.0001875,
58
+ "loss": 1.845,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 1.0,
63
+ "eval_loss": 2.684882402420044,
64
+ "eval_runtime": 0.8301,
65
+ "eval_samples_per_second": 7.228,
66
+ "eval_steps_per_second": 1.205,
67
+ "step": 32
68
+ },
69
+ {
70
+ "epoch": 1.09375,
71
+ "grad_norm": 0.28933560848236084,
72
+ "learning_rate": 0.00019994645874763658,
73
+ "loss": 1.786,
74
+ "step": 35
75
+ },
76
+ {
77
+ "epoch": 1.25,
78
+ "grad_norm": 0.20734435319900513,
79
+ "learning_rate": 0.00019961946980917456,
80
+ "loss": 1.7402,
81
+ "step": 40
82
+ },
83
+ {
84
+ "epoch": 1.40625,
85
+ "grad_norm": 0.16501550376415253,
86
+ "learning_rate": 0.00019899620837148077,
87
+ "loss": 1.7072,
88
+ "step": 45
89
+ },
90
+ {
91
+ "epoch": 1.5625,
92
+ "grad_norm": 0.13184033334255219,
93
+ "learning_rate": 0.00019807852804032305,
94
+ "loss": 1.6846,
95
+ "step": 50
96
+ },
97
+ {
98
+ "epoch": 1.71875,
99
+ "grad_norm": 0.11590086668729782,
100
+ "learning_rate": 0.00019686915803565934,
101
+ "loss": 1.665,
102
+ "step": 55
103
+ },
104
+ {
105
+ "epoch": 1.875,
106
+ "grad_norm": 0.11442063748836517,
107
+ "learning_rate": 0.0001953716950748227,
108
+ "loss": 1.6455,
109
+ "step": 60
110
+ },
111
+ {
112
+ "epoch": 2.0,
113
+ "eval_loss": 2.612865686416626,
114
+ "eval_runtime": 0.83,
115
+ "eval_samples_per_second": 7.229,
116
+ "eval_steps_per_second": 1.205,
117
+ "step": 64
118
+ },
119
+ {
120
+ "epoch": 2.03125,
121
+ "grad_norm": 0.09601253271102905,
122
+ "learning_rate": 0.0001935905926757326,
123
+ "loss": 1.6364,
124
+ "step": 65
125
+ },
126
+ {
127
+ "epoch": 2.1875,
128
+ "grad_norm": 0.10327372699975967,
129
+ "learning_rate": 0.00019153114791194473,
130
+ "loss": 1.6227,
131
+ "step": 70
132
+ },
133
+ {
134
+ "epoch": 2.34375,
135
+ "grad_norm": 0.09720147401094437,
136
+ "learning_rate": 0.00018919948565893142,
137
+ "loss": 1.611,
138
+ "step": 75
139
+ },
140
+ {
141
+ "epoch": 2.5,
142
+ "grad_norm": 0.09051357209682465,
143
+ "learning_rate": 0.00018660254037844388,
144
+ "loss": 1.6067,
145
+ "step": 80
146
+ },
147
+ {
148
+ "epoch": 2.65625,
149
+ "grad_norm": 0.10204631090164185,
150
+ "learning_rate": 0.0001837480354951308,
151
+ "loss": 1.6009,
152
+ "step": 85
153
+ },
154
+ {
155
+ "epoch": 2.8125,
156
+ "grad_norm": 0.11083294451236725,
157
+ "learning_rate": 0.00018064446042674828,
158
+ "loss": 1.5825,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 2.96875,
163
+ "grad_norm": 0.1064315065741539,
164
+ "learning_rate": 0.0001773010453362737,
165
+ "loss": 1.5798,
166
+ "step": 95
167
+ },
168
+ {
169
+ "epoch": 3.0,
170
+ "eval_loss": 2.6013729572296143,
171
+ "eval_runtime": 0.8292,
172
+ "eval_samples_per_second": 7.236,
173
+ "eval_steps_per_second": 1.206,
174
+ "step": 96
175
+ },
176
+ {
177
+ "epoch": 3.125,
178
+ "grad_norm": 0.0996951013803482,
179
+ "learning_rate": 0.0001737277336810124,
180
+ "loss": 1.5716,
181
+ "step": 100
182
+ },
183
+ {
184
+ "epoch": 3.28125,
185
+ "grad_norm": 0.10399119555950165,
186
+ "learning_rate": 0.00016993515264033672,
187
+ "loss": 1.5691,
188
+ "step": 105
189
+ },
190
+ {
191
+ "epoch": 3.4375,
192
+ "grad_norm": 0.0989290326833725,
193
+ "learning_rate": 0.00016593458151000688,
194
+ "loss": 1.5641,
195
+ "step": 110
196
+ },
197
+ {
198
+ "epoch": 3.59375,
199
+ "grad_norm": 0.108305424451828,
200
+ "learning_rate": 0.00016173791815707051,
201
+ "loss": 1.5539,
202
+ "step": 115
203
+ },
204
+ {
205
+ "epoch": 3.75,
206
+ "grad_norm": 0.11103138327598572,
207
+ "learning_rate": 0.0001573576436351046,
208
+ "loss": 1.5511,
209
+ "step": 120
210
+ },
211
+ {
212
+ "epoch": 3.90625,
213
+ "grad_norm": 0.11553769558668137,
214
+ "learning_rate": 0.0001528067850650368,
215
+ "loss": 1.546,
216
+ "step": 125
217
+ },
218
+ {
219
+ "epoch": 4.0,
220
+ "eval_loss": 2.597513437271118,
221
+ "eval_runtime": 0.8293,
222
+ "eval_samples_per_second": 7.235,
223
+ "eval_steps_per_second": 1.206,
224
+ "step": 128
225
+ },
226
+ {
227
+ "epoch": 4.0625,
228
+ "grad_norm": 0.11389793455600739,
229
+ "learning_rate": 0.00014809887689193877,
230
+ "loss": 1.5361,
231
+ "step": 130
232
+ },
233
+ {
234
+ "epoch": 4.21875,
235
+ "grad_norm": 0.1214594691991806,
236
+ "learning_rate": 0.00014324792063301662,
237
+ "loss": 1.5346,
238
+ "step": 135
239
+ },
240
+ {
241
+ "epoch": 4.375,
242
+ "grad_norm": 0.11472882330417633,
243
+ "learning_rate": 0.000138268343236509,
244
+ "loss": 1.5339,
245
+ "step": 140
246
+ },
247
+ {
248
+ "epoch": 4.53125,
249
+ "grad_norm": 0.10563024878501892,
250
+ "learning_rate": 0.00013317495417533524,
251
+ "loss": 1.5314,
252
+ "step": 145
253
+ },
254
+ {
255
+ "epoch": 4.6875,
256
+ "grad_norm": 0.10731685161590576,
257
+ "learning_rate": 0.00012798290140309923,
258
+ "loss": 1.5284,
259
+ "step": 150
260
+ },
261
+ {
262
+ "epoch": 4.84375,
263
+ "grad_norm": 0.10968785732984543,
264
+ "learning_rate": 0.00012270762630343734,
265
+ "loss": 1.5233,
266
+ "step": 155
267
+ },
268
+ {
269
+ "epoch": 5.0,
270
+ "grad_norm": 0.11469796299934387,
271
+ "learning_rate": 0.00011736481776669306,
272
+ "loss": 1.5244,
273
+ "step": 160
274
+ },
275
+ {
276
+ "epoch": 5.0,
277
+ "eval_loss": 2.5962610244750977,
278
+ "eval_runtime": 0.8296,
279
+ "eval_samples_per_second": 7.233,
280
+ "eval_steps_per_second": 1.205,
281
+ "step": 160
282
+ },
283
+ {
284
+ "epoch": 5.15625,
285
+ "grad_norm": 0.10656954348087311,
286
+ "learning_rate": 0.00011197036553049625,
287
+ "loss": 1.5151,
288
+ "step": 165
289
+ },
290
+ {
291
+ "epoch": 5.3125,
292
+ "grad_norm": 0.12216157466173172,
293
+ "learning_rate": 0.00010654031292301432,
294
+ "loss": 1.5119,
295
+ "step": 170
296
+ },
297
+ {
298
+ "epoch": 5.46875,
299
+ "grad_norm": 0.11443573981523514,
300
+ "learning_rate": 0.00010109080914941824,
301
+ "loss": 1.5107,
302
+ "step": 175
303
+ },
304
+ {
305
+ "epoch": 5.625,
306
+ "grad_norm": 0.11616319417953491,
307
+ "learning_rate": 9.563806126346642e-05,
308
+ "loss": 1.5165,
309
+ "step": 180
310
+ },
311
+ {
312
+ "epoch": 5.78125,
313
+ "grad_norm": 0.1112031489610672,
314
+ "learning_rate": 9.019828596704394e-05,
315
+ "loss": 1.509,
316
+ "step": 185
317
+ },
318
+ {
319
+ "epoch": 5.9375,
320
+ "grad_norm": 0.11148490756750107,
321
+ "learning_rate": 8.478766138100834e-05,
322
+ "loss": 1.5072,
323
+ "step": 190
324
+ },
325
+ {
326
+ "epoch": 6.0,
327
+ "eval_loss": 2.594440460205078,
328
+ "eval_runtime": 0.8285,
329
+ "eval_samples_per_second": 7.242,
330
+ "eval_steps_per_second": 1.207,
331
+ "step": 192
332
+ },
333
+ {
334
+ "epoch": 6.09375,
335
+ "grad_norm": 0.11521880328655243,
336
+ "learning_rate": 7.942227893077652e-05,
337
+ "loss": 1.5015,
338
+ "step": 195
339
+ },
340
+ {
341
+ "epoch": 6.25,
342
+ "grad_norm": 0.11160895973443985,
343
+ "learning_rate": 7.411809548974792e-05,
344
+ "loss": 1.4991,
345
+ "step": 200
346
+ },
347
+ {
348
+ "epoch": 6.40625,
349
+ "grad_norm": 0.11868108808994293,
350
+ "learning_rate": 6.889088592289093e-05,
351
+ "loss": 1.5008,
352
+ "step": 205
353
+ },
354
+ {
355
+ "epoch": 6.5625,
356
+ "grad_norm": 0.11169662326574326,
357
+ "learning_rate": 6.375619617162985e-05,
358
+ "loss": 1.5006,
359
+ "step": 210
360
+ },
361
+ {
362
+ "epoch": 6.71875,
363
+ "grad_norm": 0.11667145788669586,
364
+ "learning_rate": 5.872929701956054e-05,
365
+ "loss": 1.4989,
366
+ "step": 215
367
+ },
368
+ {
369
+ "epoch": 6.875,
370
+ "grad_norm": 0.10703334957361221,
371
+ "learning_rate": 5.382513867649663e-05,
372
+ "loss": 1.501,
373
+ "step": 220
374
+ },
375
+ {
376
+ "epoch": 7.0,
377
+ "eval_loss": 2.594101667404175,
378
+ "eval_runtime": 0.8304,
379
+ "eval_samples_per_second": 7.225,
380
+ "eval_steps_per_second": 1.204,
381
+ "step": 224
382
+ },
383
+ {
384
+ "epoch": 7.03125,
385
+ "grad_norm": 0.11598368734121323,
386
+ "learning_rate": 4.9058306315915826e-05,
387
+ "loss": 1.5005,
388
+ "step": 225
389
+ },
390
+ {
391
+ "epoch": 7.1875,
392
+ "grad_norm": 0.11565965414047241,
393
+ "learning_rate": 4.444297669803981e-05,
394
+ "loss": 1.4975,
395
+ "step": 230
396
+ },
397
+ {
398
+ "epoch": 7.34375,
399
+ "grad_norm": 0.10369732230901718,
400
+ "learning_rate": 3.999287600755192e-05,
401
+ "loss": 1.4969,
402
+ "step": 235
403
+ },
404
+ {
405
+ "epoch": 7.5,
406
+ "grad_norm": 0.10466954857110977,
407
+ "learning_rate": 3.5721239031346066e-05,
408
+ "loss": 1.4949,
409
+ "step": 240
410
+ },
411
+ {
412
+ "epoch": 7.65625,
413
+ "grad_norm": 0.09984403848648071,
414
+ "learning_rate": 3.164076979771287e-05,
415
+ "loss": 1.4901,
416
+ "step": 245
417
+ },
418
+ {
419
+ "epoch": 7.8125,
420
+ "grad_norm": 0.10001492500305176,
421
+ "learning_rate": 2.776360379402445e-05,
422
+ "loss": 1.492,
423
+ "step": 250
424
+ },
425
+ {
426
+ "epoch": 7.96875,
427
+ "grad_norm": 0.10096081346273422,
428
+ "learning_rate": 2.4101271875283817e-05,
429
+ "loss": 1.4858,
430
+ "step": 255
431
+ },
432
+ {
433
+ "epoch": 8.0,
434
+ "eval_loss": 2.5943994522094727,
435
+ "eval_runtime": 0.8348,
436
+ "eval_samples_per_second": 7.187,
437
+ "eval_steps_per_second": 1.198,
438
+ "step": 256
439
+ },
440
+ {
441
+ "epoch": 8.125,
442
+ "grad_norm": 0.09807378798723221,
443
+ "learning_rate": 2.0664665970876496e-05,
444
+ "loss": 1.4889,
445
+ "step": 260
446
+ },
447
+ {
448
+ "epoch": 8.28125,
449
+ "grad_norm": 0.1029752716422081,
450
+ "learning_rate": 1.7464006691513623e-05,
451
+ "loss": 1.4893,
452
+ "step": 265
453
+ },
454
+ {
455
+ "epoch": 8.4375,
456
+ "grad_norm": 0.0935431718826294,
457
+ "learning_rate": 1.4508812932705363e-05,
458
+ "loss": 1.4883,
459
+ "step": 270
460
+ },
461
+ {
462
+ "epoch": 8.59375,
463
+ "grad_norm": 0.09739290177822113,
464
+ "learning_rate": 1.1807873565164506e-05,
465
+ "loss": 1.4858,
466
+ "step": 275
467
+ },
468
+ {
469
+ "epoch": 8.75,
470
+ "grad_norm": 0.10472334176301956,
471
+ "learning_rate": 9.369221296335006e-06,
472
+ "loss": 1.4933,
473
+ "step": 280
474
+ },
475
+ {
476
+ "epoch": 8.90625,
477
+ "grad_norm": 0.09686731547117233,
478
+ "learning_rate": 7.200108780781556e-06,
479
+ "loss": 1.4917,
480
+ "step": 285
481
+ },
482
+ {
483
+ "epoch": 9.0,
484
+ "eval_loss": 2.5955541133880615,
485
+ "eval_runtime": 0.8323,
486
+ "eval_samples_per_second": 7.209,
487
+ "eval_steps_per_second": 1.201,
488
+ "step": 288
489
+ },
490
+ {
491
+ "epoch": 9.0625,
492
+ "grad_norm": 0.09799981117248535,
493
+ "learning_rate": 5.306987050489442e-06,
494
+ "loss": 1.4904,
495
+ "step": 290
496
+ },
497
+ {
498
+ "epoch": 9.21875,
499
+ "grad_norm": 0.10055091232061386,
500
+ "learning_rate": 3.6954863292237297e-06,
501
+ "loss": 1.4813,
502
+ "step": 295
503
+ },
504
+ {
505
+ "epoch": 9.375,
506
+ "grad_norm": 0.09550856798887253,
507
+ "learning_rate": 2.3703992880066638e-06,
508
+ "loss": 1.4873,
509
+ "step": 300
510
+ },
511
+ {
512
+ "epoch": 9.53125,
513
+ "grad_norm": 0.09861158579587936,
514
+ "learning_rate": 1.3356667915121025e-06,
515
+ "loss": 1.4911,
516
+ "step": 305
517
+ },
518
+ {
519
+ "epoch": 9.6875,
520
+ "grad_norm": 0.10015583783388138,
521
+ "learning_rate": 5.943661777680354e-07,
522
+ "loss": 1.4933,
523
+ "step": 310
524
+ },
525
+ {
526
+ "epoch": 9.84375,
527
+ "grad_norm": 0.09337816387414932,
528
+ "learning_rate": 1.487021060236904e-07,
529
+ "loss": 1.49,
530
+ "step": 315
531
+ },
532
+ {
533
+ "epoch": 10.0,
534
+ "grad_norm": 0.08882776647806168,
535
+ "learning_rate": 0.0,
536
+ "loss": 1.4886,
537
+ "step": 320
538
+ },
539
+ {
540
+ "epoch": 10.0,
541
+ "eval_loss": 2.595360517501831,
542
+ "eval_runtime": 0.827,
543
+ "eval_samples_per_second": 7.255,
544
+ "eval_steps_per_second": 1.209,
545
+ "step": 320
546
+ },
547
+ {
548
+ "epoch": 10.0,
549
+ "step": 320,
550
+ "total_flos": 9.830314275137126e+17,
551
+ "train_loss": 1.5905956603586673,
552
+ "train_runtime": 1497.0202,
553
+ "train_samples_per_second": 109.25,
554
+ "train_steps_per_second": 0.214
555
+ }
556
+ ],
557
+ "logging_steps": 5,
558
+ "max_steps": 320,
559
+ "num_input_tokens_seen": 0,
560
+ "num_train_epochs": 10,
561
+ "save_steps": 100,
562
+ "stateful_callbacks": {
563
+ "TrainerControl": {
564
+ "args": {
565
+ "should_epoch_stop": false,
566
+ "should_evaluate": false,
567
+ "should_log": false,
568
+ "should_save": true,
569
+ "should_training_stop": true
570
+ },
571
+ "attributes": {}
572
+ }
573
+ },
574
+ "total_flos": 9.830314275137126e+17,
575
+ "train_batch_size": 32,
576
+ "trial_name": null,
577
+ "trial_params": null
578
+ }