chansung committed on
Commit
3888f2b
·
verified ·
1 Parent(s): 21645dd

Model save

Browse files
Files changed (4) hide show
  1. README.md +78 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +606 -0
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: llama3.2
4
+ base_model: meta-llama/Llama-3.2-1B
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: llama3.1-1b-coding-gpt4o-100k2
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # llama3.1-1b-coding-gpt4o-100k2
20
+
21
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 1.6745
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 0.002
43
+ - train_batch_size: 32
44
+ - eval_batch_size: 32
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 8
48
+ - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 512
50
+ - total_eval_batch_size: 256
51
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 10
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:-----:|:----:|:---------------:|
60
+ | 1.2024 | 1.0 | 34 | 1.7381 |
61
+ | 1.0846 | 2.0 | 68 | 1.6923 |
62
+ | 1.0447 | 3.0 | 102 | 1.6731 |
63
+ | 1.0207 | 4.0 | 136 | 1.6660 |
64
+ | 1.0039 | 5.0 | 170 | 1.6681 |
65
+ | 0.9957 | 6.0 | 204 | 1.6620 |
66
+ | 0.9793 | 7.0 | 238 | 1.6656 |
67
+ | 0.9761 | 8.0 | 272 | 1.6707 |
68
+ | 0.9678 | 9.0 | 306 | 1.6741 |
69
+ | 0.9709 | 10.0 | 340 | 1.6745 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.15.1
75
+ - Transformers 4.50.3
76
+ - Pytorch 2.6.0+cu124
77
+ - Datasets 3.5.0
78
+ - Tokenizers 0.21.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.0444708917333197e+18,
4
+ "train_loss": 1.0473914388348073,
5
+ "train_runtime": 1591.5084,
6
+ "train_samples": 116368,
7
+ "train_samples_per_second": 108.407,
8
+ "train_steps_per_second": 0.214
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.0444708917333197e+18,
4
+ "train_loss": 1.0473914388348073,
5
+ "train_runtime": 1591.5084,
6
+ "train_samples": 116368,
7
+ "train_samples_per_second": 108.407,
8
+ "train_steps_per_second": 0.214
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 10.0,
6
+ "eval_steps": 500,
7
+ "global_step": 340,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.029411764705882353,
14
+ "grad_norm": 2.488327741622925,
15
+ "learning_rate": 5.882352941176471e-05,
16
+ "loss": 1.9016,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.14705882352941177,
21
+ "grad_norm": 1.427666425704956,
22
+ "learning_rate": 0.00029411764705882356,
23
+ "loss": 1.8391,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 0.29411764705882354,
28
+ "grad_norm": 0.6930230259895325,
29
+ "learning_rate": 0.0005882352941176471,
30
+ "loss": 1.5625,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 0.4411764705882353,
35
+ "grad_norm": 0.2596357762813568,
36
+ "learning_rate": 0.0008823529411764706,
37
+ "loss": 1.3849,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 0.5882352941176471,
42
+ "grad_norm": 0.16956600546836853,
43
+ "learning_rate": 0.0011764705882352942,
44
+ "loss": 1.2969,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 0.7352941176470589,
49
+ "grad_norm": 0.12523840367794037,
50
+ "learning_rate": 0.0014705882352941178,
51
+ "loss": 1.2405,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 0.8823529411764706,
56
+ "grad_norm": 0.09968027472496033,
57
+ "learning_rate": 0.0017647058823529412,
58
+ "loss": 1.2024,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 1.0,
63
+ "eval_loss": 1.7381153106689453,
64
+ "eval_runtime": 0.828,
65
+ "eval_samples_per_second": 4.831,
66
+ "eval_steps_per_second": 1.208,
67
+ "step": 34
68
+ },
69
+ {
70
+ "epoch": 1.0294117647058822,
71
+ "grad_norm": 0.366025447845459,
72
+ "learning_rate": 0.001999947298487173,
73
+ "loss": 1.1724,
74
+ "step": 35
75
+ },
76
+ {
77
+ "epoch": 1.1764705882352942,
78
+ "grad_norm": 0.12924526631832123,
79
+ "learning_rate": 0.0019981033287370443,
80
+ "loss": 1.1507,
81
+ "step": 40
82
+ },
83
+ {
84
+ "epoch": 1.3235294117647058,
85
+ "grad_norm": 0.15898454189300537,
86
+ "learning_rate": 0.0019936298356132177,
87
+ "loss": 1.1306,
88
+ "step": 45
89
+ },
90
+ {
91
+ "epoch": 1.4705882352941178,
92
+ "grad_norm": 0.09307724982500076,
93
+ "learning_rate": 0.0019865386046236597,
94
+ "loss": 1.1156,
95
+ "step": 50
96
+ },
97
+ {
98
+ "epoch": 1.6176470588235294,
99
+ "grad_norm": 0.0881420224905014,
100
+ "learning_rate": 0.001976848317759601,
101
+ "loss": 1.1038,
102
+ "step": 55
103
+ },
104
+ {
105
+ "epoch": 1.7647058823529411,
106
+ "grad_norm": 0.08049994707107544,
107
+ "learning_rate": 0.0019645845042774554,
108
+ "loss": 1.0939,
109
+ "step": 60
110
+ },
111
+ {
112
+ "epoch": 1.9117647058823528,
113
+ "grad_norm": 0.07463372498750687,
114
+ "learning_rate": 0.001949779473441478,
115
+ "loss": 1.0846,
116
+ "step": 65
117
+ },
118
+ {
119
+ "epoch": 2.0,
120
+ "eval_loss": 1.6922645568847656,
121
+ "eval_runtime": 0.8305,
122
+ "eval_samples_per_second": 4.817,
123
+ "eval_steps_per_second": 1.204,
124
+ "step": 68
125
+ },
126
+ {
127
+ "epoch": 2.0588235294117645,
128
+ "grad_norm": 0.09218032658100128,
129
+ "learning_rate": 0.0019324722294043557,
130
+ "loss": 1.0691,
131
+ "step": 70
132
+ },
133
+ {
134
+ "epoch": 2.2058823529411766,
135
+ "grad_norm": 0.09160086512565613,
136
+ "learning_rate": 0.0019127083684499803,
137
+ "loss": 1.0627,
138
+ "step": 75
139
+ },
140
+ {
141
+ "epoch": 2.3529411764705883,
142
+ "grad_norm": 0.08734273165464401,
143
+ "learning_rate": 0.0018905399588691164,
144
+ "loss": 1.0554,
145
+ "step": 80
146
+ },
147
+ {
148
+ "epoch": 2.5,
149
+ "grad_norm": 0.16841983795166016,
150
+ "learning_rate": 0.001866025403784439,
151
+ "loss": 1.0504,
152
+ "step": 85
153
+ },
154
+ {
155
+ "epoch": 2.6470588235294117,
156
+ "grad_norm": 0.0975656732916832,
157
+ "learning_rate": 0.0018392292872863268,
158
+ "loss": 1.0538,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 2.7941176470588234,
163
+ "grad_norm": 0.08383582532405853,
164
+ "learning_rate": 0.0018102222042847736,
165
+ "loss": 1.0443,
166
+ "step": 95
167
+ },
168
+ {
169
+ "epoch": 2.9411764705882355,
170
+ "grad_norm": 0.09132234007120132,
171
+ "learning_rate": 0.0017790805745256705,
172
+ "loss": 1.0447,
173
+ "step": 100
174
+ },
175
+ {
176
+ "epoch": 3.0,
177
+ "eval_loss": 1.6731293201446533,
178
+ "eval_runtime": 0.8295,
179
+ "eval_samples_per_second": 4.822,
180
+ "eval_steps_per_second": 1.206,
181
+ "step": 102
182
+ },
183
+ {
184
+ "epoch": 3.088235294117647,
185
+ "grad_norm": 0.09759514033794403,
186
+ "learning_rate": 0.0017458864412614435,
187
+ "loss": 1.0375,
188
+ "step": 105
189
+ },
190
+ {
191
+ "epoch": 3.235294117647059,
192
+ "grad_norm": 0.09400475025177002,
193
+ "learning_rate": 0.0017107272551064472,
194
+ "loss": 1.0277,
195
+ "step": 110
196
+ },
197
+ {
198
+ "epoch": 3.3823529411764706,
199
+ "grad_norm": 0.07706195116043091,
200
+ "learning_rate": 0.0016736956436465573,
201
+ "loss": 1.0239,
202
+ "step": 115
203
+ },
204
+ {
205
+ "epoch": 3.5294117647058822,
206
+ "grad_norm": 0.08647977560758591,
207
+ "learning_rate": 0.0016348891674099228,
208
+ "loss": 1.0229,
209
+ "step": 120
210
+ },
211
+ {
212
+ "epoch": 3.6764705882352944,
213
+ "grad_norm": 0.08289068937301636,
214
+ "learning_rate": 0.0015944100628417868,
215
+ "loss": 1.0238,
216
+ "step": 125
217
+ },
218
+ {
219
+ "epoch": 3.8235294117647056,
220
+ "grad_norm": 0.08053147792816162,
221
+ "learning_rate": 0.0015523649729605059,
222
+ "loss": 1.0195,
223
+ "step": 130
224
+ },
225
+ {
226
+ "epoch": 3.9705882352941178,
227
+ "grad_norm": 0.08788559585809708,
228
+ "learning_rate": 0.001508864666404365,
229
+ "loss": 1.0207,
230
+ "step": 135
231
+ },
232
+ {
233
+ "epoch": 4.0,
234
+ "eval_loss": 1.6660211086273193,
235
+ "eval_runtime": 0.8294,
236
+ "eval_samples_per_second": 4.823,
237
+ "eval_steps_per_second": 1.206,
238
+ "step": 136
239
+ },
240
+ {
241
+ "epoch": 4.117647058823529,
242
+ "grad_norm": 0.0886395052075386,
243
+ "learning_rate": 0.0014640237456093634,
244
+ "loss": 1.007,
245
+ "step": 140
246
+ },
247
+ {
248
+ "epoch": 4.264705882352941,
249
+ "grad_norm": 0.08254272490739822,
250
+ "learning_rate": 0.0014179603448867834,
251
+ "loss": 1.0049,
252
+ "step": 145
253
+ },
254
+ {
255
+ "epoch": 4.411764705882353,
256
+ "grad_norm": 0.08430016040802002,
257
+ "learning_rate": 0.0013707958191959608,
258
+ "loss": 1.0103,
259
+ "step": 150
260
+ },
261
+ {
262
+ "epoch": 4.5588235294117645,
263
+ "grad_norm": 0.0816187709569931,
264
+ "learning_rate": 0.001322654424432195,
265
+ "loss": 1.0011,
266
+ "step": 155
267
+ },
268
+ {
269
+ "epoch": 4.705882352941177,
270
+ "grad_norm": 0.08980533480644226,
271
+ "learning_rate": 0.0012736629900720832,
272
+ "loss": 1.0107,
273
+ "step": 160
274
+ },
275
+ {
276
+ "epoch": 4.852941176470588,
277
+ "grad_norm": 0.08757297694683075,
278
+ "learning_rate": 0.0012239505850387032,
279
+ "loss": 1.0035,
280
+ "step": 165
281
+ },
282
+ {
283
+ "epoch": 5.0,
284
+ "grad_norm": 0.08535553514957428,
285
+ "learning_rate": 0.0011736481776669307,
286
+ "loss": 1.0039,
287
+ "step": 170
288
+ },
289
+ {
290
+ "epoch": 5.0,
291
+ "eval_loss": 1.668144941329956,
292
+ "eval_runtime": 0.8286,
293
+ "eval_samples_per_second": 4.827,
294
+ "eval_steps_per_second": 1.207,
295
+ "step": 170
296
+ },
297
+ {
298
+ "epoch": 5.147058823529412,
299
+ "grad_norm": 0.08561732620000839,
300
+ "learning_rate": 0.0011228882906647141,
301
+ "loss": 0.9888,
302
+ "step": 175
303
+ },
304
+ {
305
+ "epoch": 5.294117647058823,
306
+ "grad_norm": 0.08237646520137787,
307
+ "learning_rate": 0.0010718046519793277,
308
+ "loss": 0.994,
309
+ "step": 180
310
+ },
311
+ {
312
+ "epoch": 5.4411764705882355,
313
+ "grad_norm": 0.07582972943782806,
314
+ "learning_rate": 0.0010205318424883906,
315
+ "loss": 0.9953,
316
+ "step": 185
317
+ },
318
+ {
319
+ "epoch": 5.588235294117647,
320
+ "grad_norm": 0.07832607626914978,
321
+ "learning_rate": 0.0009692049414438299,
322
+ "loss": 0.991,
323
+ "step": 190
324
+ },
325
+ {
326
+ "epoch": 5.735294117647059,
327
+ "grad_norm": 0.08117620646953583,
328
+ "learning_rate": 0.0009179591706028624,
329
+ "loss": 0.9931,
330
+ "step": 195
331
+ },
332
+ {
333
+ "epoch": 5.882352941176471,
334
+ "grad_norm": 0.07949723303318024,
335
+ "learning_rate": 0.0008669295379835467,
336
+ "loss": 0.9957,
337
+ "step": 200
338
+ },
339
+ {
340
+ "epoch": 6.0,
341
+ "eval_loss": 1.6619951725006104,
342
+ "eval_runtime": 0.8305,
343
+ "eval_samples_per_second": 4.817,
344
+ "eval_steps_per_second": 1.204,
345
+ "step": 204
346
+ },
347
+ {
348
+ "epoch": 6.029411764705882,
349
+ "grad_norm": 0.07223918288946152,
350
+ "learning_rate": 0.0008162504821834296,
351
+ "loss": 0.9855,
352
+ "step": 205
353
+ },
354
+ {
355
+ "epoch": 6.176470588235294,
356
+ "grad_norm": 0.0734533816576004,
357
+ "learning_rate": 0.0007660555181983517,
358
+ "loss": 0.9822,
359
+ "step": 210
360
+ },
361
+ {
362
+ "epoch": 6.323529411764706,
363
+ "grad_norm": 0.07729926705360413,
364
+ "learning_rate": 0.0007164768856744892,
365
+ "loss": 0.9813,
366
+ "step": 215
367
+ },
368
+ {
369
+ "epoch": 6.470588235294118,
370
+ "grad_norm": 0.0778665617108345,
371
+ "learning_rate": 0.0006676452005203405,
372
+ "loss": 0.9853,
373
+ "step": 220
374
+ },
375
+ {
376
+ "epoch": 6.617647058823529,
377
+ "grad_norm": 0.06967757642269135,
378
+ "learning_rate": 0.0006196891107964744,
379
+ "loss": 0.9837,
380
+ "step": 225
381
+ },
382
+ {
383
+ "epoch": 6.764705882352941,
384
+ "grad_norm": 0.07201401889324188,
385
+ "learning_rate": 0.0005727349577896194,
386
+ "loss": 0.9823,
387
+ "step": 230
388
+ },
389
+ {
390
+ "epoch": 6.911764705882353,
391
+ "grad_norm": 0.07436826825141907,
392
+ "learning_rate": 0.00052690644316399,
393
+ "loss": 0.9793,
394
+ "step": 235
395
+ },
396
+ {
397
+ "epoch": 7.0,
398
+ "eval_loss": 1.6655919551849365,
399
+ "eval_runtime": 0.8312,
400
+ "eval_samples_per_second": 4.812,
401
+ "eval_steps_per_second": 1.203,
402
+ "step": 238
403
+ },
404
+ {
405
+ "epoch": 7.0588235294117645,
406
+ "grad_norm": 0.07127279043197632,
407
+ "learning_rate": 0.0004823243030667576,
408
+ "loss": 0.98,
409
+ "step": 240
410
+ },
411
+ {
412
+ "epoch": 7.205882352941177,
413
+ "grad_norm": 0.06584794819355011,
414
+ "learning_rate": 0.0004391059900462304,
415
+ "loss": 0.9738,
416
+ "step": 245
417
+ },
418
+ {
419
+ "epoch": 7.352941176470588,
420
+ "grad_norm": 0.06734811514616013,
421
+ "learning_rate": 0.0003973653636207437,
422
+ "loss": 0.9705,
423
+ "step": 250
424
+ },
425
+ {
426
+ "epoch": 7.5,
427
+ "grad_norm": 0.06853578239679337,
428
+ "learning_rate": 0.0003572123903134606,
429
+ "loss": 0.9778,
430
+ "step": 255
431
+ },
432
+ {
433
+ "epoch": 7.647058823529412,
434
+ "grad_norm": 0.0678030252456665,
435
+ "learning_rate": 0.0003187528539433457,
436
+ "loss": 0.9722,
437
+ "step": 260
438
+ },
439
+ {
440
+ "epoch": 7.794117647058823,
441
+ "grad_norm": 0.06483301520347595,
442
+ "learning_rate": 0.0002820880769355582,
443
+ "loss": 0.9782,
444
+ "step": 265
445
+ },
446
+ {
447
+ "epoch": 7.9411764705882355,
448
+ "grad_norm": 0.06444835662841797,
449
+ "learning_rate": 0.00024731465338547555,
450
+ "loss": 0.9761,
451
+ "step": 270
452
+ },
453
+ {
454
+ "epoch": 8.0,
455
+ "eval_loss": 1.670668125152588,
456
+ "eval_runtime": 0.8308,
457
+ "eval_samples_per_second": 4.815,
458
+ "eval_steps_per_second": 1.204,
459
+ "step": 272
460
+ },
461
+ {
462
+ "epoch": 8.088235294117647,
463
+ "grad_norm": 0.06315235048532486,
464
+ "learning_rate": 0.00021452419457960138,
465
+ "loss": 0.9718,
466
+ "step": 275
467
+ },
468
+ {
469
+ "epoch": 8.235294117647058,
470
+ "grad_norm": 0.06345119327306747,
471
+ "learning_rate": 0.0001838030876437784,
472
+ "loss": 0.977,
473
+ "step": 280
474
+ },
475
+ {
476
+ "epoch": 8.382352941176471,
477
+ "grad_norm": 0.0637635886669159,
478
+ "learning_rate": 0.00015523226795456348,
479
+ "loss": 0.9741,
480
+ "step": 285
481
+ },
482
+ {
483
+ "epoch": 8.529411764705882,
484
+ "grad_norm": 0.06295765936374664,
485
+ "learning_rate": 0.00012888700591334225,
486
+ "loss": 0.9698,
487
+ "step": 290
488
+ },
489
+ {
490
+ "epoch": 8.676470588235293,
491
+ "grad_norm": 0.06005469709634781,
492
+ "learning_rate": 0.00010483670864493777,
493
+ "loss": 0.967,
494
+ "step": 295
495
+ },
496
+ {
497
+ "epoch": 8.823529411764707,
498
+ "grad_norm": 0.060096003115177155,
499
+ "learning_rate": 8.31447371431372e-05,
500
+ "loss": 0.9651,
501
+ "step": 300
502
+ },
503
+ {
504
+ "epoch": 8.970588235294118,
505
+ "grad_norm": 0.05939820781350136,
506
+ "learning_rate": 6.386823934487617e-05,
507
+ "loss": 0.9678,
508
+ "step": 305
509
+ },
510
+ {
511
+ "epoch": 9.0,
512
+ "eval_loss": 1.6740888357162476,
513
+ "eval_runtime": 0.8293,
514
+ "eval_samples_per_second": 4.824,
515
+ "eval_steps_per_second": 1.206,
516
+ "step": 306
517
+ },
518
+ {
519
+ "epoch": 9.117647058823529,
520
+ "grad_norm": 0.058787424117326736,
521
+ "learning_rate": 4.705799957284351e-05,
522
+ "loss": 0.9715,
523
+ "step": 310
524
+ },
525
+ {
526
+ "epoch": 9.264705882352942,
527
+ "grad_norm": 0.057846549898386,
528
+ "learning_rate": 3.275830474315855e-05,
529
+ "loss": 0.9632,
530
+ "step": 315
531
+ },
532
+ {
533
+ "epoch": 9.411764705882353,
534
+ "grad_norm": 0.05796463415026665,
535
+ "learning_rate": 2.1006827690595476e-05,
536
+ "loss": 0.9622,
537
+ "step": 320
538
+ },
539
+ {
540
+ "epoch": 9.558823529411764,
541
+ "grad_norm": 0.057375218719244,
542
+ "learning_rate": 1.1834527918740623e-05,
543
+ "loss": 0.9712,
544
+ "step": 325
545
+ },
546
+ {
547
+ "epoch": 9.705882352941176,
548
+ "grad_norm": 0.05756480246782303,
549
+ "learning_rate": 5.265570036553813e-06,
550
+ "loss": 0.9672,
551
+ "step": 330
552
+ },
553
+ {
554
+ "epoch": 9.852941176470589,
555
+ "grad_norm": 0.05712839215993881,
556
+ "learning_rate": 1.3172600962190196e-06,
557
+ "loss": 0.9695,
558
+ "step": 335
559
+ },
560
+ {
561
+ "epoch": 10.0,
562
+ "grad_norm": 0.057304926216602325,
563
+ "learning_rate": 0.0,
564
+ "loss": 0.9709,
565
+ "step": 340
566
+ },
567
+ {
568
+ "epoch": 10.0,
569
+ "eval_loss": 1.6745353937149048,
570
+ "eval_runtime": 0.8292,
571
+ "eval_samples_per_second": 4.824,
572
+ "eval_steps_per_second": 1.206,
573
+ "step": 340
574
+ },
575
+ {
576
+ "epoch": 10.0,
577
+ "step": 340,
578
+ "total_flos": 1.0444708917333197e+18,
579
+ "train_loss": 1.0473914388348073,
580
+ "train_runtime": 1591.5084,
581
+ "train_samples_per_second": 108.407,
582
+ "train_steps_per_second": 0.214
583
+ }
584
+ ],
585
+ "logging_steps": 5,
586
+ "max_steps": 340,
587
+ "num_input_tokens_seen": 0,
588
+ "num_train_epochs": 10,
589
+ "save_steps": 100,
590
+ "stateful_callbacks": {
591
+ "TrainerControl": {
592
+ "args": {
593
+ "should_epoch_stop": false,
594
+ "should_evaluate": false,
595
+ "should_log": false,
596
+ "should_save": true,
597
+ "should_training_stop": true
598
+ },
599
+ "attributes": {}
600
+ }
601
+ },
602
+ "total_flos": 1.0444708917333197e+18,
603
+ "train_batch_size": 32,
604
+ "trial_name": null,
605
+ "trial_params": null
606
+ }