chansung committed on
Commit
3d088d7
·
verified ·
1 Parent(s): 7c802b9

Model save

Browse files
Files changed (4) hide show
  1. README.md +78 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +340 -0
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: llama3.2
4
+ base_model: meta-llama/Llama-3.2-1B
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: llama3-1b-classification-gpt4o-100k
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # llama3-1b-classification-gpt4o-100k
20
+
21
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 2.3686
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 0.0002
43
+ - train_batch_size: 32
44
+ - eval_batch_size: 32
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 8
48
+ - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 512
50
+ - total_eval_batch_size: 256
51
+ - optimizer: AdamW (torch implementation) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 10
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:-----:|:----:|:---------------:|
60
+ | 2.759 | 1.0 | 15 | 2.6168 |
61
+ | 2.3698 | 2.0 | 30 | 2.4602 |
62
+ | 2.2521 | 3.0 | 45 | 2.3993 |
63
+ | 2.1865 | 4.0 | 60 | 2.3780 |
64
+ | 2.1518 | 5.0 | 75 | 2.3706 |
65
+ | 2.1331 | 6.0 | 90 | 2.3701 |
66
+ | 2.1168 | 7.0 | 105 | 2.3705 |
67
+ | 2.1121 | 8.0 | 120 | 2.3697 |
68
+ | 2.1081 | 9.0 | 135 | 2.3696 |
69
+ | 2.1077 | 10.0 | 150 | 2.3686 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.15.1
75
+ - Transformers 4.50.3
76
+ - Pytorch 2.6.0+cu124
77
+ - Datasets 3.5.0
78
+ - Tokenizers 0.21.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 4.607959816470528e+17,
4
+ "train_loss": 2.2735363483428954,
5
+ "train_runtime": 707.5313,
6
+ "train_samples": 92634,
7
+ "train_samples_per_second": 106.978,
8
+ "train_steps_per_second": 0.212
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 4.607959816470528e+17,
4
+ "train_loss": 2.2735363483428954,
5
+ "train_runtime": 707.5313,
6
+ "train_samples": 92634,
7
+ "train_samples_per_second": 106.978,
8
+ "train_steps_per_second": 0.212
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 10.0,
6
+ "eval_steps": 500,
7
+ "global_step": 150,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06666666666666667,
14
+ "grad_norm": 6.182748794555664,
15
+ "learning_rate": 1.3333333333333333e-05,
16
+ "loss": 3.3242,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.3333333333333333,
21
+ "grad_norm": 5.428755283355713,
22
+ "learning_rate": 6.666666666666667e-05,
23
+ "loss": 3.3024,
24
+ "step": 5
25
+ },
26
+ {
27
+ "epoch": 0.6666666666666666,
28
+ "grad_norm": 1.8031461238861084,
29
+ "learning_rate": 0.00013333333333333334,
30
+ "loss": 3.0512,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 1.0,
35
+ "grad_norm": 1.0242935419082642,
36
+ "learning_rate": 0.0002,
37
+ "loss": 2.759,
38
+ "step": 15
39
+ },
40
+ {
41
+ "epoch": 1.0,
42
+ "eval_loss": 2.6168200969696045,
43
+ "eval_runtime": 0.8249,
44
+ "eval_samples_per_second": 2.425,
45
+ "eval_steps_per_second": 1.212,
46
+ "step": 15
47
+ },
48
+ {
49
+ "epoch": 1.3333333333333333,
50
+ "grad_norm": 0.654896080493927,
51
+ "learning_rate": 0.00019932383577419432,
52
+ "loss": 2.5777,
53
+ "step": 20
54
+ },
55
+ {
56
+ "epoch": 1.6666666666666665,
57
+ "grad_norm": 0.4863899052143097,
58
+ "learning_rate": 0.00019730448705798239,
59
+ "loss": 2.451,
60
+ "step": 25
61
+ },
62
+ {
63
+ "epoch": 2.0,
64
+ "grad_norm": 0.3055514991283417,
65
+ "learning_rate": 0.00019396926207859084,
66
+ "loss": 2.3698,
67
+ "step": 30
68
+ },
69
+ {
70
+ "epoch": 2.0,
71
+ "eval_loss": 2.460226058959961,
72
+ "eval_runtime": 0.8278,
73
+ "eval_samples_per_second": 2.416,
74
+ "eval_steps_per_second": 1.208,
75
+ "step": 30
76
+ },
77
+ {
78
+ "epoch": 2.3333333333333335,
79
+ "grad_norm": 0.2144581824541092,
80
+ "learning_rate": 0.00018936326403234125,
81
+ "loss": 2.3071,
82
+ "step": 35
83
+ },
84
+ {
85
+ "epoch": 2.6666666666666665,
86
+ "grad_norm": 0.18541619181632996,
87
+ "learning_rate": 0.00018354878114129367,
88
+ "loss": 2.2806,
89
+ "step": 40
90
+ },
91
+ {
92
+ "epoch": 3.0,
93
+ "grad_norm": 0.16438105702400208,
94
+ "learning_rate": 0.0001766044443118978,
95
+ "loss": 2.2521,
96
+ "step": 45
97
+ },
98
+ {
99
+ "epoch": 3.0,
100
+ "eval_loss": 2.399292230606079,
101
+ "eval_runtime": 0.8291,
102
+ "eval_samples_per_second": 2.412,
103
+ "eval_steps_per_second": 1.206,
104
+ "step": 45
105
+ },
106
+ {
107
+ "epoch": 3.3333333333333335,
108
+ "grad_norm": 0.1444542109966278,
109
+ "learning_rate": 0.0001686241637868734,
110
+ "loss": 2.2256,
111
+ "step": 50
112
+ },
113
+ {
114
+ "epoch": 3.6666666666666665,
115
+ "grad_norm": 0.12350399047136307,
116
+ "learning_rate": 0.00015971585917027862,
117
+ "loss": 2.2069,
118
+ "step": 55
119
+ },
120
+ {
121
+ "epoch": 4.0,
122
+ "grad_norm": 0.12858685851097107,
123
+ "learning_rate": 0.00015000000000000001,
124
+ "loss": 2.1865,
125
+ "step": 60
126
+ },
127
+ {
128
+ "epoch": 4.0,
129
+ "eval_loss": 2.3779759407043457,
130
+ "eval_runtime": 0.8275,
131
+ "eval_samples_per_second": 2.417,
132
+ "eval_steps_per_second": 1.208,
133
+ "step": 60
134
+ },
135
+ {
136
+ "epoch": 4.333333333333333,
137
+ "grad_norm": 0.11426849663257599,
138
+ "learning_rate": 0.0001396079766039157,
139
+ "loss": 2.1796,
140
+ "step": 65
141
+ },
142
+ {
143
+ "epoch": 4.666666666666667,
144
+ "grad_norm": 0.11002852022647858,
145
+ "learning_rate": 0.00012868032327110904,
146
+ "loss": 2.1632,
147
+ "step": 70
148
+ },
149
+ {
150
+ "epoch": 5.0,
151
+ "grad_norm": 0.11463355273008347,
152
+ "learning_rate": 0.00011736481776669306,
153
+ "loss": 2.1518,
154
+ "step": 75
155
+ },
156
+ {
157
+ "epoch": 5.0,
158
+ "eval_loss": 2.3705692291259766,
159
+ "eval_runtime": 0.8277,
160
+ "eval_samples_per_second": 2.416,
161
+ "eval_steps_per_second": 1.208,
162
+ "step": 75
163
+ },
164
+ {
165
+ "epoch": 5.333333333333333,
166
+ "grad_norm": 0.10701094567775726,
167
+ "learning_rate": 0.00010581448289104758,
168
+ "loss": 2.1438,
169
+ "step": 80
170
+ },
171
+ {
172
+ "epoch": 5.666666666666667,
173
+ "grad_norm": 0.10402818769216537,
174
+ "learning_rate": 9.418551710895243e-05,
175
+ "loss": 2.1353,
176
+ "step": 85
177
+ },
178
+ {
179
+ "epoch": 6.0,
180
+ "grad_norm": 0.10872997343540192,
181
+ "learning_rate": 8.263518223330697e-05,
182
+ "loss": 2.1331,
183
+ "step": 90
184
+ },
185
+ {
186
+ "epoch": 6.0,
187
+ "eval_loss": 2.370138645172119,
188
+ "eval_runtime": 0.8279,
189
+ "eval_samples_per_second": 2.416,
190
+ "eval_steps_per_second": 1.208,
191
+ "step": 90
192
+ },
193
+ {
194
+ "epoch": 6.333333333333333,
195
+ "grad_norm": 0.11619067192077637,
196
+ "learning_rate": 7.131967672889101e-05,
197
+ "loss": 2.1258,
198
+ "step": 95
199
+ },
200
+ {
201
+ "epoch": 6.666666666666667,
202
+ "grad_norm": 0.12228544801473618,
203
+ "learning_rate": 6.039202339608432e-05,
204
+ "loss": 2.1196,
205
+ "step": 100
206
+ },
207
+ {
208
+ "epoch": 7.0,
209
+ "grad_norm": 0.11531439423561096,
210
+ "learning_rate": 5.000000000000002e-05,
211
+ "loss": 2.1168,
212
+ "step": 105
213
+ },
214
+ {
215
+ "epoch": 7.0,
216
+ "eval_loss": 2.3705224990844727,
217
+ "eval_runtime": 0.8289,
218
+ "eval_samples_per_second": 2.413,
219
+ "eval_steps_per_second": 1.206,
220
+ "step": 105
221
+ },
222
+ {
223
+ "epoch": 7.333333333333333,
224
+ "grad_norm": 0.10404527932405472,
225
+ "learning_rate": 4.028414082972141e-05,
226
+ "loss": 2.1119,
227
+ "step": 110
228
+ },
229
+ {
230
+ "epoch": 7.666666666666667,
231
+ "grad_norm": 0.10795953124761581,
232
+ "learning_rate": 3.137583621312665e-05,
233
+ "loss": 2.1085,
234
+ "step": 115
235
+ },
236
+ {
237
+ "epoch": 8.0,
238
+ "grad_norm": 0.11347240209579468,
239
+ "learning_rate": 2.339555568810221e-05,
240
+ "loss": 2.1121,
241
+ "step": 120
242
+ },
243
+ {
244
+ "epoch": 8.0,
245
+ "eval_loss": 2.369734048843384,
246
+ "eval_runtime": 0.8296,
247
+ "eval_samples_per_second": 2.411,
248
+ "eval_steps_per_second": 1.205,
249
+ "step": 120
250
+ },
251
+ {
252
+ "epoch": 8.333333333333334,
253
+ "grad_norm": 0.10211199522018433,
254
+ "learning_rate": 1.6451218858706374e-05,
255
+ "loss": 2.1088,
256
+ "step": 125
257
+ },
258
+ {
259
+ "epoch": 8.666666666666666,
260
+ "grad_norm": 0.0971965491771698,
261
+ "learning_rate": 1.0636735967658784e-05,
262
+ "loss": 2.101,
263
+ "step": 130
264
+ },
265
+ {
266
+ "epoch": 9.0,
267
+ "grad_norm": 0.0981559231877327,
268
+ "learning_rate": 6.030737921409169e-06,
269
+ "loss": 2.1081,
270
+ "step": 135
271
+ },
272
+ {
273
+ "epoch": 9.0,
274
+ "eval_loss": 2.3695523738861084,
275
+ "eval_runtime": 0.8287,
276
+ "eval_samples_per_second": 2.413,
277
+ "eval_steps_per_second": 1.207,
278
+ "step": 135
279
+ },
280
+ {
281
+ "epoch": 9.333333333333334,
282
+ "grad_norm": 0.09640078991651535,
283
+ "learning_rate": 2.6955129420176196e-06,
284
+ "loss": 2.1009,
285
+ "step": 140
286
+ },
287
+ {
288
+ "epoch": 9.666666666666666,
289
+ "grad_norm": 0.09775780886411667,
290
+ "learning_rate": 6.761642258056978e-07,
291
+ "loss": 2.1037,
292
+ "step": 145
293
+ },
294
+ {
295
+ "epoch": 10.0,
296
+ "grad_norm": 0.09380096942186356,
297
+ "learning_rate": 0.0,
298
+ "loss": 2.1077,
299
+ "step": 150
300
+ },
301
+ {
302
+ "epoch": 10.0,
303
+ "eval_loss": 2.368610143661499,
304
+ "eval_runtime": 0.8302,
305
+ "eval_samples_per_second": 2.409,
306
+ "eval_steps_per_second": 1.205,
307
+ "step": 150
308
+ },
309
+ {
310
+ "epoch": 10.0,
311
+ "step": 150,
312
+ "total_flos": 4.607959816470528e+17,
313
+ "train_loss": 2.2735363483428954,
314
+ "train_runtime": 707.5313,
315
+ "train_samples_per_second": 106.978,
316
+ "train_steps_per_second": 0.212
317
+ }
318
+ ],
319
+ "logging_steps": 5,
320
+ "max_steps": 150,
321
+ "num_input_tokens_seen": 0,
322
+ "num_train_epochs": 10,
323
+ "save_steps": 100,
324
+ "stateful_callbacks": {
325
+ "TrainerControl": {
326
+ "args": {
327
+ "should_epoch_stop": false,
328
+ "should_evaluate": false,
329
+ "should_log": false,
330
+ "should_save": true,
331
+ "should_training_stop": true
332
+ },
333
+ "attributes": {}
334
+ }
335
+ },
336
+ "total_flos": 4.607959816470528e+17,
337
+ "train_batch_size": 32,
338
+ "trial_name": null,
339
+ "trial_params": null
340
+ }