SodaXII committed (verified)
Commit dfa928c · Parent: 8f89412

Model save
README.md CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model is a fine-tuned version of [apple/mobilevit-small](https://huggingface.co/apple/mobilevit-small) on an unknown dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.2911
- - Accuracy: 0.9228
+ - Loss: 0.4089
+ - Accuracy: 0.9295
 
  ## Model description
 
@@ -45,43 +45,73 @@ The following hyperparameters were used during training:
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
  - lr_scheduler_type: cosine_with_restarts
  - lr_scheduler_warmup_steps: 256
- - num_epochs: 15
+ - num_epochs: 30
  - mixed_precision_training: Native AMP
 
  ### Training results
 
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
- |:-------------:|:-----:|:----:|:---------------:|:--------:|
- | 2.0561 | 0.5 | 64 | 2.0213 | 0.2886 |
- | 1.9819 | 1.0 | 128 | 1.8788 | 0.5503 |
- | 1.771 | 1.5 | 192 | 1.5291 | 0.6107 |
- | 1.3911 | 2.0 | 256 | 1.0706 | 0.7349 |
- | 1.0026 | 2.5 | 320 | 0.7560 | 0.8054 |
- | 0.7657 | 3.0 | 384 | 0.6180 | 0.8356 |
- | 0.6082 | 3.5 | 448 | 0.5422 | 0.8389 |
- | 0.5313 | 4.0 | 512 | 0.4946 | 0.8523 |
- | 0.4623 | 4.5 | 576 | 0.4512 | 0.8758 |
- | 0.4212 | 5.0 | 640 | 0.4322 | 0.8792 |
- | 0.4025 | 5.5 | 704 | 0.4259 | 0.8893 |
- | 0.3892 | 6.0 | 768 | 0.4238 | 0.8859 |
- | 0.3959 | 6.5 | 832 | 0.4083 | 0.8859 |
- | 0.3279 | 7.0 | 896 | 0.3750 | 0.8826 |
- | 0.2793 | 7.5 | 960 | 0.3350 | 0.8993 |
- | 0.222 | 8.0 | 1024 | 0.3208 | 0.8960 |
- | 0.1862 | 8.5 | 1088 | 0.3128 | 0.8993 |
- | 0.1717 | 9.0 | 1152 | 0.3049 | 0.9027 |
- | 0.1408 | 9.5 | 1216 | 0.3010 | 0.9027 |
- | 0.1507 | 10.0 | 1280 | 0.3240 | 0.9161 |
- | 0.1369 | 10.5 | 1344 | 0.3063 | 0.9060 |
- | 0.1389 | 11.0 | 1408 | 0.3045 | 0.9060 |
- | 0.1199 | 11.5 | 1472 | 0.3062 | 0.9094 |
- | 0.1003 | 12.0 | 1536 | 0.3131 | 0.9128 |
- | 0.0756 | 12.5 | 1600 | 0.3002 | 0.9228 |
- | 0.0636 | 13.0 | 1664 | 0.3177 | 0.9128 |
- | 0.058 | 13.5 | 1728 | 0.3143 | 0.9228 |
- | 0.0566 | 14.0 | 1792 | 0.3136 | 0.9195 |
- | 0.0516 | 14.5 | 1856 | 0.3447 | 0.9161 |
- | 0.0426 | 15.0 | 1920 | 0.2911 | 0.9228 |
+ | Training Loss | Epoch | Step | Accuracy | Validation Loss |
+ |:-------------:|:-----:|:----:|:--------:|:---------------:|
+ | 2.0561 | 0.5 | 64 | 0.2886 | 2.0213 |
+ | 1.9819 | 1.0 | 128 | 0.5503 | 1.8788 |
+ | 1.771 | 1.5 | 192 | 0.6107 | 1.5291 |
+ | 1.3911 | 2.0 | 256 | 0.7349 | 1.0706 |
+ | 1.0026 | 2.5 | 320 | 0.8054 | 0.7560 |
+ | 0.7657 | 3.0 | 384 | 0.8356 | 0.6180 |
+ | 0.6082 | 3.5 | 448 | 0.8389 | 0.5422 |
+ | 0.5313 | 4.0 | 512 | 0.8523 | 0.4946 |
+ | 0.4623 | 4.5 | 576 | 0.8758 | 0.4512 |
+ | 0.4212 | 5.0 | 640 | 0.8792 | 0.4322 |
+ | 0.4025 | 5.5 | 704 | 0.8893 | 0.4259 |
+ | 0.3892 | 6.0 | 768 | 0.8859 | 0.4238 |
+ | 0.3959 | 6.5 | 832 | 0.8859 | 0.4083 |
+ | 0.3279 | 7.0 | 896 | 0.8826 | 0.3750 |
+ | 0.2793 | 7.5 | 960 | 0.8993 | 0.3350 |
+ | 0.222 | 8.0 | 1024 | 0.8960 | 0.3208 |
+ | 0.1862 | 8.5 | 1088 | 0.8993 | 0.3128 |
+ | 0.1717 | 9.0 | 1152 | 0.9027 | 0.3049 |
+ | 0.1408 | 9.5 | 1216 | 0.9027 | 0.3010 |
+ | 0.1507 | 10.0 | 1280 | 0.9161 | 0.3240 |
+ | 0.1369 | 10.5 | 1344 | 0.9060 | 0.3063 |
+ | 0.1389 | 11.0 | 1408 | 0.9060 | 0.3045 |
+ | 0.1199 | 11.5 | 1472 | 0.9094 | 0.3062 |
+ | 0.1003 | 12.0 | 1536 | 0.9128 | 0.3131 |
+ | 0.0756 | 12.5 | 1600 | 0.9228 | 0.3002 |
+ | 0.0636 | 13.0 | 1664 | 0.9128 | 0.3177 |
+ | 0.058 | 13.5 | 1728 | 0.9228 | 0.3143 |
+ | 0.0566 | 14.0 | 1792 | 0.9195 | 0.3136 |
+ | 0.0516 | 14.5 | 1856 | 0.9161 | 0.3447 |
+ | 0.0426 | 15.0 | 1920 | 0.9228 | 0.2911 |
+ | 0.0513 | 15.5 | 1984 | 0.9228 | 0.3028 |
+ | 0.0447 | 16.0 | 2048 | 0.9195 | 0.3328 |
+ | 0.0332 | 16.5 | 2112 | 0.9262 | 0.3193 |
+ | 0.0358 | 17.0 | 2176 | 0.9161 | 0.3385 |
+ | 0.0343 | 17.5 | 2240 | 0.9295 | 0.3297 |
+ | 0.0291 | 18.0 | 2304 | 0.9161 | 0.3518 |
+ | 0.0287 | 18.5 | 2368 | 0.9195 | 0.3224 |
+ | 0.0197 | 19.0 | 2432 | 0.9228 | 0.3099 |
+ | 0.0223 | 19.5 | 2496 | 0.9295 | 0.3305 |
+ | 0.0282 | 20.0 | 2560 | 0.9161 | 0.3378 |
+ | 0.0231 | 20.5 | 2624 | 0.9228 | 0.3077 |
+ | 0.0251 | 21.0 | 2688 | 0.9161 | 0.3520 |
+ | 0.021 | 21.5 | 2752 | 0.9228 | 0.3506 |
+ | 0.0222 | 22.0 | 2816 | 0.9128 | 0.3561 |
+ | 0.016 | 22.5 | 2880 | 0.9195 | 0.3482 |
+ | 0.0163 | 23.0 | 2944 | 0.9228 | 0.3429 |
+ | 0.0114 | 23.5 | 3008 | 0.9329 | 0.3839 |
+ | 0.0106 | 24.0 | 3072 | 0.9262 | 0.4066 |
+ | 0.0111 | 24.5 | 3136 | 0.9329 | 0.4003 |
+ | 0.009 | 25.0 | 3200 | 0.9262 | 0.4000 |
+ | 0.0088 | 25.5 | 3264 | 0.9228 | 0.3667 |
+ | 0.0057 | 26.0 | 3328 | 0.9195 | 0.3587 |
+ | 0.0073 | 26.5 | 3392 | 0.9329 | 0.3686 |
+ | 0.0085 | 27.0 | 3456 | 0.9195 | 0.3676 |
+ | 0.0087 | 27.5 | 3520 | 0.9262 | 0.4251 |
+ | 0.0061 | 28.0 | 3584 | 0.9195 | 0.3879 |
+ | 0.0062 | 28.5 | 3648 | 0.9195 | 0.3865 |
+ | 0.0068 | 29.0 | 3712 | 0.9262 | 0.3943 |
+ | 0.0092 | 29.5 | 3776 | 0.9228 | 0.4064 |
+ | 0.0078 | 30.0 | 3840 | 0.9295 | 0.4089 |
 
 
  ### Framework versions
@@ -89,4 +119,4 @@ The following hyperparameters were used during training:
  - Transformers 4.48.3
  - Pytorch 2.5.1+cu124
  - Datasets 3.3.2
- - Tokenizers 0.21.0
+ - Tokenizers 0.21.1
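
For orientation, the hyperparameters in the README above map roughly onto the `TrainingArguments`/`Trainer` setup sketched below. This is a minimal reconstruction, not the actual training script from this repository: the dataset, image preprocessing, label count, and output directory are placeholders, while the numeric settings (learning rate 3e-05, batch size 64, 256 warmup steps, evaluation and checkpointing every 64 steps, 30 epochs, native AMP) come from the README and trainer_state.json in this commit.

```python
# Hedged sketch of the fine-tuning configuration implied by this commit.
# Dataset loading, preprocessing, and the number of classes are assumptions.
import numpy as np
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    Trainer,
    TrainingArguments,
)

model_name = "apple/mobilevit-small"
# The processor would turn images into pixel_values; preprocessing is omitted here.
processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=8,                    # assumption: set to the real number of classes
    ignore_mismatched_sizes=True,    # classifier head is re-initialized for the new labels
)

args = TrainingArguments(
    output_dir="mobilevit-small_rice-leaf-disease-augmented-v4_fft",  # abbreviated path
    learning_rate=3e-05,             # peak LR reached after warmup (trainer_state.json)
    per_device_train_batch_size=64,  # train_batch_size in trainer_state.json
    per_device_eval_batch_size=64,   # assumption
    num_train_epochs=30,
    optim="adamw_torch",             # AdamW, betas=(0.9, 0.999), eps=1e-08
    lr_scheduler_type="cosine_with_restarts",
    warmup_steps=256,
    fp16=True,                       # "Native AMP" mixed precision
    eval_strategy="steps",
    eval_steps=64,
    save_steps=64,
    logging_steps=64,
    load_best_model_at_end=True,
    metric_for_best_model="loss",    # best_metric in trainer_state.json tracks eval_loss
)

def compute_metrics(eval_pred):
    # Accuracy as reported in the "Training results" table.
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float((preds == labels).mean())}

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=train_ds,        # placeholder: the dataset is not part of this commit
#     eval_dataset=eval_ds,
#     compute_metrics=compute_metrics,
# )
# trainer.train()
```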
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 15.0,
+ "total_flos": 7.164784986292224e+17,
+ "train_loss": 0.48338687382638457,
+ "train_runtime": 5148.0361,
+ "train_samples_per_second": 23.869,
+ "train_steps_per_second": 0.373
+ }
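
As a quick sanity check, the throughput figures above are mutually consistent with the step count and batch size recorded in trainer_state.json (1920 steps, batch size 64):

```python
# Consistency check of the reported throughput
# (step count and batch size taken from trainer_state.json in this commit).
train_runtime = 5148.0361   # seconds
global_step = 1920
train_batch_size = 64

steps_per_second = global_step / train_runtime
samples_per_second = steps_per_second * train_batch_size
print(f"{steps_per_second:.3f} steps/s, {samples_per_second:.3f} samples/s")
# -> about 0.373 steps/s and 23.87 samples/s, matching the JSON above.
```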
logs/events.out.tfevents.1743190156.4d701d7b0712.5422.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:70b2c8d08479bc0fe7360f0f7efbaf4230964f41739d5218dbe876fc335fb782
+ size 22373
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 15.0,
+ "total_flos": 7.164784986292224e+17,
+ "train_loss": 0.48338687382638457,
+ "train_runtime": 5148.0361,
+ "train_samples_per_second": 23.869,
+ "train_steps_per_second": 0.373
+ }
trainer_state.json ADDED
@@ -0,0 +1,522 @@
+ {
+ "best_metric": 0.2911371886730194,
+ "best_model_checkpoint": "./drive/Shareddrives/CS198-Drones/[v4] Training Output/mobilevit-small_rice-leaf-disease-augmented-v4_fft/checkpoint-1920",
+ "epoch": 15.0,
+ "eval_steps": 64,
+ "global_step": 1920,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.5,
+ "grad_norm": 0.8390832543373108,
+ "learning_rate": 7.5e-06,
+ "loss": 2.0561,
+ "step": 64
+ },
+ {
+ "epoch": 0.5,
+ "eval_accuracy": 0.28859060402684567,
+ "eval_loss": 2.0213236808776855,
+ "eval_runtime": 9.1133,
+ "eval_samples_per_second": 32.7,
+ "eval_steps_per_second": 0.549,
+ "step": 64
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.9097115397453308,
+ "learning_rate": 1.5e-05,
+ "loss": 1.9819,
+ "step": 128
+ },
+ {
+ "epoch": 1.0,
+ "eval_accuracy": 0.5503355704697986,
+ "eval_loss": 1.8788046836853027,
+ "eval_runtime": 9.5999,
+ "eval_samples_per_second": 31.042,
+ "eval_steps_per_second": 0.521,
+ "step": 128
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 1.2205861806869507,
+ "learning_rate": 2.25e-05,
+ "loss": 1.771,
+ "step": 192
+ },
+ {
+ "epoch": 1.5,
+ "eval_accuracy": 0.610738255033557,
+ "eval_loss": 1.5290961265563965,
+ "eval_runtime": 9.5293,
+ "eval_samples_per_second": 31.272,
+ "eval_steps_per_second": 0.525,
+ "step": 192
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 1.496865153312683,
+ "learning_rate": 3e-05,
+ "loss": 1.3911,
+ "step": 256
+ },
+ {
+ "epoch": 2.0,
+ "eval_accuracy": 0.7348993288590604,
+ "eval_loss": 1.0706205368041992,
+ "eval_runtime": 9.6229,
+ "eval_samples_per_second": 30.968,
+ "eval_steps_per_second": 0.52,
+ "step": 256
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 1.845400333404541,
+ "learning_rate": 2.9025243640281226e-05,
+ "loss": 1.0026,
+ "step": 320
+ },
+ {
+ "epoch": 2.5,
+ "eval_accuracy": 0.8053691275167785,
+ "eval_loss": 0.756048321723938,
+ "eval_runtime": 9.6895,
+ "eval_samples_per_second": 30.755,
+ "eval_steps_per_second": 0.516,
+ "step": 320
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 2.1688945293426514,
+ "learning_rate": 2.6227661222566516e-05,
+ "loss": 0.7657,
+ "step": 384
+ },
+ {
+ "epoch": 3.0,
+ "eval_accuracy": 0.8355704697986577,
+ "eval_loss": 0.6180014610290527,
+ "eval_runtime": 9.5752,
+ "eval_samples_per_second": 31.122,
+ "eval_steps_per_second": 0.522,
+ "step": 384
+ },
+ {
+ "epoch": 3.5,
+ "grad_norm": 1.9307341575622559,
+ "learning_rate": 2.197084758065653e-05,
+ "loss": 0.6082,
+ "step": 448
+ },
+ {
+ "epoch": 3.5,
+ "eval_accuracy": 0.8389261744966443,
+ "eval_loss": 0.542212188243866,
+ "eval_runtime": 9.8282,
+ "eval_samples_per_second": 30.321,
+ "eval_steps_per_second": 0.509,
+ "step": 448
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 2.4017300605773926,
+ "learning_rate": 1.6892360241408803e-05,
+ "loss": 0.5313,
+ "step": 512
+ },
+ {
+ "epoch": 4.0,
+ "eval_accuracy": 0.8523489932885906,
+ "eval_loss": 0.494586706161499,
+ "eval_runtime": 9.5005,
+ "eval_samples_per_second": 31.367,
+ "eval_steps_per_second": 0.526,
+ "step": 512
+ },
+ {
+ "epoch": 4.5,
+ "grad_norm": 1.9675699472427368,
+ "learning_rate": 1.149281235767183e-05,
+ "loss": 0.4623,
+ "step": 576
+ },
+ {
+ "epoch": 4.5,
+ "eval_accuracy": 0.8758389261744967,
+ "eval_loss": 0.4512217044830322,
+ "eval_runtime": 9.6531,
+ "eval_samples_per_second": 30.871,
+ "eval_steps_per_second": 0.518,
+ "step": 576
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 2.900230884552002,
+ "learning_rate": 6.549084935146384e-06,
+ "loss": 0.4212,
+ "step": 640
+ },
+ {
+ "epoch": 5.0,
+ "eval_accuracy": 0.8791946308724832,
+ "eval_loss": 0.4321761131286621,
+ "eval_runtime": 9.537,
+ "eval_samples_per_second": 31.247,
+ "eval_steps_per_second": 0.524,
+ "step": 640
+ },
+ {
+ "epoch": 5.5,
+ "grad_norm": 2.782975196838379,
+ "learning_rate": 2.7037019399421785e-06,
+ "loss": 0.4025,
+ "step": 704
+ },
+ {
+ "epoch": 5.5,
+ "eval_accuracy": 0.889261744966443,
+ "eval_loss": 0.42589595913887024,
+ "eval_runtime": 9.6007,
+ "eval_samples_per_second": 31.039,
+ "eval_steps_per_second": 0.521,
+ "step": 704
+ },
+ {
+ "epoch": 6.0,
+ "grad_norm": 2.2286922931671143,
+ "learning_rate": 4.564382427431779e-07,
+ "loss": 0.3892,
+ "step": 768
+ },
+ {
+ "epoch": 6.0,
+ "eval_accuracy": 0.8859060402684564,
+ "eval_loss": 0.4237956702709198,
+ "eval_runtime": 9.608,
+ "eval_samples_per_second": 31.016,
+ "eval_steps_per_second": 0.52,
+ "step": 768
+ },
+ {
+ "epoch": 6.5,
+ "grad_norm": 2.1533002853393555,
+ "learning_rate": 2.9900634879009304e-05,
+ "loss": 0.3959,
+ "step": 832
+ },
+ {
+ "epoch": 6.5,
+ "eval_accuracy": 0.8859060402684564,
+ "eval_loss": 0.40830349922180176,
+ "eval_runtime": 9.7273,
+ "eval_samples_per_second": 30.635,
+ "eval_steps_per_second": 0.514,
+ "step": 832
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 3.03064227104187,
+ "learning_rate": 2.8321109519140222e-05,
+ "loss": 0.3279,
+ "step": 896
+ },
+ {
+ "epoch": 7.0,
+ "eval_accuracy": 0.8825503355704698,
+ "eval_loss": 0.37496834993362427,
+ "eval_runtime": 9.1243,
+ "eval_samples_per_second": 32.66,
+ "eval_steps_per_second": 0.548,
+ "step": 896
+ },
+ {
+ "epoch": 7.5,
+ "grad_norm": 2.8390941619873047,
+ "learning_rate": 2.50102726629655e-05,
+ "loss": 0.2793,
+ "step": 960
+ },
+ {
+ "epoch": 7.5,
+ "eval_accuracy": 0.8993288590604027,
+ "eval_loss": 0.335045725107193,
+ "eval_runtime": 8.9197,
+ "eval_samples_per_second": 33.409,
+ "eval_steps_per_second": 0.561,
+ "step": 960
+ },
+ {
+ "epoch": 8.0,
+ "grad_norm": 2.106311321258545,
+ "learning_rate": 2.0398425548024824e-05,
+ "loss": 0.222,
+ "step": 1024
+ },
+ {
+ "epoch": 8.0,
+ "eval_accuracy": 0.8959731543624161,
+ "eval_loss": 0.3207705318927765,
+ "eval_runtime": 8.4413,
+ "eval_samples_per_second": 35.303,
+ "eval_steps_per_second": 0.592,
+ "step": 1024
+ },
+ {
+ "epoch": 8.5,
+ "grad_norm": 2.5288846492767334,
+ "learning_rate": 1.5084958481696745e-05,
+ "loss": 0.1862,
+ "step": 1088
+ },
+ {
+ "epoch": 8.5,
+ "eval_accuracy": 0.8993288590604027,
+ "eval_loss": 0.31276702880859375,
+ "eval_runtime": 8.8019,
+ "eval_samples_per_second": 33.856,
+ "eval_steps_per_second": 0.568,
+ "step": 1088
+ },
+ {
+ "epoch": 9.0,
+ "grad_norm": 2.468576431274414,
+ "learning_rate": 9.76044957265588e-06,
+ "loss": 0.1717,
+ "step": 1152
+ },
+ {
+ "epoch": 9.0,
+ "eval_accuracy": 0.9026845637583892,
+ "eval_loss": 0.3048604130744934,
+ "eval_runtime": 9.6883,
+ "eval_samples_per_second": 30.759,
+ "eval_steps_per_second": 0.516,
+ "step": 1152
+ },
+ {
+ "epoch": 9.5,
+ "grad_norm": 4.155700206756592,
+ "learning_rate": 5.116912010431121e-06,
+ "loss": 0.1408,
+ "step": 1216
+ },
+ {
+ "epoch": 9.5,
+ "eval_accuracy": 0.9026845637583892,
+ "eval_loss": 0.3009721338748932,
+ "eval_runtime": 9.5695,
+ "eval_samples_per_second": 31.141,
+ "eval_steps_per_second": 0.522,
+ "step": 1216
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 2.336121082305908,
+ "learning_rate": 1.757854831072046e-06,
+ "loss": 0.1507,
+ "step": 1280
+ },
+ {
+ "epoch": 10.0,
+ "eval_accuracy": 0.9161073825503355,
+ "eval_loss": 0.32400238513946533,
+ "eval_runtime": 9.2278,
+ "eval_samples_per_second": 32.294,
+ "eval_steps_per_second": 0.542,
+ "step": 1280
+ },
+ {
+ "epoch": 10.5,
+ "grad_norm": 2.2721362113952637,
+ "learning_rate": 1.198463476772116e-07,
+ "loss": 0.1369,
+ "step": 1344
+ },
+ {
+ "epoch": 10.5,
+ "eval_accuracy": 0.9060402684563759,
+ "eval_loss": 0.3062884509563446,
+ "eval_runtime": 8.635,
+ "eval_samples_per_second": 34.511,
+ "eval_steps_per_second": 0.579,
+ "step": 1344
+ },
+ {
+ "epoch": 11.0,
+ "grad_norm": 2.3275115489959717,
+ "learning_rate": 2.9584225548225062e-05,
+ "loss": 0.1389,
+ "step": 1408
+ },
+ {
+ "epoch": 11.0,
+ "eval_accuracy": 0.9060402684563759,
+ "eval_loss": 0.3044714033603668,
+ "eval_runtime": 8.4938,
+ "eval_samples_per_second": 35.084,
+ "eval_steps_per_second": 0.589,
+ "step": 1408
+ },
+ {
+ "epoch": 11.5,
+ "grad_norm": 2.064596652984619,
+ "learning_rate": 2.7392821896833267e-05,
+ "loss": 0.1199,
+ "step": 1472
+ },
+ {
+ "epoch": 11.5,
+ "eval_accuracy": 0.9093959731543624,
+ "eval_loss": 0.30615922808647156,
+ "eval_runtime": 9.5329,
+ "eval_samples_per_second": 31.26,
+ "eval_steps_per_second": 0.524,
+ "step": 1472
+ },
+ {
+ "epoch": 12.0,
+ "grad_norm": 2.055306911468506,
+ "learning_rate": 2.359075398426809e-05,
+ "loss": 0.1003,
+ "step": 1536
+ },
+ {
+ "epoch": 12.0,
+ "eval_accuracy": 0.912751677852349,
+ "eval_loss": 0.3130672872066498,
+ "eval_runtime": 8.6515,
+ "eval_samples_per_second": 34.445,
+ "eval_steps_per_second": 0.578,
+ "step": 1536
+ },
+ {
+ "epoch": 12.5,
+ "grad_norm": 3.542067289352417,
+ "learning_rate": 1.867216712757697e-05,
+ "loss": 0.0756,
+ "step": 1600
+ },
+ {
+ "epoch": 12.5,
+ "eval_accuracy": 0.9228187919463087,
+ "eval_loss": 0.3002491891384125,
+ "eval_runtime": 8.8978,
+ "eval_samples_per_second": 33.491,
+ "eval_steps_per_second": 0.562,
+ "step": 1600
+ },
+ {
+ "epoch": 13.0,
+ "grad_norm": 1.0521228313446045,
+ "learning_rate": 1.3276317836011714e-05,
+ "loss": 0.0636,
+ "step": 1664
+ },
+ {
+ "epoch": 13.0,
+ "eval_accuracy": 0.912751677852349,
+ "eval_loss": 0.3176516890525818,
+ "eval_runtime": 9.2104,
+ "eval_samples_per_second": 32.355,
+ "eval_steps_per_second": 0.543,
+ "step": 1664
+ },
+ {
+ "epoch": 13.5,
+ "grad_norm": 1.8493869304656982,
+ "learning_rate": 8.104491231310648e-06,
+ "loss": 0.058,
+ "step": 1728
+ },
+ {
+ "epoch": 13.5,
+ "eval_accuracy": 0.9228187919463087,
+ "eval_loss": 0.3142910599708557,
+ "eval_runtime": 9.7948,
+ "eval_samples_per_second": 30.424,
+ "eval_steps_per_second": 0.51,
+ "step": 1728
+ },
+ {
+ "epoch": 14.0,
+ "grad_norm": 1.3355131149291992,
+ "learning_rate": 3.828856763379785e-06,
+ "loss": 0.0566,
+ "step": 1792
+ },
+ {
+ "epoch": 14.0,
+ "eval_accuracy": 0.9194630872483222,
+ "eval_loss": 0.3135838806629181,
+ "eval_runtime": 9.833,
+ "eval_samples_per_second": 30.306,
+ "eval_steps_per_second": 0.508,
+ "step": 1792
+ },
+ {
+ "epoch": 14.5,
+ "grad_norm": 2.9695112705230713,
+ "learning_rate": 1.0051080174789174e-06,
+ "loss": 0.0516,
+ "step": 1856
+ },
+ {
+ "epoch": 14.5,
+ "eval_accuracy": 0.9161073825503355,
+ "eval_loss": 0.34468716382980347,
+ "eval_runtime": 9.6587,
+ "eval_samples_per_second": 30.853,
+ "eval_steps_per_second": 0.518,
+ "step": 1856
+ },
+ {
+ "epoch": 15.0,
+ "grad_norm": 0.7186270952224731,
+ "learning_rate": 2.4060005001991235e-10,
+ "loss": 0.0426,
+ "step": 1920
+ },
+ {
+ "epoch": 15.0,
+ "eval_accuracy": 0.9228187919463087,
+ "eval_loss": 0.2911371886730194,
+ "eval_runtime": 9.7016,
+ "eval_samples_per_second": 30.717,
+ "eval_steps_per_second": 0.515,
+ "step": 1920
+ },
+ {
+ "epoch": 15.0,
+ "step": 1920,
+ "total_flos": 7.164784986292224e+17,
+ "train_loss": 0.48338687382638457,
+ "train_runtime": 5148.0361,
+ "train_samples_per_second": 23.869,
+ "train_steps_per_second": 0.373
+ }
+ ],
+ "logging_steps": 64,
+ "max_steps": 1920,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 15,
+ "save_steps": 64,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 7.164784986292224e+17,
+ "train_batch_size": 64,
+ "trial_name": null,
+ "trial_params": null
+ }
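
The log_history array above alternates training entries (loss, grad_norm, learning_rate) with evaluation entries (eval_* keys) every 64 steps. A small script along these lines, assuming trainer_state.json is read from a local checkout, recovers the best checkpoint and the evaluation curve:

```python
# Hedged sketch: read trainer_state.json and summarize the evaluation history.
# The path is an assumption; point it at the file in this repository.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

print("best eval_loss:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])

# Evaluation entries carry eval_* keys; training entries carry "loss"/"grad_norm".
eval_points = [e for e in state["log_history"] if "eval_loss" in e]
for e in eval_points:
    print(f'epoch {e["epoch"]:>5}: loss {e["eval_loss"]:.4f}, acc {e["eval_accuracy"]:.4f}')
```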
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:46df04962540106071084ba7459a16300b3186544bf3cbb04bce688583d183d4
+ oid sha256:a65bff28c03c6c9e01c24d73c794599da24b606df0220c01aac2771132c075ec
  size 5496
training_metrics.xlsx CHANGED
Binary files a/training_metrics.xlsx and b/training_metrics.xlsx differ