augustocsc committed on
Commit
94cb8ed
·
verified ·
1 Parent(s): 7468e7f

Model save

Browse files
Files changed (3) hide show
  1. README.md +310 -187
  2. all_results.json +6 -6
  3. train_results.json +6 -6
README.md CHANGED
@@ -5,18 +5,18 @@ base_model: gpt2
5
  tags:
6
  - generated_from_trainer
7
  model-index:
8
- - name: Se124M100KInfPrompt_WT_EOS
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
- # Se124M100KInfPrompt_WT_EOS
16
 
17
  This model is a fine-tuned version of [gpt2](https://huggingface.co/gpt2) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.7591
20
 
21
  ## Model description
22
 
@@ -44,195 +44,318 @@ The following hyperparameters were used during training:
44
  - optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
45
  - lr_scheduler_type: cosine
46
  - lr_scheduler_warmup_ratio: 0.03
47
- - num_epochs: 3
 
48
 
49
  ### Training results
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:------:|:----:|:---------------:|
53
- | 3.2702 | 0.0164 | 20 | 3.0061 |
54
- | 3.2625 | 0.0327 | 40 | 3.0018 |
55
- | 3.2351 | 0.0491 | 60 | 2.9647 |
56
- | 3.1987 | 0.0655 | 80 | 2.9085 |
57
- | 3.0557 | 0.0819 | 100 | 2.8161 |
58
- | 2.939 | 0.0982 | 120 | 2.6976 |
59
- | 2.8112 | 0.1146 | 140 | 2.5745 |
60
- | 2.6056 | 0.1310 | 160 | 2.4240 |
61
- | 2.4701 | 0.1474 | 180 | 2.2716 |
62
- | 2.3312 | 0.1637 | 200 | 2.1298 |
63
- | 2.1946 | 0.1801 | 220 | 1.9889 |
64
- | 2.0407 | 0.1965 | 240 | 1.8554 |
65
- | 1.9256 | 0.2129 | 260 | 1.7339 |
66
- | 1.803 | 0.2292 | 280 | 1.6261 |
67
- | 1.6977 | 0.2456 | 300 | 1.5231 |
68
- | 1.6354 | 0.2620 | 320 | 1.4378 |
69
- | 1.5545 | 0.2783 | 340 | 1.3478 |
70
- | 1.4609 | 0.2947 | 360 | 1.2741 |
71
- | 1.3616 | 0.3111 | 380 | 1.2038 |
72
- | 1.3137 | 0.3275 | 400 | 1.1477 |
73
- | 1.2682 | 0.3438 | 420 | 1.1003 |
74
- | 1.2262 | 0.3602 | 440 | 1.0645 |
75
- | 1.2021 | 0.3766 | 460 | 1.0357 |
76
- | 1.1659 | 0.3930 | 480 | 1.0129 |
77
- | 1.1355 | 0.4093 | 500 | 0.9911 |
78
- | 1.1085 | 0.4257 | 520 | 0.9732 |
79
- | 1.0979 | 0.4421 | 540 | 0.9571 |
80
- | 1.0775 | 0.4585 | 560 | 0.9420 |
81
- | 1.0659 | 0.4748 | 580 | 0.9295 |
82
- | 1.0356 | 0.4912 | 600 | 0.9158 |
83
- | 1.0184 | 0.5076 | 620 | 0.9056 |
84
- | 0.9979 | 0.5239 | 640 | 0.8965 |
85
- | 0.988 | 0.5403 | 660 | 0.8879 |
86
- | 0.9913 | 0.5567 | 680 | 0.8794 |
87
- | 0.9681 | 0.5731 | 700 | 0.8734 |
88
- | 0.959 | 0.5894 | 720 | 0.8670 |
89
- | 0.9389 | 0.6058 | 740 | 0.8591 |
90
- | 0.9363 | 0.6222 | 760 | 0.8556 |
91
- | 0.9274 | 0.6386 | 780 | 0.8490 |
92
- | 0.9265 | 0.6549 | 800 | 0.8472 |
93
- | 0.9265 | 0.6713 | 820 | 0.8432 |
94
- | 0.9033 | 0.6877 | 840 | 0.8404 |
95
- | 0.912 | 0.7041 | 860 | 0.8372 |
96
- | 0.9067 | 0.7204 | 880 | 0.8337 |
97
- | 0.896 | 0.7368 | 900 | 0.8309 |
98
- | 0.9061 | 0.7532 | 920 | 0.8280 |
99
- | 0.8966 | 0.7695 | 940 | 0.8247 |
100
- | 0.8871 | 0.7859 | 960 | 0.8233 |
101
- | 0.8844 | 0.8023 | 980 | 0.8209 |
102
- | 0.8903 | 0.8187 | 1000 | 0.8188 |
103
- | 0.8938 | 0.8350 | 1020 | 0.8166 |
104
- | 0.884 | 0.8514 | 1040 | 0.8143 |
105
- | 0.8614 | 0.8678 | 1060 | 0.8128 |
106
- | 0.8716 | 0.8842 | 1080 | 0.8100 |
107
- | 0.8719 | 0.9005 | 1100 | 0.8082 |
108
- | 0.8685 | 0.9169 | 1120 | 0.8063 |
109
- | 0.8784 | 0.9333 | 1140 | 0.8049 |
110
- | 0.8534 | 0.9497 | 1160 | 0.8037 |
111
- | 0.8556 | 0.9660 | 1180 | 0.8021 |
112
- | 0.862 | 0.9824 | 1200 | 0.8007 |
113
- | 0.852 | 0.9988 | 1220 | 0.7989 |
114
- | 0.857 | 1.0147 | 1240 | 0.7969 |
115
- | 0.8562 | 1.0311 | 1260 | 0.7957 |
116
- | 0.8508 | 1.0475 | 1280 | 0.7965 |
117
- | 0.845 | 1.0639 | 1300 | 0.7952 |
118
- | 0.839 | 1.0802 | 1320 | 0.7951 |
119
- | 0.8472 | 1.0966 | 1340 | 0.7919 |
120
- | 0.8425 | 1.1130 | 1360 | 0.7899 |
121
- | 0.8285 | 1.1293 | 1380 | 0.7908 |
122
- | 0.8287 | 1.1457 | 1400 | 0.7894 |
123
- | 0.8371 | 1.1621 | 1420 | 0.7880 |
124
- | 0.8281 | 1.1785 | 1440 | 0.7873 |
125
- | 0.8468 | 1.1948 | 1460 | 0.7863 |
126
- | 0.833 | 1.2112 | 1480 | 0.7851 |
127
- | 0.8298 | 1.2276 | 1500 | 0.7854 |
128
- | 0.8321 | 1.2440 | 1520 | 0.7841 |
129
- | 0.823 | 1.2603 | 1540 | 0.7835 |
130
- | 0.828 | 1.2767 | 1560 | 0.7832 |
131
- | 0.8353 | 1.2931 | 1580 | 0.7830 |
132
- | 0.8277 | 1.3095 | 1600 | 0.7814 |
133
- | 0.8172 | 1.3258 | 1620 | 0.7815 |
134
- | 0.8192 | 1.3422 | 1640 | 0.7814 |
135
- | 0.8223 | 1.3586 | 1660 | 0.7800 |
136
- | 0.8272 | 1.3749 | 1680 | 0.7789 |
137
- | 0.8202 | 1.3913 | 1700 | 0.7785 |
138
- | 0.8185 | 1.4077 | 1720 | 0.7769 |
139
- | 0.8325 | 1.4241 | 1740 | 0.7778 |
140
- | 0.8128 | 1.4404 | 1760 | 0.7760 |
141
- | 0.8153 | 1.4568 | 1780 | 0.7767 |
142
- | 0.8212 | 1.4732 | 1800 | 0.7772 |
143
- | 0.8049 | 1.4896 | 1820 | 0.7753 |
144
- | 0.825 | 1.5059 | 1840 | 0.7744 |
145
- | 0.8175 | 1.5223 | 1860 | 0.7746 |
146
- | 0.8218 | 1.5387 | 1880 | 0.7731 |
147
- | 0.814 | 1.5551 | 1900 | 0.7730 |
148
- | 0.8172 | 1.5714 | 1920 | 0.7723 |
149
- | 0.8305 | 1.5878 | 1940 | 0.7717 |
150
- | 0.8145 | 1.6042 | 1960 | 0.7728 |
151
- | 0.7996 | 1.6205 | 1980 | 0.7710 |
152
- | 0.809 | 1.6369 | 2000 | 0.7705 |
153
- | 0.8013 | 1.6533 | 2020 | 0.7707 |
154
- | 0.8106 | 1.6697 | 2040 | 0.7680 |
155
- | 0.8235 | 1.6860 | 2060 | 0.7695 |
156
- | 0.8075 | 1.7024 | 2080 | 0.7692 |
157
- | 0.8098 | 1.7188 | 2100 | 0.7679 |
158
- | 0.8036 | 1.7352 | 2120 | 0.7690 |
159
- | 0.8119 | 1.7515 | 2140 | 0.7664 |
160
- | 0.8111 | 1.7679 | 2160 | 0.7672 |
161
- | 0.8052 | 1.7843 | 2180 | 0.7670 |
162
- | 0.8084 | 1.8007 | 2200 | 0.7673 |
163
- | 0.8102 | 1.8170 | 2220 | 0.7675 |
164
- | 0.8099 | 1.8334 | 2240 | 0.7665 |
165
- | 0.7982 | 1.8498 | 2260 | 0.7660 |
166
- | 0.8052 | 1.8661 | 2280 | 0.7652 |
167
- | 0.8096 | 1.8825 | 2300 | 0.7646 |
168
- | 0.8099 | 1.8989 | 2320 | 0.7659 |
169
- | 0.7973 | 1.9153 | 2340 | 0.7661 |
170
- | 0.817 | 1.9316 | 2360 | 0.7650 |
171
- | 0.7987 | 1.9480 | 2380 | 0.7646 |
172
- | 0.8112 | 1.9644 | 2400 | 0.7630 |
173
- | 0.8051 | 1.9808 | 2420 | 0.7635 |
174
- | 0.797 | 1.9971 | 2440 | 0.7635 |
175
- | 0.7871 | 2.0131 | 2460 | 0.7629 |
176
- | 0.8124 | 2.0295 | 2480 | 0.7633 |
177
- | 0.7969 | 2.0458 | 2500 | 0.7642 |
178
- | 0.7906 | 2.0622 | 2520 | 0.7634 |
179
- | 0.8032 | 2.0786 | 2540 | 0.7632 |
180
- | 0.806 | 2.0950 | 2560 | 0.7625 |
181
- | 0.8019 | 2.1113 | 2580 | 0.7628 |
182
- | 0.8017 | 2.1277 | 2600 | 0.7614 |
183
- | 0.8106 | 2.1441 | 2620 | 0.7623 |
184
- | 0.7996 | 2.1605 | 2640 | 0.7622 |
185
- | 0.8038 | 2.1768 | 2660 | 0.7621 |
186
- | 0.8036 | 2.1932 | 2680 | 0.7607 |
187
- | 0.8032 | 2.2096 | 2700 | 0.7618 |
188
- | 0.797 | 2.2260 | 2720 | 0.7608 |
189
- | 0.8031 | 2.2423 | 2740 | 0.7607 |
190
- | 0.8004 | 2.2587 | 2760 | 0.7606 |
191
- | 0.8043 | 2.2751 | 2780 | 0.7604 |
192
- | 0.7971 | 2.2914 | 2800 | 0.7614 |
193
- | 0.7982 | 2.3078 | 2820 | 0.7607 |
194
- | 0.7972 | 2.3242 | 2840 | 0.7601 |
195
- | 0.8071 | 2.3406 | 2860 | 0.7609 |
196
- | 0.8094 | 2.3569 | 2880 | 0.7605 |
197
- | 0.7994 | 2.3733 | 2900 | 0.7600 |
198
- | 0.7918 | 2.3897 | 2920 | 0.7596 |
199
- | 0.7978 | 2.4061 | 2940 | 0.7597 |
200
- | 0.8026 | 2.4224 | 2960 | 0.7599 |
201
- | 0.7981 | 2.4388 | 2980 | 0.7593 |
202
- | 0.7957 | 2.4552 | 3000 | 0.7600 |
203
- | 0.8089 | 2.4716 | 3020 | 0.7601 |
204
- | 0.801 | 2.4879 | 3040 | 0.7598 |
205
- | 0.8044 | 2.5043 | 3060 | 0.7592 |
206
- | 0.7967 | 2.5207 | 3080 | 0.7591 |
207
- | 0.803 | 2.5370 | 3100 | 0.7589 |
208
- | 0.8006 | 2.5534 | 3120 | 0.7591 |
209
- | 0.7933 | 2.5698 | 3140 | 0.7599 |
210
- | 0.7954 | 2.5862 | 3160 | 0.7590 |
211
- | 0.7996 | 2.6025 | 3180 | 0.7592 |
212
- | 0.7931 | 2.6189 | 3200 | 0.7597 |
213
- | 0.8034 | 2.6353 | 3220 | 0.7595 |
214
- | 0.7954 | 2.6517 | 3240 | 0.7606 |
215
- | 0.801 | 2.6680 | 3260 | 0.7592 |
216
- | 0.7973 | 2.6844 | 3280 | 0.7595 |
217
- | 0.7976 | 2.7008 | 3300 | 0.7601 |
218
- | 0.8014 | 2.7172 | 3320 | 0.7606 |
219
- | 0.8052 | 2.7335 | 3340 | 0.7590 |
220
- | 0.7898 | 2.7499 | 3360 | 0.7609 |
221
- | 0.794 | 2.7663 | 3380 | 0.7598 |
222
- | 0.7984 | 2.7826 | 3400 | 0.7609 |
223
- | 0.7905 | 2.7990 | 3420 | 0.7587 |
224
- | 0.7994 | 2.8154 | 3440 | 0.7595 |
225
- | 0.8083 | 2.8318 | 3460 | 0.7588 |
226
- | 0.7979 | 2.8481 | 3480 | 0.7585 |
227
- | 0.7947 | 2.8645 | 3500 | 0.7596 |
228
- | 0.8081 | 2.8809 | 3520 | 0.7592 |
229
- | 0.801 | 2.8973 | 3540 | 0.7615 |
230
- | 0.7959 | 2.9136 | 3560 | 0.7590 |
231
- | 0.8081 | 2.9300 | 3580 | 0.7594 |
232
- | 0.7927 | 2.9464 | 3600 | 0.7589 |
233
- | 0.8006 | 2.9628 | 3620 | 0.7594 |
234
- | 0.7933 | 2.9791 | 3640 | 0.7591 |
235
- | 0.7896 | 2.9955 | 3660 | 0.7591 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
 
238
  ### Framework versions
 
5
  tags:
6
  - generated_from_trainer
7
  model-index:
8
+ - name: Se124M100KInfPrompt_WT_EOS_Label_Smooth
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
+ # Se124M100KInfPrompt_WT_EOS_Label_Smooth
16
 
17
  This model is a fine-tuned version of [gpt2](https://huggingface.co/gpt2) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 2.1617
20
 
21
  ## Model description
22
 
 
44
  - optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
45
  - lr_scheduler_type: cosine
46
  - lr_scheduler_warmup_ratio: 0.03
47
+ - num_epochs: 5
48
+ - label_smoothing_factor: 0.1
49
 
50
  ### Training results
51
 
52
  | Training Loss | Epoch | Step | Validation Loss |
53
  |:-------------:|:------:|:----:|:---------------:|
54
+ | 19.0108 | 0.0164 | 20 | 4.5546 |
55
+ | 18.9846 | 0.0327 | 40 | 4.5496 |
56
+ | 18.9358 | 0.0491 | 60 | 4.5317 |
57
+ | 18.9041 | 0.0655 | 80 | 4.5064 |
58
+ | 18.5139 | 0.0819 | 100 | 4.4510 |
59
+ | 18.2552 | 0.0982 | 120 | 4.3778 |
60
+ | 17.9591 | 0.1146 | 140 | 4.2887 |
61
+ | 17.322 | 0.1310 | 160 | 4.1740 |
62
+ | 16.8904 | 0.1474 | 180 | 4.0367 |
63
+ | 16.3008 | 0.1637 | 200 | 3.8727 |
64
+ | 15.7206 | 0.1801 | 220 | 3.7338 |
65
+ | 15.0629 | 0.1965 | 240 | 3.5839 |
66
+ | 14.5732 | 0.2129 | 260 | 3.4582 |
67
+ | 14.0294 | 0.2292 | 280 | 3.3334 |
68
+ | 13.5401 | 0.2456 | 300 | 3.2152 |
69
+ | 13.2086 | 0.2620 | 320 | 3.1111 |
70
+ | 12.8368 | 0.2783 | 340 | 3.0145 |
71
+ | 12.4193 | 0.2947 | 360 | 2.9264 |
72
+ | 11.9977 | 0.3111 | 380 | 2.8435 |
73
+ | 11.7412 | 0.3275 | 400 | 2.7679 |
74
+ | 11.5111 | 0.3438 | 420 | 2.6997 |
75
+ | 11.2634 | 0.3602 | 440 | 2.6409 |
76
+ | 11.0944 | 0.3766 | 460 | 2.5882 |
77
+ | 10.8847 | 0.3930 | 480 | 2.5459 |
78
+ | 10.6994 | 0.4093 | 500 | 2.5115 |
79
+ | 10.5561 | 0.4257 | 520 | 2.4840 |
80
+ | 10.4584 | 0.4421 | 540 | 2.4625 |
81
+ | 10.3285 | 0.4585 | 560 | 2.4426 |
82
+ | 10.2707 | 0.4748 | 580 | 2.4283 |
83
+ | 10.124 | 0.4912 | 600 | 2.4124 |
84
+ | 10.0467 | 0.5076 | 620 | 2.3997 |
85
+ | 9.955 | 0.5239 | 640 | 2.3867 |
86
+ | 9.893 | 0.5403 | 660 | 2.3739 |
87
+ | 9.8885 | 0.5567 | 680 | 2.3627 |
88
+ | 9.8025 | 0.5731 | 700 | 2.3534 |
89
+ | 9.7378 | 0.5894 | 720 | 2.3436 |
90
+ | 9.6593 | 0.6058 | 740 | 2.3343 |
91
+ | 9.6328 | 0.6222 | 760 | 2.3285 |
92
+ | 9.585 | 0.6386 | 780 | 2.3200 |
93
+ | 9.5782 | 0.6549 | 800 | 2.3149 |
94
+ | 9.5723 | 0.6713 | 820 | 2.3079 |
95
+ | 9.4824 | 0.6877 | 840 | 2.3040 |
96
+ | 9.4909 | 0.7041 | 860 | 2.2979 |
97
+ | 9.4709 | 0.7204 | 880 | 2.2943 |
98
+ | 9.4306 | 0.7368 | 900 | 2.2877 |
99
+ | 9.4688 | 0.7532 | 920 | 2.2841 |
100
+ | 9.4184 | 0.7695 | 940 | 2.2805 |
101
+ | 9.3729 | 0.7859 | 960 | 2.2780 |
102
+ | 9.3634 | 0.8023 | 980 | 2.2768 |
103
+ | 9.3779 | 0.8187 | 1000 | 2.2718 |
104
+ | 9.3945 | 0.8350 | 1020 | 2.2683 |
105
+ | 9.3539 | 0.8514 | 1040 | 2.2668 |
106
+ | 9.2679 | 0.8678 | 1060 | 2.2614 |
107
+ | 9.2974 | 0.8842 | 1080 | 2.2592 |
108
+ | 9.2907 | 0.9005 | 1100 | 2.2591 |
109
+ | 9.2787 | 0.9169 | 1120 | 2.2559 |
110
+ | 9.3063 | 0.9333 | 1140 | 2.2561 |
111
+ | 9.2133 | 0.9497 | 1160 | 2.2538 |
112
+ | 9.2134 | 0.9660 | 1180 | 2.2497 |
113
+ | 9.2464 | 0.9824 | 1200 | 2.2472 |
114
+ | 9.1947 | 0.9988 | 1220 | 2.2476 |
115
+ | 8.9748 | 1.0147 | 1240 | 2.2448 |
116
+ | 9.209 | 1.0311 | 1260 | 2.2441 |
117
+ | 9.1883 | 1.0475 | 1280 | 2.2431 |
118
+ | 9.1606 | 1.0639 | 1300 | 2.2415 |
119
+ | 9.1384 | 1.0802 | 1320 | 2.2379 |
120
+ | 9.1674 | 1.0966 | 1340 | 2.2368 |
121
+ | 9.144 | 1.1130 | 1360 | 2.2347 |
122
+ | 9.095 | 1.1293 | 1380 | 2.2343 |
123
+ | 9.0968 | 1.1457 | 1400 | 2.2331 |
124
+ | 9.1172 | 1.1621 | 1420 | 2.2303 |
125
+ | 9.084 | 1.1785 | 1440 | 2.2304 |
126
+ | 9.1447 | 1.1948 | 1460 | 2.2274 |
127
+ | 9.0922 | 1.2112 | 1480 | 2.2261 |
128
+ | 9.0842 | 1.2276 | 1500 | 2.2258 |
129
+ | 9.0883 | 1.2440 | 1520 | 2.2258 |
130
+ | 9.0633 | 1.2603 | 1540 | 2.2256 |
131
+ | 9.065 | 1.2767 | 1560 | 2.2211 |
132
+ | 9.0941 | 1.2931 | 1580 | 2.2225 |
133
+ | 9.071 | 1.3095 | 1600 | 2.2219 |
134
+ | 9.0327 | 1.3258 | 1620 | 2.2205 |
135
+ | 9.0414 | 1.3422 | 1640 | 2.2183 |
136
+ | 9.0465 | 1.3586 | 1660 | 2.2171 |
137
+ | 9.0607 | 1.3749 | 1680 | 2.2179 |
138
+ | 9.0418 | 1.3913 | 1700 | 2.2150 |
139
+ | 9.0179 | 1.4077 | 1720 | 2.2146 |
140
+ | 9.0646 | 1.4241 | 1740 | 2.2160 |
141
+ | 8.9973 | 1.4404 | 1760 | 2.2140 |
142
+ | 9.0051 | 1.4568 | 1780 | 2.2124 |
143
+ | 9.023 | 1.4732 | 1800 | 2.2136 |
144
+ | 8.9645 | 1.4896 | 1820 | 2.2082 |
145
+ | 9.0326 | 1.5059 | 1840 | 2.2099 |
146
+ | 9.009 | 1.5223 | 1860 | 2.2081 |
147
+ | 9.0208 | 1.5387 | 1880 | 2.2100 |
148
+ | 8.9923 | 1.5551 | 1900 | 2.2074 |
149
+ | 9.0036 | 1.5714 | 1920 | 2.2067 |
150
+ | 9.0372 | 1.5878 | 1940 | 2.2050 |
151
+ | 8.9954 | 1.6042 | 1960 | 2.2058 |
152
+ | 8.9362 | 1.6205 | 1980 | 2.2063 |
153
+ | 8.9638 | 1.6369 | 2000 | 2.2049 |
154
+ | 8.9343 | 1.6533 | 2020 | 2.2022 |
155
+ | 8.9586 | 1.6697 | 2040 | 2.2018 |
156
+ | 9.0058 | 1.6860 | 2060 | 2.2022 |
157
+ | 8.9595 | 1.7024 | 2080 | 2.2002 |
158
+ | 8.9547 | 1.7188 | 2100 | 2.1979 |
159
+ | 8.9423 | 1.7352 | 2120 | 2.1992 |
160
+ | 8.9637 | 1.7515 | 2140 | 2.1980 |
161
+ | 8.9599 | 1.7679 | 2160 | 2.1984 |
162
+ | 8.9396 | 1.7843 | 2180 | 2.1964 |
163
+ | 8.9515 | 1.8007 | 2200 | 2.1947 |
164
+ | 8.9479 | 1.8170 | 2220 | 2.1962 |
165
+ | 8.9487 | 1.8334 | 2240 | 2.1938 |
166
+ | 8.9059 | 1.8498 | 2260 | 2.1944 |
167
+ | 8.9323 | 1.8661 | 2280 | 2.1948 |
168
+ | 8.9462 | 1.8825 | 2300 | 2.1946 |
169
+ | 8.9453 | 1.8989 | 2320 | 2.1931 |
170
+ | 8.8958 | 1.9153 | 2340 | 2.1918 |
171
+ | 8.9608 | 1.9316 | 2360 | 2.1924 |
172
+ | 8.8996 | 1.9480 | 2380 | 2.1898 |
173
+ | 8.9414 | 1.9644 | 2400 | 2.1890 |
174
+ | 8.9095 | 1.9808 | 2420 | 2.1883 |
175
+ | 8.8899 | 1.9971 | 2440 | 2.1882 |
176
+ | 8.645 | 2.0131 | 2460 | 2.1876 |
177
+ | 8.9385 | 2.0295 | 2480 | 2.1892 |
178
+ | 8.8846 | 2.0458 | 2500 | 2.1876 |
179
+ | 8.8618 | 2.0622 | 2520 | 2.1868 |
180
+ | 8.9023 | 2.0786 | 2540 | 2.1857 |
181
+ | 8.9133 | 2.0950 | 2560 | 2.1849 |
182
+ | 8.9055 | 2.1113 | 2580 | 2.1854 |
183
+ | 8.891 | 2.1277 | 2600 | 2.1843 |
184
+ | 8.9237 | 2.1441 | 2620 | 2.1849 |
185
+ | 8.887 | 2.1605 | 2640 | 2.1835 |
186
+ | 8.9018 | 2.1768 | 2660 | 2.1825 |
187
+ | 8.9009 | 2.1932 | 2680 | 2.1831 |
188
+ | 8.8959 | 2.2096 | 2700 | 2.1842 |
189
+ | 8.8711 | 2.2260 | 2720 | 2.1823 |
190
+ | 8.891 | 2.2423 | 2740 | 2.1811 |
191
+ | 8.8813 | 2.2587 | 2760 | 2.1805 |
192
+ | 8.8852 | 2.2751 | 2780 | 2.1817 |
193
+ | 8.8702 | 2.2914 | 2800 | 2.1816 |
194
+ | 8.8775 | 2.3078 | 2820 | 2.1779 |
195
+ | 8.867 | 2.3242 | 2840 | 2.1802 |
196
+ | 8.8928 | 2.3406 | 2860 | 2.1781 |
197
+ | 8.9039 | 2.3569 | 2880 | 2.1784 |
198
+ | 8.8728 | 2.3733 | 2900 | 2.1798 |
199
+ | 8.8428 | 2.3897 | 2920 | 2.1774 |
200
+ | 8.8585 | 2.4061 | 2940 | 2.1786 |
201
+ | 8.879 | 2.4224 | 2960 | 2.1765 |
202
+ | 8.8633 | 2.4388 | 2980 | 2.1768 |
203
+ | 8.8498 | 2.4552 | 3000 | 2.1766 |
204
+ | 8.8998 | 2.4716 | 3020 | 2.1757 |
205
+ | 8.8642 | 2.4879 | 3040 | 2.1746 |
206
+ | 8.8752 | 2.5043 | 3060 | 2.1777 |
207
+ | 8.8417 | 2.5207 | 3080 | 2.1765 |
208
+ | 8.8695 | 2.5370 | 3100 | 2.1772 |
209
+ | 8.8683 | 2.5534 | 3120 | 2.1771 |
210
+ | 8.8323 | 2.5698 | 3140 | 2.1767 |
211
+ | 8.8448 | 2.5862 | 3160 | 2.1769 |
212
+ | 8.8549 | 2.6025 | 3180 | 2.1762 |
213
+ | 8.8315 | 2.6189 | 3200 | 2.1728 |
214
+ | 8.8652 | 2.6353 | 3220 | 2.1766 |
215
+ | 8.8402 | 2.6517 | 3240 | 2.1766 |
216
+ | 8.8491 | 2.6680 | 3260 | 2.1740 |
217
+ | 8.8438 | 2.6844 | 3280 | 2.1761 |
218
+ | 8.8378 | 2.7008 | 3300 | 2.1749 |
219
+ | 8.8587 | 2.7172 | 3320 | 2.1758 |
220
+ | 8.8655 | 2.7335 | 3340 | 2.1738 |
221
+ | 8.8079 | 2.7499 | 3360 | 2.1741 |
222
+ | 8.8234 | 2.7663 | 3380 | 2.1734 |
223
+ | 8.8389 | 2.7826 | 3400 | 2.1737 |
224
+ | 8.8085 | 2.7990 | 3420 | 2.1727 |
225
+ | 8.8397 | 2.8154 | 3440 | 2.1716 |
226
+ | 8.8679 | 2.8318 | 3460 | 2.1725 |
227
+ | 8.8381 | 2.8481 | 3480 | 2.1711 |
228
+ | 8.8267 | 2.8645 | 3500 | 2.1731 |
229
+ | 8.8671 | 2.8809 | 3520 | 2.1710 |
230
+ | 8.8439 | 2.8973 | 3540 | 2.1707 |
231
+ | 8.8276 | 2.9136 | 3560 | 2.1715 |
232
+ | 8.8624 | 2.9300 | 3580 | 2.1717 |
233
+ | 8.8096 | 2.9464 | 3600 | 2.1719 |
234
+ | 8.8429 | 2.9628 | 3620 | 2.1711 |
235
+ | 8.8152 | 2.9791 | 3640 | 2.1719 |
236
+ | 8.7951 | 2.9955 | 3660 | 2.1718 |
237
+ | 8.6203 | 3.0115 | 3680 | 2.1699 |
238
+ | 8.8258 | 3.0278 | 3700 | 2.1702 |
239
+ | 8.8285 | 3.0442 | 3720 | 2.1690 |
240
+ | 8.8504 | 3.0606 | 3740 | 2.1696 |
241
+ | 8.8282 | 3.0770 | 3760 | 2.1683 |
242
+ | 8.8457 | 3.0933 | 3780 | 2.1687 |
243
+ | 8.8096 | 3.1097 | 3800 | 2.1696 |
244
+ | 8.8035 | 3.1261 | 3820 | 2.1692 |
245
+ | 8.8099 | 3.1424 | 3840 | 2.1695 |
246
+ | 8.7912 | 3.1588 | 3860 | 2.1690 |
247
+ | 8.8371 | 3.1752 | 3880 | 2.1675 |
248
+ | 8.8418 | 3.1916 | 3900 | 2.1696 |
249
+ | 8.821 | 3.2079 | 3920 | 2.1685 |
250
+ | 8.7993 | 3.2243 | 3940 | 2.1673 |
251
+ | 8.7873 | 3.2407 | 3960 | 2.1680 |
252
+ | 8.7995 | 3.2571 | 3980 | 2.1672 |
253
+ | 8.7745 | 3.2734 | 4000 | 2.1669 |
254
+ | 8.8271 | 3.2898 | 4020 | 2.1682 |
255
+ | 8.8021 | 3.3062 | 4040 | 2.1670 |
256
+ | 8.8327 | 3.3226 | 4060 | 2.1669 |
257
+ | 8.8031 | 3.3389 | 4080 | 2.1676 |
258
+ | 8.7912 | 3.3553 | 4100 | 2.1670 |
259
+ | 8.8087 | 3.3717 | 4120 | 2.1669 |
260
+ | 8.8377 | 3.3880 | 4140 | 2.1677 |
261
+ | 8.8045 | 3.4044 | 4160 | 2.1674 |
262
+ | 8.7921 | 3.4208 | 4180 | 2.1663 |
263
+ | 8.8128 | 3.4372 | 4200 | 2.1670 |
264
+ | 8.8479 | 3.4535 | 4220 | 2.1668 |
265
+ | 8.8072 | 3.4699 | 4240 | 2.1668 |
266
+ | 8.7718 | 3.4863 | 4260 | 2.1665 |
267
+ | 8.8012 | 3.5027 | 4280 | 2.1666 |
268
+ | 8.809 | 3.5190 | 4300 | 2.1666 |
269
+ | 8.8306 | 3.5354 | 4320 | 2.1653 |
270
+ | 8.8264 | 3.5518 | 4340 | 2.1654 |
271
+ | 8.8202 | 3.5682 | 4360 | 2.1651 |
272
+ | 8.793 | 3.5845 | 4380 | 2.1643 |
273
+ | 8.8171 | 3.6009 | 4400 | 2.1647 |
274
+ | 8.8277 | 3.6173 | 4420 | 2.1643 |
275
+ | 8.8055 | 3.6336 | 4440 | 2.1650 |
276
+ | 8.7796 | 3.6500 | 4460 | 2.1651 |
277
+ | 8.8176 | 3.6664 | 4480 | 2.1645 |
278
+ | 8.7721 | 3.6828 | 4500 | 2.1651 |
279
+ | 8.7966 | 3.6991 | 4520 | 2.1649 |
280
+ | 8.841 | 3.7155 | 4540 | 2.1649 |
281
+ | 8.8044 | 3.7319 | 4560 | 2.1641 |
282
+ | 8.7891 | 3.7483 | 4580 | 2.1638 |
283
+ | 8.7594 | 3.7646 | 4600 | 2.1639 |
284
+ | 8.7963 | 3.7810 | 4620 | 2.1636 |
285
+ | 8.8074 | 3.7974 | 4640 | 2.1638 |
286
+ | 8.8025 | 3.8138 | 4660 | 2.1641 |
287
+ | 8.8361 | 3.8301 | 4680 | 2.1635 |
288
+ | 8.8129 | 3.8465 | 4700 | 2.1641 |
289
+ | 8.7971 | 3.8629 | 4720 | 2.1642 |
290
+ | 8.8033 | 3.8792 | 4740 | 2.1639 |
291
+ | 8.78 | 3.8956 | 4760 | 2.1637 |
292
+ | 8.8012 | 3.9120 | 4780 | 2.1638 |
293
+ | 8.8109 | 3.9284 | 4800 | 2.1633 |
294
+ | 8.8402 | 3.9447 | 4820 | 2.1634 |
295
+ | 8.7862 | 3.9611 | 4840 | 2.1634 |
296
+ | 8.8204 | 3.9775 | 4860 | 2.1630 |
297
+ | 8.8033 | 3.9939 | 4880 | 2.1637 |
298
+ | 8.5571 | 4.0098 | 4900 | 2.1625 |
299
+ | 8.8107 | 4.0262 | 4920 | 2.1628 |
300
+ | 8.8381 | 4.0426 | 4940 | 2.1637 |
301
+ | 8.7981 | 4.0589 | 4960 | 2.1626 |
302
+ | 8.7817 | 4.0753 | 4980 | 2.1626 |
303
+ | 8.7938 | 4.0917 | 5000 | 2.1635 |
304
+ | 8.8026 | 4.1081 | 5020 | 2.1638 |
305
+ | 8.7924 | 4.1244 | 5040 | 2.1622 |
306
+ | 8.8206 | 4.1408 | 5060 | 2.1629 |
307
+ | 8.7942 | 4.1572 | 5080 | 2.1633 |
308
+ | 8.7939 | 4.1736 | 5100 | 2.1627 |
309
+ | 8.8211 | 4.1899 | 5120 | 2.1622 |
310
+ | 8.7513 | 4.2063 | 5140 | 2.1630 |
311
+ | 8.79 | 4.2227 | 5160 | 2.1635 |
312
+ | 8.8063 | 4.2391 | 5180 | 2.1631 |
313
+ | 8.8049 | 4.2554 | 5200 | 2.1626 |
314
+ | 8.8196 | 4.2718 | 5220 | 2.1627 |
315
+ | 8.8215 | 4.2882 | 5240 | 2.1631 |
316
+ | 8.798 | 4.3045 | 5260 | 2.1634 |
317
+ | 8.7946 | 4.3209 | 5280 | 2.1621 |
318
+ | 8.7797 | 4.3373 | 5300 | 2.1619 |
319
+ | 8.8163 | 4.3537 | 5320 | 2.1627 |
320
+ | 8.7569 | 4.3700 | 5340 | 2.1621 |
321
+ | 8.7671 | 4.3864 | 5360 | 2.1629 |
322
+ | 8.7883 | 4.4028 | 5380 | 2.1628 |
323
+ | 8.7788 | 4.4192 | 5400 | 2.1628 |
324
+ | 8.7826 | 4.4355 | 5420 | 2.1621 |
325
+ | 8.7884 | 4.4519 | 5440 | 2.1622 |
326
+ | 8.8011 | 4.4683 | 5460 | 2.1628 |
327
+ | 8.796 | 4.4847 | 5480 | 2.1629 |
328
+ | 8.7943 | 4.5010 | 5500 | 2.1627 |
329
+ | 8.8184 | 4.5174 | 5520 | 2.1618 |
330
+ | 8.7747 | 4.5338 | 5540 | 2.1629 |
331
+ | 8.784 | 4.5501 | 5560 | 2.1630 |
332
+ | 8.8176 | 4.5665 | 5580 | 2.1628 |
333
+ | 8.8134 | 4.5829 | 5600 | 2.1624 |
334
+ | 8.7711 | 4.5993 | 5620 | 2.1629 |
335
+ | 8.7939 | 4.6156 | 5640 | 2.1631 |
336
+ | 8.8057 | 4.6320 | 5660 | 2.1631 |
337
+ | 8.8042 | 4.6484 | 5680 | 2.1623 |
338
+ | 8.8248 | 4.6648 | 5700 | 2.1624 |
339
+ | 8.7954 | 4.6811 | 5720 | 2.1626 |
340
+ | 8.7767 | 4.6975 | 5740 | 2.1622 |
341
+ | 8.7603 | 4.7139 | 5760 | 2.1631 |
342
+ | 8.8185 | 4.7302 | 5780 | 2.1632 |
343
+ | 8.7975 | 4.7466 | 5800 | 2.1629 |
344
+ | 8.7933 | 4.7630 | 5820 | 2.1627 |
345
+ | 8.7949 | 4.7794 | 5840 | 2.1627 |
346
+ | 8.7701 | 4.7957 | 5860 | 2.1629 |
347
+ | 8.7875 | 4.8121 | 5880 | 2.1628 |
348
+ | 8.7731 | 4.8285 | 5900 | 2.1627 |
349
+ | 8.8287 | 4.8449 | 5920 | 2.1629 |
350
+ | 8.7871 | 4.8612 | 5940 | 2.1626 |
351
+ | 8.7655 | 4.8776 | 5960 | 2.1627 |
352
+ | 8.7744 | 4.8940 | 5980 | 2.1629 |
353
+ | 8.764 | 4.9104 | 6000 | 2.1618 |
354
+ | 8.8085 | 4.9267 | 6020 | 2.1627 |
355
+ | 8.7985 | 4.9431 | 6040 | 2.1629 |
356
+ | 8.8205 | 4.9595 | 6060 | 2.1631 |
357
+ | 8.866 | 4.9758 | 6080 | 2.1619 |
358
+ | 8.785 | 4.9922 | 6100 | 2.1617 |
359
 
360
 
361
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 2.997953336062219,
3
  "eval_loss": 0.759117603302002,
4
  "eval_runtime": 31.7884,
5
  "eval_samples_per_second": 526.953,
6
  "eval_steps_per_second": 32.937,
7
  "perplexity": 2.136390244832166,
8
- "total_flos": 1.415185036075008e+16,
9
- "train_loss": 1.0084684829779607,
10
- "train_runtime": 6791.37,
11
- "train_samples_per_second": 34.531,
12
- "train_steps_per_second": 0.539
13
  }
 
1
  {
2
+ "epoch": 4.996316004911994,
3
  "eval_loss": 0.759117603302002,
4
  "eval_runtime": 31.7884,
5
  "eval_samples_per_second": 526.953,
6
  "eval_steps_per_second": 32.937,
7
  "perplexity": 2.136390244832166,
8
+ "total_flos": 2.358780336672768e+16,
9
+ "train_loss": 9.447584004445119,
10
+ "train_runtime": 9280.2532,
11
+ "train_samples_per_second": 42.117,
12
+ "train_steps_per_second": 0.658
13
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.997953336062219,
3
- "total_flos": 1.415185036075008e+16,
4
- "train_loss": 1.0084684829779607,
5
- "train_runtime": 6791.37,
6
- "train_samples_per_second": 34.531,
7
- "train_steps_per_second": 0.539
8
  }
 
1
  {
2
+ "epoch": 4.996316004911994,
3
+ "total_flos": 2.358780336672768e+16,
4
+ "train_loss": 9.447584004445119,
5
+ "train_runtime": 9280.2532,
6
+ "train_samples_per_second": 42.117,
7
+ "train_steps_per_second": 0.658
8
  }