sxyao committed on
Commit
f3c989f
·
verified ·
1 Parent(s): 90db2d8

bugfix in loading and data processing

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd5067b43ff84e6568feb148b9dada8aa0608d516364fb3ae6268dc6a238f160
3
  size 4943274328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a55f92b3ec1d5cc83a8e5e5ebb644b016c89d6672f89611c730144c9d8e23db
3
  size 4943274328
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35b9981cf2af1a882cd5baf8b395d15cc69814898586d1330e4987837cb6ed4e
3
  size 1050673280
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81801955b717ee7af3e03d0af29b78dcf85765b62a0f59765e1425fa418f51bf
3
  size 1050673280
model.safetensors.index.json CHANGED
@@ -3,7 +3,6 @@
3
  "total_size": 5993930752
4
  },
5
  "weight_map": {
6
- "lm_head.weight": "model-00002-of-00002.safetensors",
7
  "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
  "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
  "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
 
3
  "total_size": 5993930752
4
  },
5
  "weight_map": {
 
6
  "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
7
  "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
8
  "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
trainer_state.json CHANGED
@@ -10,208 +10,208 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.10256410256410256,
13
- "grad_norm": 5.980730056762695,
14
  "learning_rate": 2e-05,
15
- "loss": 0.9641,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.20512820512820512,
20
- "grad_norm": 4.265922546386719,
21
  "learning_rate": 1.9936215093023884e-05,
22
- "loss": 0.7271,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.3076923076923077,
27
- "grad_norm": 1.7026264667510986,
28
  "learning_rate": 1.974567407496712e-05,
29
- "loss": 0.5734,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.41025641025641024,
34
- "grad_norm": 1.6455885171890259,
35
  "learning_rate": 1.9430807674052092e-05,
36
- "loss": 0.4955,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.5128205128205128,
41
- "grad_norm": 1.5250736474990845,
42
  "learning_rate": 1.899563263509725e-05,
43
- "loss": 0.4722,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.6153846153846154,
48
- "grad_norm": 1.467634916305542,
49
  "learning_rate": 1.8445700477978207e-05,
50
- "loss": 0.4283,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.717948717948718,
55
- "grad_norm": 1.3555026054382324,
56
  "learning_rate": 1.778802667699196e-05,
57
- "loss": 0.417,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.8205128205128205,
62
- "grad_norm": 1.4207082986831665,
63
  "learning_rate": 1.7031001164581828e-05,
64
- "loss": 0.3969,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.9230769230769231,
69
- "grad_norm": 1.3894375562667847,
70
  "learning_rate": 1.618428130112533e-05,
71
- "loss": 0.3896,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 1.0205128205128204,
76
- "grad_norm": 1.3288708925247192,
77
  "learning_rate": 1.5258668676167548e-05,
78
- "loss": 0.3586,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 1.123076923076923,
83
- "grad_norm": 1.3780906200408936,
84
  "learning_rate": 1.4265971312744252e-05,
85
- "loss": 0.3267,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 1.2256410256410257,
90
- "grad_norm": 1.1098214387893677,
91
  "learning_rate": 1.3218853032651719e-05,
92
- "loss": 0.3194,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 1.3282051282051281,
97
- "grad_norm": 1.3044273853302002,
98
  "learning_rate": 1.2130671904307692e-05,
99
- "loss": 0.3065,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 1.4307692307692308,
104
- "grad_norm": 1.2135035991668701,
105
  "learning_rate": 1.1015309834121083e-05,
106
- "loss": 0.2907,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 1.5333333333333332,
111
- "grad_norm": 1.172560214996338,
112
  "learning_rate": 9.886995475270205e-06,
113
- "loss": 0.299,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 1.6358974358974359,
118
- "grad_norm": 1.200344204902649,
119
  "learning_rate": 8.76012271303888e-06,
120
- "loss": 0.2953,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 1.7384615384615385,
125
- "grad_norm": 1.1982589960098267,
126
  "learning_rate": 7.649067042289681e-06,
127
- "loss": 0.2942,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 1.8410256410256411,
132
- "grad_norm": 1.2551528215408325,
133
  "learning_rate": 6.568002179543409e-06,
134
- "loss": 0.2883,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 1.9435897435897436,
139
- "grad_norm": 1.276993989944458,
140
  "learning_rate": 5.530719249141148e-06,
141
- "loss": 0.2885,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 2.041025641025641,
146
- "grad_norm": 1.176174521446228,
147
  "learning_rate": 4.550450850127626e-06,
148
- "loss": 0.2587,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 2.1435897435897435,
153
- "grad_norm": 1.3641650676727295,
154
  "learning_rate": 3.6397022482313804e-06,
155
- "loss": 0.2231,
156
  "step": 105
157
  },
158
  {
159
  "epoch": 2.246153846153846,
160
- "grad_norm": 1.1558363437652588,
161
  "learning_rate": 2.8100918464225304e-06,
162
- "loss": 0.2215,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 2.348717948717949,
167
- "grad_norm": 1.2226226329803467,
168
  "learning_rate": 2.072202969162234e-06,
169
- "loss": 0.2165,
170
  "step": 115
171
  },
172
  {
173
  "epoch": 2.4512820512820515,
174
- "grad_norm": 1.1804394721984863,
175
  "learning_rate": 1.4354488511294418e-06,
176
- "loss": 0.2089,
177
  "step": 120
178
  },
179
  {
180
  "epoch": 2.5538461538461537,
181
- "grad_norm": 1.253951072692871,
182
  "learning_rate": 9.079525527612321e-07,
183
- "loss": 0.2123,
184
  "step": 125
185
  },
186
  {
187
  "epoch": 2.6564102564102563,
188
- "grad_norm": 1.2056702375411987,
189
  "learning_rate": 4.964433345219354e-07,
190
- "loss": 0.2107,
191
  "step": 130
192
  },
193
  {
194
  "epoch": 2.758974358974359,
195
- "grad_norm": 1.1851439476013184,
196
  "learning_rate": 2.0617081185259512e-07,
197
- "loss": 0.2129,
198
  "step": 135
199
  },
200
  {
201
  "epoch": 2.8615384615384616,
202
- "grad_norm": 1.1076850891113281,
203
  "learning_rate": 4.083798592444899e-08,
204
- "loss": 0.2175,
205
  "step": 140
206
  },
207
  {
208
  "epoch": 2.943589743589744,
209
  "step": 144,
210
  "total_flos": 5.491122506196582e+16,
211
- "train_loss": 0.35015756347113186,
212
- "train_runtime": 1479.6793,
213
- "train_samples_per_second": 6.324,
214
- "train_steps_per_second": 0.097
215
  }
216
  ],
217
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.10256410256410256,
13
+ "grad_norm": 6.330384731292725,
14
  "learning_rate": 2e-05,
15
+ "loss": 0.9917,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.20512820512820512,
20
+ "grad_norm": 2.223167896270752,
21
  "learning_rate": 1.9936215093023884e-05,
22
+ "loss": 0.7004,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.3076923076923077,
27
+ "grad_norm": 1.8093996047973633,
28
  "learning_rate": 1.974567407496712e-05,
29
+ "loss": 0.5531,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.41025641025641024,
34
+ "grad_norm": 1.6542998552322388,
35
  "learning_rate": 1.9430807674052092e-05,
36
+ "loss": 0.4873,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.5128205128205128,
41
+ "grad_norm": 1.5152506828308105,
42
  "learning_rate": 1.899563263509725e-05,
43
+ "loss": 0.4658,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.6153846153846154,
48
+ "grad_norm": 1.466179370880127,
49
  "learning_rate": 1.8445700477978207e-05,
50
+ "loss": 0.4239,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.717948717948718,
55
+ "grad_norm": 1.3292107582092285,
56
  "learning_rate": 1.778802667699196e-05,
57
+ "loss": 0.4138,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.8205128205128205,
62
+ "grad_norm": 1.4440351724624634,
63
  "learning_rate": 1.7031001164581828e-05,
64
+ "loss": 0.3948,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.9230769230769231,
69
+ "grad_norm": 1.3163414001464844,
70
  "learning_rate": 1.618428130112533e-05,
71
+ "loss": 0.3881,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 1.0205128205128204,
76
+ "grad_norm": 1.3107187747955322,
77
  "learning_rate": 1.5258668676167548e-05,
78
+ "loss": 0.3565,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 1.123076923076923,
83
+ "grad_norm": 1.3364475965499878,
84
  "learning_rate": 1.4265971312744252e-05,
85
+ "loss": 0.324,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 1.2256410256410257,
90
+ "grad_norm": 1.1268199682235718,
91
  "learning_rate": 1.3218853032651719e-05,
92
+ "loss": 0.3167,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 1.3282051282051281,
97
+ "grad_norm": 1.2984614372253418,
98
  "learning_rate": 1.2130671904307692e-05,
99
+ "loss": 0.3045,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 1.4307692307692308,
104
+ "grad_norm": 1.2189068794250488,
105
  "learning_rate": 1.1015309834121083e-05,
106
+ "loss": 0.2888,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 1.5333333333333332,
111
+ "grad_norm": 1.208184003829956,
112
  "learning_rate": 9.886995475270205e-06,
113
+ "loss": 0.2973,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 1.6358974358974359,
118
+ "grad_norm": 1.1965588331222534,
119
  "learning_rate": 8.76012271303888e-06,
120
+ "loss": 0.2931,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 1.7384615384615385,
125
+ "grad_norm": 1.1966915130615234,
126
  "learning_rate": 7.649067042289681e-06,
127
+ "loss": 0.2915,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 1.8410256410256411,
132
+ "grad_norm": 1.2417312860488892,
133
  "learning_rate": 6.568002179543409e-06,
134
+ "loss": 0.2858,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 1.9435897435897436,
139
+ "grad_norm": 1.2801405191421509,
140
  "learning_rate": 5.530719249141148e-06,
141
+ "loss": 0.287,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 2.041025641025641,
146
+ "grad_norm": 1.1753469705581665,
147
  "learning_rate": 4.550450850127626e-06,
148
+ "loss": 0.2573,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 2.1435897435897435,
153
+ "grad_norm": 1.3422598838806152,
154
  "learning_rate": 3.6397022482313804e-06,
155
+ "loss": 0.2207,
156
  "step": 105
157
  },
158
  {
159
  "epoch": 2.246153846153846,
160
+ "grad_norm": 1.1634305715560913,
161
  "learning_rate": 2.8100918464225304e-06,
162
+ "loss": 0.2188,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 2.348717948717949,
167
+ "grad_norm": 1.2047914266586304,
168
  "learning_rate": 2.072202969162234e-06,
169
+ "loss": 0.2144,
170
  "step": 115
171
  },
172
  {
173
  "epoch": 2.4512820512820515,
174
+ "grad_norm": 1.1728473901748657,
175
  "learning_rate": 1.4354488511294418e-06,
176
+ "loss": 0.2065,
177
  "step": 120
178
  },
179
  {
180
  "epoch": 2.5538461538461537,
181
+ "grad_norm": 1.2496421337127686,
182
  "learning_rate": 9.079525527612321e-07,
183
+ "loss": 0.2107,
184
  "step": 125
185
  },
186
  {
187
  "epoch": 2.6564102564102563,
188
+ "grad_norm": 1.200203537940979,
189
  "learning_rate": 4.964433345219354e-07,
190
+ "loss": 0.2083,
191
  "step": 130
192
  },
193
  {
194
  "epoch": 2.758974358974359,
195
+ "grad_norm": 1.2024612426757812,
196
  "learning_rate": 2.0617081185259512e-07,
197
+ "loss": 0.2111,
198
  "step": 135
199
  },
200
  {
201
  "epoch": 2.8615384615384616,
202
+ "grad_norm": 1.1126112937927246,
203
  "learning_rate": 4.083798592444899e-08,
204
+ "loss": 0.2157,
205
  "step": 140
206
  },
207
  {
208
  "epoch": 2.943589743589744,
209
  "step": 144,
210
  "total_flos": 5.491122506196582e+16,
211
+ "train_loss": 0.34712929568356937,
212
+ "train_runtime": 1463.7078,
213
+ "train_samples_per_second": 6.393,
214
+ "train_steps_per_second": 0.098
215
  }
216
  ],
217
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ca955ae44219660913cf31e0416e5dada732a8cc00f42076a6739b75ba27b10
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10d62da16287a3bdc5aa2c1ba62da1ca7cc3a2218b5694c72c5959ee56c6cd93
3
  size 5624