Training in progress, step 10345, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2384234968
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:70673c4b3a23c350c3d34964ccf5e6d071a142d9fed0284db5f78da9fe543e09
|
3 |
size 2384234968
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4768662910
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:592af7aba28b8c635fdc8123da87f1a25c80f7f6ef5d578a1399d7cbe6c53fdd
|
3 |
size 4768662910
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9139898b21e9040c38ef91d3deb1fcf3a1358aa4f9c7c7522299d4b1a4f3fc86
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
-
"epoch": 0.
|
6 |
"eval_steps": 100,
|
7 |
-
"global_step":
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
@@ -9008,6 +9008,312 @@
|
|
9008 |
"mean_token_accuracy": 0.968175146728754,
|
9009 |
"num_tokens": 40960000.0,
|
9010 |
"step": 10000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9011 |
}
|
9012 |
],
|
9013 |
"logging_steps": 10,
|
@@ -9022,12 +9328,12 @@
|
|
9022 |
"should_evaluate": false,
|
9023 |
"should_log": false,
|
9024 |
"should_save": true,
|
9025 |
-
"should_training_stop":
|
9026 |
},
|
9027 |
"attributes": {}
|
9028 |
}
|
9029 |
},
|
9030 |
-
"total_flos": 1.
|
9031 |
"train_batch_size": 1,
|
9032 |
"trial_name": null,
|
9033 |
"trial_params": null
|
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
+
"epoch": 0.9999516698081291,
|
6 |
"eval_steps": 100,
|
7 |
+
"global_step": 10345,
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
|
|
9008 |
"mean_token_accuracy": 0.968175146728754,
|
9009 |
"num_tokens": 40960000.0,
|
9010 |
"step": 10000
|
9011 |
+
},
|
9012 |
+
{
|
9013 |
+
"epoch": 0.9675704412546517,
|
9014 |
+
"grad_norm": 0.6231664419174194,
|
9015 |
+
"learning_rate": 1.804511278195489e-06,
|
9016 |
+
"loss": 0.1437,
|
9017 |
+
"mean_token_accuracy": 0.9698630094528198,
|
9018 |
+
"num_tokens": 41000960.0,
|
9019 |
+
"step": 10010
|
9020 |
+
},
|
9021 |
+
{
|
9022 |
+
"epoch": 0.9685370450920691,
|
9023 |
+
"grad_norm": 0.767872154712677,
|
9024 |
+
"learning_rate": 1.7508055853920516e-06,
|
9025 |
+
"loss": 0.1313,
|
9026 |
+
"mean_token_accuracy": 0.9726271979510784,
|
9027 |
+
"num_tokens": 41041920.0,
|
9028 |
+
"step": 10020
|
9029 |
+
},
|
9030 |
+
{
|
9031 |
+
"epoch": 0.9695036489294863,
|
9032 |
+
"grad_norm": 0.7130146622657776,
|
9033 |
+
"learning_rate": 1.6970998925886145e-06,
|
9034 |
+
"loss": 0.1165,
|
9035 |
+
"mean_token_accuracy": 0.9753179997205734,
|
9036 |
+
"num_tokens": 41082880.0,
|
9037 |
+
"step": 10030
|
9038 |
+
},
|
9039 |
+
{
|
9040 |
+
"epoch": 0.9704702527669035,
|
9041 |
+
"grad_norm": 0.6905496716499329,
|
9042 |
+
"learning_rate": 1.6433941997851776e-06,
|
9043 |
+
"loss": 0.1549,
|
9044 |
+
"mean_token_accuracy": 0.9695450082421303,
|
9045 |
+
"num_tokens": 41123840.0,
|
9046 |
+
"step": 10040
|
9047 |
+
},
|
9048 |
+
{
|
9049 |
+
"epoch": 0.9714368566043207,
|
9050 |
+
"grad_norm": 0.9023261070251465,
|
9051 |
+
"learning_rate": 1.5896885069817402e-06,
|
9052 |
+
"loss": 0.1334,
|
9053 |
+
"mean_token_accuracy": 0.9723825827240944,
|
9054 |
+
"num_tokens": 41164800.0,
|
9055 |
+
"step": 10050
|
9056 |
+
},
|
9057 |
+
{
|
9058 |
+
"epoch": 0.972403460441738,
|
9059 |
+
"grad_norm": 0.7522259950637817,
|
9060 |
+
"learning_rate": 1.5359828141783029e-06,
|
9061 |
+
"loss": 0.1305,
|
9062 |
+
"mean_token_accuracy": 0.9732142813503742,
|
9063 |
+
"num_tokens": 41205760.0,
|
9064 |
+
"step": 10060
|
9065 |
+
},
|
9066 |
+
{
|
9067 |
+
"epoch": 0.9733700642791552,
|
9068 |
+
"grad_norm": 0.7042533755302429,
|
9069 |
+
"learning_rate": 1.4822771213748658e-06,
|
9070 |
+
"loss": 0.1328,
|
9071 |
+
"mean_token_accuracy": 0.9719422683119774,
|
9072 |
+
"num_tokens": 41246720.0,
|
9073 |
+
"step": 10070
|
9074 |
+
},
|
9075 |
+
{
|
9076 |
+
"epoch": 0.9743366681165724,
|
9077 |
+
"grad_norm": 0.6883172392845154,
|
9078 |
+
"learning_rate": 1.4285714285714286e-06,
|
9079 |
+
"loss": 0.1244,
|
9080 |
+
"mean_token_accuracy": 0.973972599953413,
|
9081 |
+
"num_tokens": 41287680.0,
|
9082 |
+
"step": 10080
|
9083 |
+
},
|
9084 |
+
{
|
9085 |
+
"epoch": 0.9753032719539897,
|
9086 |
+
"grad_norm": 0.7336052656173706,
|
9087 |
+
"learning_rate": 1.3748657357679915e-06,
|
9088 |
+
"loss": 0.1481,
|
9089 |
+
"mean_token_accuracy": 0.9694471590220928,
|
9090 |
+
"num_tokens": 41328640.0,
|
9091 |
+
"step": 10090
|
9092 |
+
},
|
9093 |
+
{
|
9094 |
+
"epoch": 0.9762698757914069,
|
9095 |
+
"grad_norm": 0.7422951459884644,
|
9096 |
+
"learning_rate": 1.3211600429645542e-06,
|
9097 |
+
"loss": 0.1391,
|
9098 |
+
"mean_token_accuracy": 0.9710861049592495,
|
9099 |
+
"num_tokens": 41369600.0,
|
9100 |
+
"step": 10100
|
9101 |
+
},
|
9102 |
+
{
|
9103 |
+
"epoch": 0.9772364796288241,
|
9104 |
+
"grad_norm": 0.6071293354034424,
|
9105 |
+
"learning_rate": 1.2674543501611172e-06,
|
9106 |
+
"loss": 0.1317,
|
9107 |
+
"mean_token_accuracy": 0.9728718191385269,
|
9108 |
+
"num_tokens": 41410560.0,
|
9109 |
+
"step": 10110
|
9110 |
+
},
|
9111 |
+
{
|
9112 |
+
"epoch": 0.9782030834662414,
|
9113 |
+
"grad_norm": 0.7155930995941162,
|
9114 |
+
"learning_rate": 1.21374865735768e-06,
|
9115 |
+
"loss": 0.1362,
|
9116 |
+
"mean_token_accuracy": 0.9724804274737835,
|
9117 |
+
"num_tokens": 41451520.0,
|
9118 |
+
"step": 10120
|
9119 |
+
},
|
9120 |
+
{
|
9121 |
+
"epoch": 0.9791696873036586,
|
9122 |
+
"grad_norm": 0.6978849172592163,
|
9123 |
+
"learning_rate": 1.1600429645542428e-06,
|
9124 |
+
"loss": 0.1461,
|
9125 |
+
"mean_token_accuracy": 0.9695205435156822,
|
9126 |
+
"num_tokens": 41492480.0,
|
9127 |
+
"step": 10130
|
9128 |
+
},
|
9129 |
+
{
|
9130 |
+
"epoch": 0.9801362911410758,
|
9131 |
+
"grad_norm": 0.6737282872200012,
|
9132 |
+
"learning_rate": 1.1063372717508057e-06,
|
9133 |
+
"loss": 0.1475,
|
9134 |
+
"mean_token_accuracy": 0.9696183927357197,
|
9135 |
+
"num_tokens": 41533440.0,
|
9136 |
+
"step": 10140
|
9137 |
+
},
|
9138 |
+
{
|
9139 |
+
"epoch": 0.981102894978493,
|
9140 |
+
"grad_norm": 0.8504825234413147,
|
9141 |
+
"learning_rate": 1.0526315789473685e-06,
|
9142 |
+
"loss": 0.1474,
|
9143 |
+
"mean_token_accuracy": 0.9688845336437225,
|
9144 |
+
"num_tokens": 41574400.0,
|
9145 |
+
"step": 10150
|
9146 |
+
},
|
9147 |
+
{
|
9148 |
+
"epoch": 0.9820694988159103,
|
9149 |
+
"grad_norm": 0.7281203269958496,
|
9150 |
+
"learning_rate": 9.989258861439314e-07,
|
9151 |
+
"loss": 0.1525,
|
9152 |
+
"mean_token_accuracy": 0.9684197634458542,
|
9153 |
+
"num_tokens": 41615360.0,
|
9154 |
+
"step": 10160
|
9155 |
+
},
|
9156 |
+
{
|
9157 |
+
"epoch": 0.9830361026533275,
|
9158 |
+
"grad_norm": 0.686882734298706,
|
9159 |
+
"learning_rate": 9.452201933404941e-07,
|
9160 |
+
"loss": 0.1371,
|
9161 |
+
"mean_token_accuracy": 0.971844419836998,
|
9162 |
+
"num_tokens": 41656320.0,
|
9163 |
+
"step": 10170
|
9164 |
+
},
|
9165 |
+
{
|
9166 |
+
"epoch": 0.9840027064907447,
|
9167 |
+
"grad_norm": 0.6194028854370117,
|
9168 |
+
"learning_rate": 8.91514500537057e-07,
|
9169 |
+
"loss": 0.14,
|
9170 |
+
"mean_token_accuracy": 0.9716487258672715,
|
9171 |
+
"num_tokens": 41697280.0,
|
9172 |
+
"step": 10180
|
9173 |
+
},
|
9174 |
+
{
|
9175 |
+
"epoch": 0.984969310328162,
|
9176 |
+
"grad_norm": 0.7701581716537476,
|
9177 |
+
"learning_rate": 8.378088077336197e-07,
|
9178 |
+
"loss": 0.1333,
|
9179 |
+
"mean_token_accuracy": 0.9725048907101155,
|
9180 |
+
"num_tokens": 41738240.0,
|
9181 |
+
"step": 10190
|
9182 |
+
},
|
9183 |
+
{
|
9184 |
+
"epoch": 0.9859359141655792,
|
9185 |
+
"grad_norm": 0.6333341002464294,
|
9186 |
+
"learning_rate": 7.841031149301827e-07,
|
9187 |
+
"loss": 0.1553,
|
9188 |
+
"mean_token_accuracy": 0.9693982377648354,
|
9189 |
+
"num_tokens": 41779200.0,
|
9190 |
+
"step": 10200
|
9191 |
+
},
|
9192 |
+
{
|
9193 |
+
"epoch": 0.9869025180029964,
|
9194 |
+
"grad_norm": 0.7522182464599609,
|
9195 |
+
"learning_rate": 7.303974221267455e-07,
|
9196 |
+
"loss": 0.136,
|
9197 |
+
"mean_token_accuracy": 0.9721624210476876,
|
9198 |
+
"num_tokens": 41820160.0,
|
9199 |
+
"step": 10210
|
9200 |
+
},
|
9201 |
+
{
|
9202 |
+
"epoch": 0.9878691218404138,
|
9203 |
+
"grad_norm": 0.7271556258201599,
|
9204 |
+
"learning_rate": 6.766917293233083e-07,
|
9205 |
+
"loss": 0.1372,
|
9206 |
+
"mean_token_accuracy": 0.9710371807217598,
|
9207 |
+
"num_tokens": 41861120.0,
|
9208 |
+
"step": 10220
|
9209 |
+
},
|
9210 |
+
{
|
9211 |
+
"epoch": 0.988835725677831,
|
9212 |
+
"grad_norm": 0.6557773351669312,
|
9213 |
+
"learning_rate": 6.229860365198711e-07,
|
9214 |
+
"loss": 0.1551,
|
9215 |
+
"mean_token_accuracy": 0.9677837543189526,
|
9216 |
+
"num_tokens": 41902080.0,
|
9217 |
+
"step": 10230
|
9218 |
+
},
|
9219 |
+
{
|
9220 |
+
"epoch": 0.9898023295152482,
|
9221 |
+
"grad_norm": 0.6414600014686584,
|
9222 |
+
"learning_rate": 5.69280343716434e-07,
|
9223 |
+
"loss": 0.1355,
|
9224 |
+
"mean_token_accuracy": 0.9722602687776088,
|
9225 |
+
"num_tokens": 41943040.0,
|
9226 |
+
"step": 10240
|
9227 |
+
},
|
9228 |
+
{
|
9229 |
+
"epoch": 0.9907689333526654,
|
9230 |
+
"grad_norm": 0.6378936767578125,
|
9231 |
+
"learning_rate": 5.155746509129967e-07,
|
9232 |
+
"loss": 0.1462,
|
9233 |
+
"mean_token_accuracy": 0.9706457868218422,
|
9234 |
+
"num_tokens": 41984000.0,
|
9235 |
+
"step": 10250
|
9236 |
+
},
|
9237 |
+
{
|
9238 |
+
"epoch": 0.9917355371900827,
|
9239 |
+
"grad_norm": 0.7637057304382324,
|
9240 |
+
"learning_rate": 4.618689581095596e-07,
|
9241 |
+
"loss": 0.1509,
|
9242 |
+
"mean_token_accuracy": 0.9690313085913658,
|
9243 |
+
"num_tokens": 42024960.0,
|
9244 |
+
"step": 10260
|
9245 |
+
},
|
9246 |
+
{
|
9247 |
+
"epoch": 0.9927021410274999,
|
9248 |
+
"grad_norm": 0.8268054127693176,
|
9249 |
+
"learning_rate": 4.081632653061225e-07,
|
9250 |
+
"loss": 0.1434,
|
9251 |
+
"mean_token_accuracy": 0.969985318928957,
|
9252 |
+
"num_tokens": 42065920.0,
|
9253 |
+
"step": 10270
|
9254 |
+
},
|
9255 |
+
{
|
9256 |
+
"epoch": 0.9936687448649171,
|
9257 |
+
"grad_norm": 0.685546338558197,
|
9258 |
+
"learning_rate": 3.544575725026853e-07,
|
9259 |
+
"loss": 0.1458,
|
9260 |
+
"mean_token_accuracy": 0.9699608586728573,
|
9261 |
+
"num_tokens": 42106880.0,
|
9262 |
+
"step": 10280
|
9263 |
+
},
|
9264 |
+
{
|
9265 |
+
"epoch": 0.9946353487023344,
|
9266 |
+
"grad_norm": 0.6523056626319885,
|
9267 |
+
"learning_rate": 3.007518796992482e-07,
|
9268 |
+
"loss": 0.1334,
|
9269 |
+
"mean_token_accuracy": 0.9717954970896244,
|
9270 |
+
"num_tokens": 42147840.0,
|
9271 |
+
"step": 10290
|
9272 |
+
},
|
9273 |
+
{
|
9274 |
+
"epoch": 0.9956019525397516,
|
9275 |
+
"grad_norm": 0.6793970465660095,
|
9276 |
+
"learning_rate": 2.47046186895811e-07,
|
9277 |
+
"loss": 0.1371,
|
9278 |
+
"mean_token_accuracy": 0.9715264149010181,
|
9279 |
+
"num_tokens": 42188800.0,
|
9280 |
+
"step": 10300
|
9281 |
+
},
|
9282 |
+
{
|
9283 |
+
"epoch": 0.9965685563771688,
|
9284 |
+
"grad_norm": 0.6603143215179443,
|
9285 |
+
"learning_rate": 1.933404940923738e-07,
|
9286 |
+
"loss": 0.1116,
|
9287 |
+
"mean_token_accuracy": 0.9765166319906712,
|
9288 |
+
"num_tokens": 42229760.0,
|
9289 |
+
"step": 10310
|
9290 |
+
},
|
9291 |
+
{
|
9292 |
+
"epoch": 0.9975351602145861,
|
9293 |
+
"grad_norm": 0.7608389258384705,
|
9294 |
+
"learning_rate": 1.3963480128893664e-07,
|
9295 |
+
"loss": 0.141,
|
9296 |
+
"mean_token_accuracy": 0.9707680970430375,
|
9297 |
+
"num_tokens": 42270720.0,
|
9298 |
+
"step": 10320
|
9299 |
+
},
|
9300 |
+
{
|
9301 |
+
"epoch": 0.9985017640520033,
|
9302 |
+
"grad_norm": 0.7448744177818298,
|
9303 |
+
"learning_rate": 8.592910848549947e-08,
|
9304 |
+
"loss": 0.1155,
|
9305 |
+
"mean_token_accuracy": 0.9754892319440842,
|
9306 |
+
"num_tokens": 42311680.0,
|
9307 |
+
"step": 10330
|
9308 |
+
},
|
9309 |
+
{
|
9310 |
+
"epoch": 0.9994683678894205,
|
9311 |
+
"grad_norm": 0.7076250910758972,
|
9312 |
+
"learning_rate": 3.2223415682062296e-08,
|
9313 |
+
"loss": 0.1324,
|
9314 |
+
"mean_token_accuracy": 0.9716731876134872,
|
9315 |
+
"num_tokens": 42352640.0,
|
9316 |
+
"step": 10340
|
9317 |
}
|
9318 |
],
|
9319 |
"logging_steps": 10,
|
|
|
9328 |
"should_evaluate": false,
|
9329 |
"should_log": false,
|
9330 |
"should_save": true,
|
9331 |
+
"should_training_stop": true
|
9332 |
},
|
9333 |
"attributes": {}
|
9334 |
}
|
9335 |
},
|
9336 |
+
"total_flos": 1.1198388221509632e+17,
|
9337 |
"train_batch_size": 1,
|
9338 |
"trial_name": null,
|
9339 |
"trial_params": null
|