shulijia commited on
Commit
0a0cf26
·
verified ·
1 Parent(s): f69583f

Training in progress, step 10345, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0036e61ddac96c13d28af5b7348463838da31642973c16e5370deba79e225fb7
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70673c4b3a23c350c3d34964ccf5e6d071a142d9fed0284db5f78da9fe543e09
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81c5b9d8b1806de7455aa1e925033cdc78ae0c9f0b199eac035d87169284a120
3
  size 4768662910
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:592af7aba28b8c635fdc8123da87f1a25c80f7f6ef5d578a1399d7cbe6c53fdd
3
  size 4768662910
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ba427af78f54355503e8fb146121e9f936d278226f07d5bf09468fc62083d77
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9139898b21e9040c38ef91d3deb1fcf3a1358aa4f9c7c7522299d4b1a4f3fc86
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9666038374172345,
6
  "eval_steps": 100,
7
- "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9008,6 +9008,312 @@
9008
  "mean_token_accuracy": 0.968175146728754,
9009
  "num_tokens": 40960000.0,
9010
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9011
  }
9012
  ],
9013
  "logging_steps": 10,
@@ -9022,12 +9328,12 @@
9022
  "should_evaluate": false,
9023
  "should_log": false,
9024
  "should_save": true,
9025
- "should_training_stop": false
9026
  },
9027
  "attributes": {}
9028
  }
9029
  },
9030
- "total_flos": 1.0824928198656e+17,
9031
  "train_batch_size": 1,
9032
  "trial_name": null,
9033
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9999516698081291,
6
  "eval_steps": 100,
7
+ "global_step": 10345,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9008
  "mean_token_accuracy": 0.968175146728754,
9009
  "num_tokens": 40960000.0,
9010
  "step": 10000
9011
+ },
9012
+ {
9013
+ "epoch": 0.9675704412546517,
9014
+ "grad_norm": 0.6231664419174194,
9015
+ "learning_rate": 1.804511278195489e-06,
9016
+ "loss": 0.1437,
9017
+ "mean_token_accuracy": 0.9698630094528198,
9018
+ "num_tokens": 41000960.0,
9019
+ "step": 10010
9020
+ },
9021
+ {
9022
+ "epoch": 0.9685370450920691,
9023
+ "grad_norm": 0.767872154712677,
9024
+ "learning_rate": 1.7508055853920516e-06,
9025
+ "loss": 0.1313,
9026
+ "mean_token_accuracy": 0.9726271979510784,
9027
+ "num_tokens": 41041920.0,
9028
+ "step": 10020
9029
+ },
9030
+ {
9031
+ "epoch": 0.9695036489294863,
9032
+ "grad_norm": 0.7130146622657776,
9033
+ "learning_rate": 1.6970998925886145e-06,
9034
+ "loss": 0.1165,
9035
+ "mean_token_accuracy": 0.9753179997205734,
9036
+ "num_tokens": 41082880.0,
9037
+ "step": 10030
9038
+ },
9039
+ {
9040
+ "epoch": 0.9704702527669035,
9041
+ "grad_norm": 0.6905496716499329,
9042
+ "learning_rate": 1.6433941997851776e-06,
9043
+ "loss": 0.1549,
9044
+ "mean_token_accuracy": 0.9695450082421303,
9045
+ "num_tokens": 41123840.0,
9046
+ "step": 10040
9047
+ },
9048
+ {
9049
+ "epoch": 0.9714368566043207,
9050
+ "grad_norm": 0.9023261070251465,
9051
+ "learning_rate": 1.5896885069817402e-06,
9052
+ "loss": 0.1334,
9053
+ "mean_token_accuracy": 0.9723825827240944,
9054
+ "num_tokens": 41164800.0,
9055
+ "step": 10050
9056
+ },
9057
+ {
9058
+ "epoch": 0.972403460441738,
9059
+ "grad_norm": 0.7522259950637817,
9060
+ "learning_rate": 1.5359828141783029e-06,
9061
+ "loss": 0.1305,
9062
+ "mean_token_accuracy": 0.9732142813503742,
9063
+ "num_tokens": 41205760.0,
9064
+ "step": 10060
9065
+ },
9066
+ {
9067
+ "epoch": 0.9733700642791552,
9068
+ "grad_norm": 0.7042533755302429,
9069
+ "learning_rate": 1.4822771213748658e-06,
9070
+ "loss": 0.1328,
9071
+ "mean_token_accuracy": 0.9719422683119774,
9072
+ "num_tokens": 41246720.0,
9073
+ "step": 10070
9074
+ },
9075
+ {
9076
+ "epoch": 0.9743366681165724,
9077
+ "grad_norm": 0.6883172392845154,
9078
+ "learning_rate": 1.4285714285714286e-06,
9079
+ "loss": 0.1244,
9080
+ "mean_token_accuracy": 0.973972599953413,
9081
+ "num_tokens": 41287680.0,
9082
+ "step": 10080
9083
+ },
9084
+ {
9085
+ "epoch": 0.9753032719539897,
9086
+ "grad_norm": 0.7336052656173706,
9087
+ "learning_rate": 1.3748657357679915e-06,
9088
+ "loss": 0.1481,
9089
+ "mean_token_accuracy": 0.9694471590220928,
9090
+ "num_tokens": 41328640.0,
9091
+ "step": 10090
9092
+ },
9093
+ {
9094
+ "epoch": 0.9762698757914069,
9095
+ "grad_norm": 0.7422951459884644,
9096
+ "learning_rate": 1.3211600429645542e-06,
9097
+ "loss": 0.1391,
9098
+ "mean_token_accuracy": 0.9710861049592495,
9099
+ "num_tokens": 41369600.0,
9100
+ "step": 10100
9101
+ },
9102
+ {
9103
+ "epoch": 0.9772364796288241,
9104
+ "grad_norm": 0.6071293354034424,
9105
+ "learning_rate": 1.2674543501611172e-06,
9106
+ "loss": 0.1317,
9107
+ "mean_token_accuracy": 0.9728718191385269,
9108
+ "num_tokens": 41410560.0,
9109
+ "step": 10110
9110
+ },
9111
+ {
9112
+ "epoch": 0.9782030834662414,
9113
+ "grad_norm": 0.7155930995941162,
9114
+ "learning_rate": 1.21374865735768e-06,
9115
+ "loss": 0.1362,
9116
+ "mean_token_accuracy": 0.9724804274737835,
9117
+ "num_tokens": 41451520.0,
9118
+ "step": 10120
9119
+ },
9120
+ {
9121
+ "epoch": 0.9791696873036586,
9122
+ "grad_norm": 0.6978849172592163,
9123
+ "learning_rate": 1.1600429645542428e-06,
9124
+ "loss": 0.1461,
9125
+ "mean_token_accuracy": 0.9695205435156822,
9126
+ "num_tokens": 41492480.0,
9127
+ "step": 10130
9128
+ },
9129
+ {
9130
+ "epoch": 0.9801362911410758,
9131
+ "grad_norm": 0.6737282872200012,
9132
+ "learning_rate": 1.1063372717508057e-06,
9133
+ "loss": 0.1475,
9134
+ "mean_token_accuracy": 0.9696183927357197,
9135
+ "num_tokens": 41533440.0,
9136
+ "step": 10140
9137
+ },
9138
+ {
9139
+ "epoch": 0.981102894978493,
9140
+ "grad_norm": 0.8504825234413147,
9141
+ "learning_rate": 1.0526315789473685e-06,
9142
+ "loss": 0.1474,
9143
+ "mean_token_accuracy": 0.9688845336437225,
9144
+ "num_tokens": 41574400.0,
9145
+ "step": 10150
9146
+ },
9147
+ {
9148
+ "epoch": 0.9820694988159103,
9149
+ "grad_norm": 0.7281203269958496,
9150
+ "learning_rate": 9.989258861439314e-07,
9151
+ "loss": 0.1525,
9152
+ "mean_token_accuracy": 0.9684197634458542,
9153
+ "num_tokens": 41615360.0,
9154
+ "step": 10160
9155
+ },
9156
+ {
9157
+ "epoch": 0.9830361026533275,
9158
+ "grad_norm": 0.686882734298706,
9159
+ "learning_rate": 9.452201933404941e-07,
9160
+ "loss": 0.1371,
9161
+ "mean_token_accuracy": 0.971844419836998,
9162
+ "num_tokens": 41656320.0,
9163
+ "step": 10170
9164
+ },
9165
+ {
9166
+ "epoch": 0.9840027064907447,
9167
+ "grad_norm": 0.6194028854370117,
9168
+ "learning_rate": 8.91514500537057e-07,
9169
+ "loss": 0.14,
9170
+ "mean_token_accuracy": 0.9716487258672715,
9171
+ "num_tokens": 41697280.0,
9172
+ "step": 10180
9173
+ },
9174
+ {
9175
+ "epoch": 0.984969310328162,
9176
+ "grad_norm": 0.7701581716537476,
9177
+ "learning_rate": 8.378088077336197e-07,
9178
+ "loss": 0.1333,
9179
+ "mean_token_accuracy": 0.9725048907101155,
9180
+ "num_tokens": 41738240.0,
9181
+ "step": 10190
9182
+ },
9183
+ {
9184
+ "epoch": 0.9859359141655792,
9185
+ "grad_norm": 0.6333341002464294,
9186
+ "learning_rate": 7.841031149301827e-07,
9187
+ "loss": 0.1553,
9188
+ "mean_token_accuracy": 0.9693982377648354,
9189
+ "num_tokens": 41779200.0,
9190
+ "step": 10200
9191
+ },
9192
+ {
9193
+ "epoch": 0.9869025180029964,
9194
+ "grad_norm": 0.7522182464599609,
9195
+ "learning_rate": 7.303974221267455e-07,
9196
+ "loss": 0.136,
9197
+ "mean_token_accuracy": 0.9721624210476876,
9198
+ "num_tokens": 41820160.0,
9199
+ "step": 10210
9200
+ },
9201
+ {
9202
+ "epoch": 0.9878691218404138,
9203
+ "grad_norm": 0.7271556258201599,
9204
+ "learning_rate": 6.766917293233083e-07,
9205
+ "loss": 0.1372,
9206
+ "mean_token_accuracy": 0.9710371807217598,
9207
+ "num_tokens": 41861120.0,
9208
+ "step": 10220
9209
+ },
9210
+ {
9211
+ "epoch": 0.988835725677831,
9212
+ "grad_norm": 0.6557773351669312,
9213
+ "learning_rate": 6.229860365198711e-07,
9214
+ "loss": 0.1551,
9215
+ "mean_token_accuracy": 0.9677837543189526,
9216
+ "num_tokens": 41902080.0,
9217
+ "step": 10230
9218
+ },
9219
+ {
9220
+ "epoch": 0.9898023295152482,
9221
+ "grad_norm": 0.6414600014686584,
9222
+ "learning_rate": 5.69280343716434e-07,
9223
+ "loss": 0.1355,
9224
+ "mean_token_accuracy": 0.9722602687776088,
9225
+ "num_tokens": 41943040.0,
9226
+ "step": 10240
9227
+ },
9228
+ {
9229
+ "epoch": 0.9907689333526654,
9230
+ "grad_norm": 0.6378936767578125,
9231
+ "learning_rate": 5.155746509129967e-07,
9232
+ "loss": 0.1462,
9233
+ "mean_token_accuracy": 0.9706457868218422,
9234
+ "num_tokens": 41984000.0,
9235
+ "step": 10250
9236
+ },
9237
+ {
9238
+ "epoch": 0.9917355371900827,
9239
+ "grad_norm": 0.7637057304382324,
9240
+ "learning_rate": 4.618689581095596e-07,
9241
+ "loss": 0.1509,
9242
+ "mean_token_accuracy": 0.9690313085913658,
9243
+ "num_tokens": 42024960.0,
9244
+ "step": 10260
9245
+ },
9246
+ {
9247
+ "epoch": 0.9927021410274999,
9248
+ "grad_norm": 0.8268054127693176,
9249
+ "learning_rate": 4.081632653061225e-07,
9250
+ "loss": 0.1434,
9251
+ "mean_token_accuracy": 0.969985318928957,
9252
+ "num_tokens": 42065920.0,
9253
+ "step": 10270
9254
+ },
9255
+ {
9256
+ "epoch": 0.9936687448649171,
9257
+ "grad_norm": 0.685546338558197,
9258
+ "learning_rate": 3.544575725026853e-07,
9259
+ "loss": 0.1458,
9260
+ "mean_token_accuracy": 0.9699608586728573,
9261
+ "num_tokens": 42106880.0,
9262
+ "step": 10280
9263
+ },
9264
+ {
9265
+ "epoch": 0.9946353487023344,
9266
+ "grad_norm": 0.6523056626319885,
9267
+ "learning_rate": 3.007518796992482e-07,
9268
+ "loss": 0.1334,
9269
+ "mean_token_accuracy": 0.9717954970896244,
9270
+ "num_tokens": 42147840.0,
9271
+ "step": 10290
9272
+ },
9273
+ {
9274
+ "epoch": 0.9956019525397516,
9275
+ "grad_norm": 0.6793970465660095,
9276
+ "learning_rate": 2.47046186895811e-07,
9277
+ "loss": 0.1371,
9278
+ "mean_token_accuracy": 0.9715264149010181,
9279
+ "num_tokens": 42188800.0,
9280
+ "step": 10300
9281
+ },
9282
+ {
9283
+ "epoch": 0.9965685563771688,
9284
+ "grad_norm": 0.6603143215179443,
9285
+ "learning_rate": 1.933404940923738e-07,
9286
+ "loss": 0.1116,
9287
+ "mean_token_accuracy": 0.9765166319906712,
9288
+ "num_tokens": 42229760.0,
9289
+ "step": 10310
9290
+ },
9291
+ {
9292
+ "epoch": 0.9975351602145861,
9293
+ "grad_norm": 0.7608389258384705,
9294
+ "learning_rate": 1.3963480128893664e-07,
9295
+ "loss": 0.141,
9296
+ "mean_token_accuracy": 0.9707680970430375,
9297
+ "num_tokens": 42270720.0,
9298
+ "step": 10320
9299
+ },
9300
+ {
9301
+ "epoch": 0.9985017640520033,
9302
+ "grad_norm": 0.7448744177818298,
9303
+ "learning_rate": 8.592910848549947e-08,
9304
+ "loss": 0.1155,
9305
+ "mean_token_accuracy": 0.9754892319440842,
9306
+ "num_tokens": 42311680.0,
9307
+ "step": 10330
9308
+ },
9309
+ {
9310
+ "epoch": 0.9994683678894205,
9311
+ "grad_norm": 0.7076250910758972,
9312
+ "learning_rate": 3.2223415682062296e-08,
9313
+ "loss": 0.1324,
9314
+ "mean_token_accuracy": 0.9716731876134872,
9315
+ "num_tokens": 42352640.0,
9316
+ "step": 10340
9317
  }
9318
  ],
9319
  "logging_steps": 10,
 
9328
  "should_evaluate": false,
9329
  "should_log": false,
9330
  "should_save": true,
9331
+ "should_training_stop": true
9332
  },
9333
  "attributes": {}
9334
  }
9335
  },
9336
+ "total_flos": 1.1198388221509632e+17,
9337
  "train_batch_size": 1,
9338
  "trial_name": null,
9339
  "trial_params": null