shulijia commited on
Commit
e536a55
·
verified ·
1 Parent(s): bdf3294

Training in progress, step 10500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21e9c11e02543045a52d1d10e85b29deee320e577ed8c40299be1aac88002bab
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85521102a01aa9ca0cac30f77dc681cafb77e29acfb0cfb308a2655c5df66d7
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c87bdbbf96a91780aaf4a58c008036f2bfda78e91f3d428d63005f735fe1e0c
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:540ea13aa0494c204cfb6c6b5f87988f9d4d15f8ed6f18e14b57f97e602a0555
3
  size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33e6b43d263edc3fb19dbc74c4a7ae9df523ccc7c2602c8a0c606ae6abf92007
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dd1ea77e8b79a8e0c06815d69eb9b02aa74cbe131a4af6f145c955f8944e41f
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.8320883633660214,
6
  "eval_steps": 100,
7
- "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9008,6 +9008,456 @@
9008
  "mean_token_accuracy": 0.7658023487776517,
9009
  "num_tokens": 81917952.0,
9010
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9011
  }
9012
  ],
9013
  "logging_steps": 10,
@@ -9027,7 +9477,7 @@
9027
  "attributes": {}
9028
  }
9029
  },
9030
- "total_flos": 2.1649315150902067e+17,
9031
  "train_batch_size": 2,
9032
  "trial_name": null,
9033
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.9736963217332812,
6
  "eval_steps": 100,
7
+ "global_step": 10500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9008
  "mean_token_accuracy": 0.7658023487776517,
9009
  "num_tokens": 81917952.0,
9010
  "step": 10000
9011
+ },
9012
+ {
9013
+ "epoch": 2.8349205225333662,
9014
+ "grad_norm": 2.1276166439056396,
9015
+ "learning_rate": 6.126088324766601e-07,
9016
+ "loss": 0.1176,
9017
+ "mean_token_accuracy": 0.7733365941792727,
9018
+ "num_tokens": 81999872.0,
9019
+ "step": 10010
9020
+ },
9021
+ {
9022
+ "epoch": 2.8377526817007115,
9023
+ "grad_norm": 1.4682493209838867,
9024
+ "learning_rate": 6.021189552082242e-07,
9025
+ "loss": 0.1185,
9026
+ "mean_token_accuracy": 0.7733732853084803,
9027
+ "num_tokens": 82081792.0,
9028
+ "step": 10020
9029
+ },
9030
+ {
9031
+ "epoch": 2.8405848408680567,
9032
+ "grad_norm": 1.2164413928985596,
9033
+ "learning_rate": 5.916290779397881e-07,
9034
+ "loss": 0.1097,
9035
+ "mean_token_accuracy": 0.7680772997438907,
9036
+ "num_tokens": 82163712.0,
9037
+ "step": 10030
9038
+ },
9039
+ {
9040
+ "epoch": 2.843417000035402,
9041
+ "grad_norm": 1.2230916023254395,
9042
+ "learning_rate": 5.811392006713522e-07,
9043
+ "loss": 0.1209,
9044
+ "mean_token_accuracy": 0.770596868917346,
9045
+ "num_tokens": 82245632.0,
9046
+ "step": 10040
9047
+ },
9048
+ {
9049
+ "epoch": 2.8462491592027472,
9050
+ "grad_norm": 1.3805590867996216,
9051
+ "learning_rate": 5.706493234029163e-07,
9052
+ "loss": 0.1109,
9053
+ "mean_token_accuracy": 0.7664261229336262,
9054
+ "num_tokens": 82327552.0,
9055
+ "step": 10050
9056
+ },
9057
+ {
9058
+ "epoch": 2.8490813183700925,
9059
+ "grad_norm": 1.2335084676742554,
9060
+ "learning_rate": 5.601594461344803e-07,
9061
+ "loss": 0.1087,
9062
+ "mean_token_accuracy": 0.7922333665192127,
9063
+ "num_tokens": 82409472.0,
9064
+ "step": 10060
9065
+ },
9066
+ {
9067
+ "epoch": 2.8519134775374377,
9068
+ "grad_norm": 1.4766696691513062,
9069
+ "learning_rate": 5.496695688660443e-07,
9070
+ "loss": 0.0949,
9071
+ "mean_token_accuracy": 0.7757460854947567,
9072
+ "num_tokens": 82491392.0,
9073
+ "step": 10070
9074
+ },
9075
+ {
9076
+ "epoch": 2.8547456367047825,
9077
+ "grad_norm": 1.2470474243164062,
9078
+ "learning_rate": 5.391796915976084e-07,
9079
+ "loss": 0.112,
9080
+ "mean_token_accuracy": 0.7856042079627514,
9081
+ "num_tokens": 82573312.0,
9082
+ "step": 10080
9083
+ },
9084
+ {
9085
+ "epoch": 2.8575777958721282,
9086
+ "grad_norm": 1.7810742855072021,
9087
+ "learning_rate": 5.286898143291724e-07,
9088
+ "loss": 0.121,
9089
+ "mean_token_accuracy": 0.7693003930151463,
9090
+ "num_tokens": 82655232.0,
9091
+ "step": 10090
9092
+ },
9093
+ {
9094
+ "epoch": 2.860409955039473,
9095
+ "grad_norm": 1.3474197387695312,
9096
+ "learning_rate": 5.181999370607364e-07,
9097
+ "loss": 0.1182,
9098
+ "mean_token_accuracy": 0.7601394318044186,
9099
+ "num_tokens": 82737152.0,
9100
+ "step": 10100
9101
+ },
9102
+ {
9103
+ "epoch": 2.8632421142068183,
9104
+ "grad_norm": 1.096218466758728,
9105
+ "learning_rate": 5.077100597923005e-07,
9106
+ "loss": 0.13,
9107
+ "mean_token_accuracy": 0.7537181980907917,
9108
+ "num_tokens": 82819072.0,
9109
+ "step": 10110
9110
+ },
9111
+ {
9112
+ "epoch": 2.8660742733741635,
9113
+ "grad_norm": 1.064784049987793,
9114
+ "learning_rate": 4.972201825238645e-07,
9115
+ "loss": 0.1348,
9116
+ "mean_token_accuracy": 0.7513820916414261,
9117
+ "num_tokens": 82900992.0,
9118
+ "step": 10120
9119
+ },
9120
+ {
9121
+ "epoch": 2.868906432541509,
9122
+ "grad_norm": 1.5605591535568237,
9123
+ "learning_rate": 4.867303052554286e-07,
9124
+ "loss": 0.141,
9125
+ "mean_token_accuracy": 0.7740704540163279,
9126
+ "num_tokens": 82982912.0,
9127
+ "step": 10130
9128
+ },
9129
+ {
9130
+ "epoch": 2.871738591708854,
9131
+ "grad_norm": 1.420284390449524,
9132
+ "learning_rate": 4.7624042798699264e-07,
9133
+ "loss": 0.11,
9134
+ "mean_token_accuracy": 0.7743272982537747,
9135
+ "num_tokens": 83064832.0,
9136
+ "step": 10140
9137
+ },
9138
+ {
9139
+ "epoch": 2.8745707508761993,
9140
+ "grad_norm": 1.2748111486434937,
9141
+ "learning_rate": 4.657505507185566e-07,
9142
+ "loss": 0.1273,
9143
+ "mean_token_accuracy": 0.7646037183701992,
9144
+ "num_tokens": 83146752.0,
9145
+ "step": 10150
9146
+ },
9147
+ {
9148
+ "epoch": 2.8774029100435445,
9149
+ "grad_norm": 1.1738097667694092,
9150
+ "learning_rate": 4.552606734501207e-07,
9151
+ "loss": 0.1224,
9152
+ "mean_token_accuracy": 0.7754525426775217,
9153
+ "num_tokens": 83228672.0,
9154
+ "step": 10160
9155
+ },
9156
+ {
9157
+ "epoch": 2.88023506921089,
9158
+ "grad_norm": 1.5003738403320312,
9159
+ "learning_rate": 4.4477079618168476e-07,
9160
+ "loss": 0.1128,
9161
+ "mean_token_accuracy": 0.7775073368102312,
9162
+ "num_tokens": 83310592.0,
9163
+ "step": 10170
9164
+ },
9165
+ {
9166
+ "epoch": 2.883067228378235,
9167
+ "grad_norm": 1.2533864974975586,
9168
+ "learning_rate": 4.3428091891324873e-07,
9169
+ "loss": 0.1311,
9170
+ "mean_token_accuracy": 0.7413649678230285,
9171
+ "num_tokens": 83392512.0,
9172
+ "step": 10180
9173
+ },
9174
+ {
9175
+ "epoch": 2.88589938754558,
9176
+ "grad_norm": 1.5065313577651978,
9177
+ "learning_rate": 4.237910416448128e-07,
9178
+ "loss": 0.1546,
9179
+ "mean_token_accuracy": 0.77030332647264,
9180
+ "num_tokens": 83474432.0,
9181
+ "step": 10190
9182
+ },
9183
+ {
9184
+ "epoch": 2.8887315467129255,
9185
+ "grad_norm": 1.491937518119812,
9186
+ "learning_rate": 4.133011643763768e-07,
9187
+ "loss": 0.1268,
9188
+ "mean_token_accuracy": 0.7824853226542473,
9189
+ "num_tokens": 83556352.0,
9190
+ "step": 10200
9191
+ },
9192
+ {
9193
+ "epoch": 2.8915637058802703,
9194
+ "grad_norm": 1.166266918182373,
9195
+ "learning_rate": 4.0281128710794085e-07,
9196
+ "loss": 0.1116,
9197
+ "mean_token_accuracy": 0.782081701233983,
9198
+ "num_tokens": 83638272.0,
9199
+ "step": 10210
9200
+ },
9201
+ {
9202
+ "epoch": 2.8943958650476156,
9203
+ "grad_norm": 1.42288076877594,
9204
+ "learning_rate": 3.923214098395049e-07,
9205
+ "loss": 0.1282,
9206
+ "mean_token_accuracy": 0.7608121354132891,
9207
+ "num_tokens": 83720192.0,
9208
+ "step": 10220
9209
+ },
9210
+ {
9211
+ "epoch": 2.897228024214961,
9212
+ "grad_norm": 1.6304948329925537,
9213
+ "learning_rate": 3.818315325710689e-07,
9214
+ "loss": 0.1231,
9215
+ "mean_token_accuracy": 0.7633317038416862,
9216
+ "num_tokens": 83802112.0,
9217
+ "step": 10230
9218
+ },
9219
+ {
9220
+ "epoch": 2.900060183382306,
9221
+ "grad_norm": 1.4208807945251465,
9222
+ "learning_rate": 3.7134165530263297e-07,
9223
+ "loss": 0.0992,
9224
+ "mean_token_accuracy": 0.7627568505704403,
9225
+ "num_tokens": 83884032.0,
9226
+ "step": 10240
9227
+ },
9228
+ {
9229
+ "epoch": 2.9028923425496513,
9230
+ "grad_norm": 1.291266679763794,
9231
+ "learning_rate": 3.6085177803419705e-07,
9232
+ "loss": 0.11,
9233
+ "mean_token_accuracy": 0.7762964777648449,
9234
+ "num_tokens": 83965952.0,
9235
+ "step": 10250
9236
+ },
9237
+ {
9238
+ "epoch": 2.9057245017169966,
9239
+ "grad_norm": 1.5174055099487305,
9240
+ "learning_rate": 3.5036190076576107e-07,
9241
+ "loss": 0.1259,
9242
+ "mean_token_accuracy": 0.7930772956460714,
9243
+ "num_tokens": 84047872.0,
9244
+ "step": 10260
9245
+ },
9246
+ {
9247
+ "epoch": 2.908556660884342,
9248
+ "grad_norm": 1.2579764127731323,
9249
+ "learning_rate": 3.398720234973251e-07,
9250
+ "loss": 0.1173,
9251
+ "mean_token_accuracy": 0.7738992158323527,
9252
+ "num_tokens": 84129792.0,
9253
+ "step": 10270
9254
+ },
9255
+ {
9256
+ "epoch": 2.9113888200516866,
9257
+ "grad_norm": 1.7533577680587769,
9258
+ "learning_rate": 3.2938214622888917e-07,
9259
+ "loss": 0.1219,
9260
+ "mean_token_accuracy": 0.7630137003958225,
9261
+ "num_tokens": 84211712.0,
9262
+ "step": 10280
9263
+ },
9264
+ {
9265
+ "epoch": 2.9142209792190323,
9266
+ "grad_norm": 1.3265914916992188,
9267
+ "learning_rate": 3.188922689604532e-07,
9268
+ "loss": 0.139,
9269
+ "mean_token_accuracy": 0.7553082194179297,
9270
+ "num_tokens": 84293632.0,
9271
+ "step": 10290
9272
+ },
9273
+ {
9274
+ "epoch": 2.917053138386377,
9275
+ "grad_norm": 1.803127408027649,
9276
+ "learning_rate": 3.084023916920172e-07,
9277
+ "loss": 0.1207,
9278
+ "mean_token_accuracy": 0.7525073390454053,
9279
+ "num_tokens": 84375552.0,
9280
+ "step": 10300
9281
+ },
9282
+ {
9283
+ "epoch": 2.9198852975537224,
9284
+ "grad_norm": 1.6787763833999634,
9285
+ "learning_rate": 2.979125144235813e-07,
9286
+ "loss": 0.1139,
9287
+ "mean_token_accuracy": 0.7773361060768366,
9288
+ "num_tokens": 84457472.0,
9289
+ "step": 10310
9290
+ },
9291
+ {
9292
+ "epoch": 2.9227174567210676,
9293
+ "grad_norm": 1.486560344696045,
9294
+ "learning_rate": 2.874226371551453e-07,
9295
+ "loss": 0.1424,
9296
+ "mean_token_accuracy": 0.739921722188592,
9297
+ "num_tokens": 84539392.0,
9298
+ "step": 10320
9299
+ },
9300
+ {
9301
+ "epoch": 2.925549615888413,
9302
+ "grad_norm": 1.3302429914474487,
9303
+ "learning_rate": 2.7693275988670933e-07,
9304
+ "loss": 0.0954,
9305
+ "mean_token_accuracy": 0.7770058684051037,
9306
+ "num_tokens": 84621312.0,
9307
+ "step": 10330
9308
+ },
9309
+ {
9310
+ "epoch": 2.928381775055758,
9311
+ "grad_norm": 1.5905101299285889,
9312
+ "learning_rate": 2.664428826182734e-07,
9313
+ "loss": 0.1068,
9314
+ "mean_token_accuracy": 0.7657045032829046,
9315
+ "num_tokens": 84703232.0,
9316
+ "step": 10340
9317
+ },
9318
+ {
9319
+ "epoch": 2.9312139342231034,
9320
+ "grad_norm": 1.2340965270996094,
9321
+ "learning_rate": 2.559530053498374e-07,
9322
+ "loss": 0.121,
9323
+ "mean_token_accuracy": 0.7530577316880226,
9324
+ "num_tokens": 84785152.0,
9325
+ "step": 10350
9326
+ },
9327
+ {
9328
+ "epoch": 2.9340460933904486,
9329
+ "grad_norm": 1.4800512790679932,
9330
+ "learning_rate": 2.454631280814015e-07,
9331
+ "loss": 0.1025,
9332
+ "mean_token_accuracy": 0.783512718975544,
9333
+ "num_tokens": 84867072.0,
9334
+ "step": 10360
9335
+ },
9336
+ {
9337
+ "epoch": 2.936878252557794,
9338
+ "grad_norm": 1.4509563446044922,
9339
+ "learning_rate": 2.349732508129655e-07,
9340
+ "loss": 0.1136,
9341
+ "mean_token_accuracy": 0.7643346376717091,
9342
+ "num_tokens": 84948992.0,
9343
+ "step": 10370
9344
+ },
9345
+ {
9346
+ "epoch": 2.939710411725139,
9347
+ "grad_norm": 1.5300997495651245,
9348
+ "learning_rate": 2.2448337354452955e-07,
9349
+ "loss": 0.1394,
9350
+ "mean_token_accuracy": 0.7647504940629005,
9351
+ "num_tokens": 85030912.0,
9352
+ "step": 10380
9353
+ },
9354
+ {
9355
+ "epoch": 2.942542570892484,
9356
+ "grad_norm": 1.0120151042938232,
9357
+ "learning_rate": 2.139934962760936e-07,
9358
+ "loss": 0.1119,
9359
+ "mean_token_accuracy": 0.7749999992549419,
9360
+ "num_tokens": 85112832.0,
9361
+ "step": 10390
9362
+ },
9363
+ {
9364
+ "epoch": 2.9453747300598296,
9365
+ "grad_norm": 1.1445319652557373,
9366
+ "learning_rate": 2.0350361900765764e-07,
9367
+ "loss": 0.1343,
9368
+ "mean_token_accuracy": 0.7669031299650669,
9369
+ "num_tokens": 85194752.0,
9370
+ "step": 10400
9371
+ },
9372
+ {
9373
+ "epoch": 2.9482068892271744,
9374
+ "grad_norm": 1.1299060583114624,
9375
+ "learning_rate": 1.9301374173922166e-07,
9376
+ "loss": 0.1373,
9377
+ "mean_token_accuracy": 0.7592465754598379,
9378
+ "num_tokens": 85276672.0,
9379
+ "step": 10410
9380
+ },
9381
+ {
9382
+ "epoch": 2.9510390483945197,
9383
+ "grad_norm": 1.0287593603134155,
9384
+ "learning_rate": 1.8252386447078569e-07,
9385
+ "loss": 0.1243,
9386
+ "mean_token_accuracy": 0.7871330726891757,
9387
+ "num_tokens": 85358592.0,
9388
+ "step": 10420
9389
+ },
9390
+ {
9391
+ "epoch": 2.953871207561865,
9392
+ "grad_norm": 1.2568093538284302,
9393
+ "learning_rate": 1.7203398720234976e-07,
9394
+ "loss": 0.0979,
9395
+ "mean_token_accuracy": 0.77096379250288,
9396
+ "num_tokens": 85440512.0,
9397
+ "step": 10430
9398
+ },
9399
+ {
9400
+ "epoch": 2.95670336672921,
9401
+ "grad_norm": 2.05387020111084,
9402
+ "learning_rate": 1.6154410993391378e-07,
9403
+ "loss": 0.1146,
9404
+ "mean_token_accuracy": 0.7754647746682167,
9405
+ "num_tokens": 85522432.0,
9406
+ "step": 10440
9407
+ },
9408
+ {
9409
+ "epoch": 2.9595355258965554,
9410
+ "grad_norm": 1.3246551752090454,
9411
+ "learning_rate": 1.5105423266547783e-07,
9412
+ "loss": 0.1108,
9413
+ "mean_token_accuracy": 0.779549902677536,
9414
+ "num_tokens": 85604352.0,
9415
+ "step": 10450
9416
+ },
9417
+ {
9418
+ "epoch": 2.9623676850639007,
9419
+ "grad_norm": 1.5421769618988037,
9420
+ "learning_rate": 1.4056435539704185e-07,
9421
+ "loss": 0.1215,
9422
+ "mean_token_accuracy": 0.7485934443771839,
9423
+ "num_tokens": 85686272.0,
9424
+ "step": 10460
9425
+ },
9426
+ {
9427
+ "epoch": 2.965199844231246,
9428
+ "grad_norm": 1.457680583000183,
9429
+ "learning_rate": 1.300744781286059e-07,
9430
+ "loss": 0.1266,
9431
+ "mean_token_accuracy": 0.7566780854016543,
9432
+ "num_tokens": 85768192.0,
9433
+ "step": 10470
9434
+ },
9435
+ {
9436
+ "epoch": 2.9680320033985907,
9437
+ "grad_norm": 1.1517871618270874,
9438
+ "learning_rate": 1.1958460086016993e-07,
9439
+ "loss": 0.1209,
9440
+ "mean_token_accuracy": 0.7584148690104484,
9441
+ "num_tokens": 85850112.0,
9442
+ "step": 10480
9443
+ },
9444
+ {
9445
+ "epoch": 2.9708641625659364,
9446
+ "grad_norm": 1.3935081958770752,
9447
+ "learning_rate": 1.0909472359173399e-07,
9448
+ "loss": 0.1103,
9449
+ "mean_token_accuracy": 0.782497552037239,
9450
+ "num_tokens": 85932032.0,
9451
+ "step": 10490
9452
+ },
9453
+ {
9454
+ "epoch": 2.9736963217332812,
9455
+ "grad_norm": 1.209938883781433,
9456
+ "learning_rate": 9.860484632329804e-08,
9457
+ "loss": 0.1417,
9458
+ "mean_token_accuracy": 0.7534368880093097,
9459
+ "num_tokens": 86013952.0,
9460
+ "step": 10500
9461
  }
9462
  ],
9463
  "logging_steps": 10,
 
9477
  "attributes": {}
9478
  }
9479
  },
9480
+ "total_flos": 2.2731807970767667e+17,
9481
  "train_batch_size": 2,
9482
  "trial_name": null,
9483
  "trial_params": null