Commit d3c0637 (verified) by shulijia
Parent: 4d1ccda

Training in progress, step 9500, checkpoint

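To inspect or reuse this checkpoint locally, the files touched by this commit can be fetched for exactly this revision. The sketch below is illustrative only: the repository id is a placeholder (it is not stated in this view), while snapshot_download, revision, and allow_patterns are standard huggingface_hub usage.

    # Hypothetical example: pull only last-checkpoint/ at the commit shown above.
    # REPO_ID is a placeholder; substitute the actual model repository.
    from huggingface_hub import snapshot_download

    local_dir = snapshot_download(
        repo_id="REPO_ID",                     # placeholder, not taken from this page
        revision="d3c0637",                    # the commit shown above
        allow_patterns=["last-checkpoint/*"],  # skip everything else in the repo
    )
    print(local_dir)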
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:20cf46c56d7f3485e1e04402e75314d9e24b57ac66adacab0e227b4e09b7b6ba
+ oid sha256:5ca7c4e3749f06ebc1778c062b3d70c4f488a26b411ddd7c3d301ae4023802d1
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8812294e6cc0d56351a244dc2b3183bad4f1c7754f092149fbe3c0af525abea9
+ oid sha256:1d45b61751a61e4b6f882922592537bf8e092f455741fa220a9008ab320f07ad
  size 4768662910
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e92ceb30375ee28452e703331052914ba0c2676dfeac9e08cb4ed5c07c26b7fd
+ oid sha256:a81369904e00a468d2ec4beb1dd4e8f30c6191c2e29c4144f662ff07eadf5eab
  size 1064
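Each of the three files above is stored through Git LFS, so the repository only tracks a small pointer (spec version, sha256 oid of the payload, and its size in bytes); a new checkpoint therefore shows up as a one-line oid change per file. A minimal sketch, assuming the payloads have already been fetched with git lfs pull, for checking a local file against the oid recorded in the diff:

    # Verify that a pulled LFS file matches the sha256 oid in its pointer.
    import hashlib

    def sha256_of(path, chunk_size=1 << 20):
        # Stream in chunks so multi-GB checkpoints do not need to fit in memory.
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                h.update(chunk)
        return h.hexdigest()

    # Expected value taken from the new model.safetensors pointer above.
    expected = "5ca7c4e3749f06ebc1778c062b3d70c4f488a26b411ddd7c3d301ae4023802d1"
    actual = sha256_of("last-checkpoint/model.safetensors")
    print("match" if actual == expected else f"mismatch: {actual}")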
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.8699434536755111,
+ "epoch": 0.9182736455463728,
  "eval_steps": 100,
- "global_step": 9000,
+ "global_step": 9500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -8108,6 +8108,456 @@
  "mean_token_accuracy": 0.9694716207683086,
  "num_tokens": 36864000.0,
  "step": 9000
+ },
+ {
+ "epoch": 0.8709100575129284,
+ "grad_norm": 0.7295970320701599,
+ "learning_rate": 7.175080558539206e-06,
+ "loss": 0.1497,
+ "mean_token_accuracy": 0.9693737745285034,
+ "num_tokens": 36904960.0,
+ "step": 9010
+ },
+ {
+ "epoch": 0.8718766613503456,
+ "grad_norm": 0.7322930097579956,
+ "learning_rate": 7.121374865735768e-06,
+ "loss": 0.144,
+ "mean_token_accuracy": 0.9701565556228161,
+ "num_tokens": 36945920.0,
+ "step": 9020
+ },
+ {
+ "epoch": 0.8728432651877628,
+ "grad_norm": 0.7817639112472534,
+ "learning_rate": 7.067669172932331e-06,
+ "loss": 0.1716,
+ "mean_token_accuracy": 0.9648238711059094,
+ "num_tokens": 36986880.0,
+ "step": 9030
+ },
+ {
+ "epoch": 0.87380986902518,
+ "grad_norm": 0.6489200592041016,
+ "learning_rate": 7.013963480128894e-06,
+ "loss": 0.1481,
+ "mean_token_accuracy": 0.9706457898020744,
+ "num_tokens": 37027840.0,
+ "step": 9040
+ },
+ {
+ "epoch": 0.8747764728625973,
+ "grad_norm": 0.8958516716957092,
+ "learning_rate": 6.960257787325457e-06,
+ "loss": 0.1468,
+ "mean_token_accuracy": 0.9693003885447979,
+ "num_tokens": 37068800.0,
+ "step": 9050
+ },
+ {
+ "epoch": 0.8757430767000145,
+ "grad_norm": 0.7400612235069275,
+ "learning_rate": 6.906552094522021e-06,
+ "loss": 0.1493,
+ "mean_token_accuracy": 0.9704745575785637,
+ "num_tokens": 37109760.0,
+ "step": 9060
+ },
+ {
+ "epoch": 0.8767096805374317,
+ "grad_norm": 0.6856247186660767,
+ "learning_rate": 6.852846401718582e-06,
+ "loss": 0.1293,
+ "mean_token_accuracy": 0.9728718161582947,
+ "num_tokens": 37150720.0,
+ "step": 9070
+ },
+ {
+ "epoch": 0.877676284374849,
+ "grad_norm": 0.7188942432403564,
+ "learning_rate": 6.799140708915146e-06,
+ "loss": 0.1254,
+ "mean_token_accuracy": 0.9737769059836865,
+ "num_tokens": 37191680.0,
+ "step": 9080
+ },
+ {
+ "epoch": 0.8786428882122662,
+ "grad_norm": 0.5900483727455139,
+ "learning_rate": 6.745435016111707e-06,
+ "loss": 0.1473,
+ "mean_token_accuracy": 0.9698874719440937,
+ "num_tokens": 37232640.0,
+ "step": 9090
+ },
+ {
+ "epoch": 0.8796094920496834,
+ "grad_norm": 0.7418830990791321,
+ "learning_rate": 6.691729323308271e-06,
+ "loss": 0.1365,
+ "mean_token_accuracy": 0.9710371784865857,
+ "num_tokens": 37273600.0,
+ "step": 9100
+ },
+ {
+ "epoch": 0.8805760958871006,
+ "grad_norm": 0.6702271699905396,
+ "learning_rate": 6.6380236305048335e-06,
+ "loss": 0.1411,
+ "mean_token_accuracy": 0.969960855692625,
+ "num_tokens": 37314560.0,
+ "step": 9110
+ },
+ {
+ "epoch": 0.8815426997245179,
+ "grad_norm": 0.8139868378639221,
+ "learning_rate": 6.584317937701397e-06,
+ "loss": 0.1456,
+ "mean_token_accuracy": 0.969985319674015,
+ "num_tokens": 37355520.0,
+ "step": 9120
+ },
+ {
+ "epoch": 0.8825093035619351,
+ "grad_norm": 0.6658422350883484,
+ "learning_rate": 6.53061224489796e-06,
+ "loss": 0.1475,
+ "mean_token_accuracy": 0.9693982362747192,
+ "num_tokens": 37396480.0,
+ "step": 9130
+ },
+ {
+ "epoch": 0.8834759073993523,
+ "grad_norm": 0.702655017375946,
+ "learning_rate": 6.476906552094522e-06,
+ "loss": 0.1379,
+ "mean_token_accuracy": 0.9712817974388599,
+ "num_tokens": 37437440.0,
+ "step": 9140
+ },
+ {
+ "epoch": 0.8844425112367696,
+ "grad_norm": 0.7942471504211426,
+ "learning_rate": 6.423200859291086e-06,
+ "loss": 0.1531,
+ "mean_token_accuracy": 0.9690068490803242,
+ "num_tokens": 37478400.0,
+ "step": 9150
+ },
+ {
+ "epoch": 0.8854091150741868,
+ "grad_norm": 0.7765222787857056,
+ "learning_rate": 6.369495166487647e-06,
+ "loss": 0.1528,
+ "mean_token_accuracy": 0.9692270055413246,
+ "num_tokens": 37519360.0,
+ "step": 9160
+ },
+ {
+ "epoch": 0.886375718911604,
+ "grad_norm": 0.5786271095275879,
+ "learning_rate": 6.315789473684211e-06,
+ "loss": 0.1372,
+ "mean_token_accuracy": 0.9717954978346824,
+ "num_tokens": 37560320.0,
+ "step": 9170
+ },
+ {
+ "epoch": 0.8873423227490214,
+ "grad_norm": 0.6450340151786804,
+ "learning_rate": 6.262083780880773e-06,
+ "loss": 0.1624,
+ "mean_token_accuracy": 0.9683219164609909,
+ "num_tokens": 37601280.0,
+ "step": 9180
+ },
+ {
+ "epoch": 0.8883089265864386,
+ "grad_norm": 0.6178423166275024,
+ "learning_rate": 6.2083780880773365e-06,
+ "loss": 0.1253,
+ "mean_token_accuracy": 0.973556748777628,
+ "num_tokens": 37642240.0,
+ "step": 9190
+ },
+ {
+ "epoch": 0.8892755304238558,
+ "grad_norm": 0.7988136410713196,
+ "learning_rate": 6.1546723952739e-06,
+ "loss": 0.1451,
+ "mean_token_accuracy": 0.969740703701973,
+ "num_tokens": 37683200.0,
+ "step": 9200
+ },
+ {
+ "epoch": 0.890242134261273,
+ "grad_norm": 0.6491620540618896,
+ "learning_rate": 6.100966702470462e-06,
+ "loss": 0.1374,
+ "mean_token_accuracy": 0.9728228926658631,
+ "num_tokens": 37724160.0,
+ "step": 9210
+ },
+ {
+ "epoch": 0.8912087380986903,
+ "grad_norm": 0.9206412434577942,
+ "learning_rate": 6.047261009667025e-06,
+ "loss": 0.1457,
+ "mean_token_accuracy": 0.9691780783236027,
+ "num_tokens": 37765120.0,
+ "step": 9220
+ },
+ {
+ "epoch": 0.8921753419361075,
+ "grad_norm": 0.672639787197113,
+ "learning_rate": 5.993555316863588e-06,
+ "loss": 0.1409,
+ "mean_token_accuracy": 0.9709882564842701,
+ "num_tokens": 37806080.0,
+ "step": 9230
+ },
+ {
+ "epoch": 0.8931419457735247,
+ "grad_norm": 0.626698911190033,
+ "learning_rate": 5.939849624060151e-06,
+ "loss": 0.1344,
+ "mean_token_accuracy": 0.9726027339696884,
+ "num_tokens": 37847040.0,
+ "step": 9240
+ },
+ {
+ "epoch": 0.894108549610942,
+ "grad_norm": 0.7072364091873169,
+ "learning_rate": 5.886143931256713e-06,
+ "loss": 0.1351,
+ "mean_token_accuracy": 0.9722113452851773,
+ "num_tokens": 37888000.0,
+ "step": 9250
+ },
+ {
+ "epoch": 0.8950751534483592,
+ "grad_norm": 0.6620608568191528,
+ "learning_rate": 5.832438238453276e-06,
+ "loss": 0.131,
+ "mean_token_accuracy": 0.9728228934109211,
+ "num_tokens": 37928960.0,
+ "step": 9260
+ },
+ {
+ "epoch": 0.8960417572857764,
+ "grad_norm": 0.649089515209198,
+ "learning_rate": 5.7787325456498395e-06,
+ "loss": 0.1534,
+ "mean_token_accuracy": 0.9675146721303463,
+ "num_tokens": 37969920.0,
+ "step": 9270
+ },
+ {
+ "epoch": 0.8970083611231937,
+ "grad_norm": 0.8602608442306519,
+ "learning_rate": 5.725026852846402e-06,
+ "loss": 0.1635,
+ "mean_token_accuracy": 0.9658512689173222,
+ "num_tokens": 38010880.0,
+ "step": 9280
+ },
+ {
+ "epoch": 0.8979749649606109,
+ "grad_norm": 0.6629733443260193,
+ "learning_rate": 5.671321160042965e-06,
+ "loss": 0.1449,
+ "mean_token_accuracy": 0.9702054776251317,
+ "num_tokens": 38051840.0,
+ "step": 9290
+ },
+ {
+ "epoch": 0.8989415687980281,
+ "grad_norm": 0.6702824831008911,
+ "learning_rate": 5.617615467239528e-06,
+ "loss": 0.1383,
+ "mean_token_accuracy": 0.9710861049592495,
+ "num_tokens": 38092800.0,
+ "step": 9300
+ },
+ {
+ "epoch": 0.8999081726354453,
+ "grad_norm": 0.633313000202179,
+ "learning_rate": 5.563909774436091e-06,
+ "loss": 0.1405,
+ "mean_token_accuracy": 0.9705724023282528,
+ "num_tokens": 38133760.0,
+ "step": 9310
+ },
+ {
+ "epoch": 0.9008747764728626,
+ "grad_norm": 0.701628565788269,
+ "learning_rate": 5.510204081632653e-06,
+ "loss": 0.1523,
+ "mean_token_accuracy": 0.9679794482886791,
+ "num_tokens": 38174720.0,
+ "step": 9320
+ },
+ {
+ "epoch": 0.9018413803102798,
+ "grad_norm": 0.6978937387466431,
+ "learning_rate": 5.456498388829216e-06,
+ "loss": 0.1476,
+ "mean_token_accuracy": 0.9705234795808793,
+ "num_tokens": 38215680.0,
+ "step": 9330
+ },
+ {
+ "epoch": 0.902807984147697,
+ "grad_norm": 0.6952319741249084,
+ "learning_rate": 5.4027926960257785e-06,
+ "loss": 0.1294,
+ "mean_token_accuracy": 0.9723825819790364,
+ "num_tokens": 38256640.0,
+ "step": 9340
+ },
+ {
+ "epoch": 0.9037745879851143,
+ "grad_norm": 0.7691527605056763,
+ "learning_rate": 5.349087003222342e-06,
+ "loss": 0.1352,
+ "mean_token_accuracy": 0.9732876695692539,
+ "num_tokens": 38297600.0,
+ "step": 9350
+ },
+ {
+ "epoch": 0.9047411918225315,
+ "grad_norm": 0.8168832659721375,
+ "learning_rate": 5.295381310418905e-06,
+ "loss": 0.1408,
+ "mean_token_accuracy": 0.9707191728055478,
+ "num_tokens": 38338560.0,
+ "step": 9360
+ },
+ {
+ "epoch": 0.9057077956599487,
+ "grad_norm": 0.7141739130020142,
+ "learning_rate": 5.241675617615467e-06,
+ "loss": 0.1324,
+ "mean_token_accuracy": 0.9724315024912358,
+ "num_tokens": 38379520.0,
+ "step": 9370
+ },
+ {
+ "epoch": 0.9066743994973661,
+ "grad_norm": 0.7228880524635315,
+ "learning_rate": 5.187969924812031e-06,
+ "loss": 0.1449,
+ "mean_token_accuracy": 0.9706213280558587,
+ "num_tokens": 38420480.0,
+ "step": 9380
+ },
+ {
+ "epoch": 0.9076410033347833,
+ "grad_norm": 0.6438316702842712,
+ "learning_rate": 5.134264232008593e-06,
+ "loss": 0.1572,
+ "mean_token_accuracy": 0.9682240657508373,
+ "num_tokens": 38461440.0,
+ "step": 9390
+ },
+ {
+ "epoch": 0.9086076071722005,
+ "grad_norm": 0.608272910118103,
+ "learning_rate": 5.080558539205156e-06,
+ "loss": 0.1341,
+ "mean_token_accuracy": 0.9718688815832138,
+ "num_tokens": 38502400.0,
+ "step": 9400
+ },
+ {
+ "epoch": 0.9095742110096177,
+ "grad_norm": 0.5616933107376099,
+ "learning_rate": 5.0268528464017184e-06,
+ "loss": 0.1406,
+ "mean_token_accuracy": 0.9704990208148956,
+ "num_tokens": 38543360.0,
+ "step": 9410
+ },
+ {
+ "epoch": 0.910540814847035,
+ "grad_norm": 0.758497416973114,
+ "learning_rate": 4.9731471535982815e-06,
+ "loss": 0.1427,
+ "mean_token_accuracy": 0.9709637947380543,
+ "num_tokens": 38584320.0,
+ "step": 9420
+ },
+ {
+ "epoch": 0.9115074186844522,
+ "grad_norm": 0.6793957948684692,
+ "learning_rate": 4.919441460794845e-06,
+ "loss": 0.1438,
+ "mean_token_accuracy": 0.9701320923864841,
+ "num_tokens": 38625280.0,
+ "step": 9430
+ },
+ {
+ "epoch": 0.9124740225218694,
+ "grad_norm": 0.7057655453681946,
+ "learning_rate": 4.865735767991407e-06,
+ "loss": 0.1398,
+ "mean_token_accuracy": 0.9713307209312916,
+ "num_tokens": 38666240.0,
+ "step": 9440
+ },
+ {
+ "epoch": 0.9134406263592867,
+ "grad_norm": 0.7207940220832825,
+ "learning_rate": 4.812030075187971e-06,
+ "loss": 0.1577,
+ "mean_token_accuracy": 0.9684442207217216,
+ "num_tokens": 38707200.0,
+ "step": 9450
+ },
+ {
+ "epoch": 0.9144072301967039,
+ "grad_norm": 0.625789999961853,
+ "learning_rate": 4.758324382384533e-06,
+ "loss": 0.1324,
+ "mean_token_accuracy": 0.9726516611874103,
+ "num_tokens": 38748160.0,
+ "step": 9460
+ },
+ {
+ "epoch": 0.9153738340341211,
+ "grad_norm": 0.7996525764465332,
+ "learning_rate": 4.704618689581096e-06,
+ "loss": 0.1499,
+ "mean_token_accuracy": 0.9679549895226955,
+ "num_tokens": 38789120.0,
+ "step": 9470
+ },
+ {
+ "epoch": 0.9163404378715384,
+ "grad_norm": 0.6713771820068359,
+ "learning_rate": 4.650912996777658e-06,
+ "loss": 0.152,
+ "mean_token_accuracy": 0.9697651579976082,
+ "num_tokens": 38830080.0,
+ "step": 9480
+ },
+ {
+ "epoch": 0.9173070417089556,
+ "grad_norm": 0.9124431014060974,
+ "learning_rate": 4.5972073039742214e-06,
+ "loss": 0.133,
+ "mean_token_accuracy": 0.9723336569964885,
+ "num_tokens": 38871040.0,
+ "step": 9490
+ },
+ {
+ "epoch": 0.9182736455463728,
+ "grad_norm": 0.6981998085975647,
+ "learning_rate": 4.5435016111707845e-06,
+ "loss": 0.1483,
+ "mean_token_accuracy": 0.9684931464493275,
+ "num_tokens": 38912000.0,
+ "step": 9500
  }
  ],
  "logging_steps": 10,
@@ -8127,7 +8577,7 @@
  "attributes": {}
  }
  },
- "total_flos": 9.7424353787904e+16,
+ "total_flos": 1.02836817887232e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
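Taken together, the trainer_state.json changes record the Trainer's bookkeeping for the new checkpoint: epoch and global_step move from 0.8699/9000 to 0.9183/9500, fifty new log entries (steps 9010 through 9500) are appended to the log history, and total_flos grows from about 9.74e+16 to 1.03e+17. A minimal sketch for summarizing the newly appended entries, assuming the standard Trainer state layout in which these records live under the "log_history" key:

    # Read the checkpoint's trainer_state.json and summarize steps 9010-9500.
    import json

    with open("last-checkpoint/trainer_state.json") as f:
        state = json.load(f)

    new_logs = [e for e in state["log_history"] if 9000 < e.get("step", 0) <= 9500]
    losses = [e["loss"] for e in new_logs if "loss" in e]

    print(f"global_step = {state['global_step']}, epoch = {state['epoch']:.4f}")
    print(f"appended entries: {len(new_logs)}, mean loss = {sum(losses) / len(losses):.4f}")

To continue the run from this state, transformers' Trainer can be pointed at the directory, e.g. trainer.train(resume_from_checkpoint="last-checkpoint"), which restores the model, optimizer, and scheduler files saved above.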