antonpolishko committed (verified)
Commit c301e21 · 1 Parent(s): caf1182

Training in progress, epoch 3, checkpoint

last-checkpoint/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:97ae37313ece2ed08d238c388465b3e41445622b5edcafeb70e7b37126408296
+ oid sha256:9d3b18816811918972cba42da78818e3c7aa5d8e31f822375470c2f40641e8a2
  size 4976698672
last-checkpoint/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e94f293539218d0e86b6dcfc6dd06d51caf62e6b9843628b95ffe32d618b912e
+ oid sha256:609faa192d9a31b460e1e9d369f7a2bb92f9ecbedb8d282dfd9a9d86e77b82e2
  size 4999802720
last-checkpoint/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:68f46a57f0cd49116bd86ca8cdbb4f453ac1d305e00dda24f2a61291b0cab5b7
+ oid sha256:1187480a882d25ada76753b8c72449b0e409556587c1ef893ce5b508bb4bb2ba
  size 4915916176
last-checkpoint/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4c8ac884a06c7b7d4299a53261bf24cd674c047620bbb8ed9b34291790b2a549
+ oid sha256:e7c05715f857592efaf0022aee229538b0cbd4bc4b8784e7e8d58053ec6d017e
  size 1168138808
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6b7f666d6c870f95e122d808b00c94d5d21c8d98e0c771e7f511b3c55864e863
+ oid sha256:9e3d3fec74b231513b476c27b7eda7f7d9835dec42b905326e0577ccee3a0cc3
  size 32121299754
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0c461c9d337dfc684e9352ec72bfa344e2f5d377f7cfc4475de9acae294dca89
+ oid sha256:69ec6e3926fa071bede113523efa3dc6e630c3c7958c54a9ca321cf4d62ed145
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
+ oid sha256:f6127ee4f0c13500ec5038fce65af8f7beec63c137c7d4b7c157aa6303cf5879
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
+ oid sha256:da01d1c5eb2cc3a323f97c1f590d13ccfac2a4c5b1479bd378b4e643304f5a4f
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
+ oid sha256:49a3f04d76c0d3acc7d3dd95a04215f368f35a451ae8cba8a2fdba38cda9ca0a
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
+ oid sha256:df7d2c9825dba80cb544920f8cc0c72122f96514e6cd259052a8765b034393e2
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
+ oid sha256:6a20a42d44ff48cc162224010190e898fe28598ddad8cd1896d330a3bb1d8ec3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
+ oid sha256:18ac0dc4f09f25179860561fcea7c5c8f997aabdc46a170665f9dc5a72bc27c6
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
+ oid sha256:6a16fcb5411ff961b47eff7378d85105fe9837e0492d19ea5ce3b7c4b77aa3b6
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:48b4bd96da52558b6c8c53763eebae27fa97c77e7808af54519fbcb81c8c8dc4
+ oid sha256:04a46754a7468c445356d55f12e6f57375db57b7d43b8c6963579dc82de997b3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 2.0,
+ "epoch": 3.0,
  "eval_steps": 300,
- "global_step": 726,
+ "global_step": 1089,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1045,6 +1045,518 @@
  "learning_rate": 5.041773562018135e-07,
  "loss": 1.7019,
  "step": 725
+ },
+ {
+ "epoch": 2.0110192837465566,
+ "grad_norm": 1.265625,
+ "learning_rate": 4.91680188206047e-07,
+ "loss": 1.7011,
+ "step": 730
+ },
+ {
+ "epoch": 2.024793388429752,
+ "grad_norm": 1.265625,
+ "learning_rate": 4.792891678019115e-07,
+ "loss": 1.7013,
+ "step": 735
+ },
+ {
+ "epoch": 2.038567493112948,
+ "grad_norm": 1.2734375,
+ "learning_rate": 4.6700688248834664e-07,
+ "loss": 1.6895,
+ "step": 740
+ },
+ {
+ "epoch": 2.0523415977961434,
+ "grad_norm": 1.3125,
+ "learning_rate": 4.548358970581757e-07,
+ "loss": 1.7029,
+ "step": 745
+ },
+ {
+ "epoch": 2.0661157024793386,
+ "grad_norm": 1.2734375,
+ "learning_rate": 4.427787530625278e-07,
+ "loss": 1.6931,
+ "step": 750
+ },
+ {
+ "epoch": 2.0798898071625342,
+ "grad_norm": 1.2734375,
+ "learning_rate": 4.3083796828010675e-07,
+ "loss": 1.6886,
+ "step": 755
+ },
+ {
+ "epoch": 2.09366391184573,
+ "grad_norm": 1.3046875,
+ "learning_rate": 4.190160361914292e-07,
+ "loss": 1.6907,
+ "step": 760
+ },
+ {
+ "epoch": 2.1074380165289255,
+ "grad_norm": 1.296875,
+ "learning_rate": 4.07315425458134e-07,
+ "loss": 1.6924,
+ "step": 765
+ },
+ {
+ "epoch": 2.121212121212121,
+ "grad_norm": 1.28125,
+ "learning_rate": 3.9573857940747537e-07,
+ "loss": 1.7019,
+ "step": 770
+ },
+ {
+ "epoch": 2.1349862258953167,
+ "grad_norm": 1.28125,
+ "learning_rate": 3.8428791552210594e-07,
+ "loss": 1.6975,
+ "step": 775
+ },
+ {
+ "epoch": 2.1487603305785123,
+ "grad_norm": 1.2890625,
+ "learning_rate": 3.729658249352563e-07,
+ "loss": 1.6986,
+ "step": 780
+ },
+ {
+ "epoch": 2.162534435261708,
+ "grad_norm": 1.3125,
+ "learning_rate": 3.6177467193141886e-07,
+ "loss": 1.6893,
+ "step": 785
+ },
+ {
+ "epoch": 2.1763085399449036,
+ "grad_norm": 1.2734375,
+ "learning_rate": 3.5071679345263537e-07,
+ "loss": 1.6833,
+ "step": 790
+ },
+ {
+ "epoch": 2.190082644628099,
+ "grad_norm": 1.2734375,
+ "learning_rate": 3.397944986104968e-07,
+ "loss": 1.693,
+ "step": 795
+ },
+ {
+ "epoch": 2.203856749311295,
+ "grad_norm": 1.3046875,
+ "learning_rate": 3.290100682039516e-07,
+ "loss": 1.6978,
+ "step": 800
+ },
+ {
+ "epoch": 2.2176308539944904,
+ "grad_norm": 1.28125,
+ "learning_rate": 3.1836575424303034e-07,
+ "loss": 1.7019,
+ "step": 805
+ },
+ {
+ "epoch": 2.231404958677686,
+ "grad_norm": 1.2890625,
+ "learning_rate": 3.078637794785791e-07,
+ "loss": 1.6977,
+ "step": 810
+ },
+ {
+ "epoch": 2.2451790633608817,
+ "grad_norm": 1.28125,
+ "learning_rate": 2.9750633693810224e-07,
+ "loss": 1.6898,
+ "step": 815
+ },
+ {
+ "epoch": 2.2589531680440773,
+ "grad_norm": 1.28125,
+ "learning_rate": 2.872955894678153e-07,
+ "loss": 1.6915,
+ "step": 820
+ },
+ {
+ "epoch": 2.2727272727272725,
+ "grad_norm": 1.265625,
+ "learning_rate": 2.7723366928099754e-07,
+ "loss": 1.6922,
+ "step": 825
+ },
+ {
+ "epoch": 2.2865013774104685,
+ "grad_norm": 1.3046875,
+ "learning_rate": 2.673226775127422e-07,
+ "loss": 1.6922,
+ "step": 830
+ },
+ {
+ "epoch": 2.3002754820936637,
+ "grad_norm": 1.3046875,
+ "learning_rate": 2.5756468378119533e-07,
+ "loss": 1.6873,
+ "step": 835
+ },
+ {
+ "epoch": 2.3140495867768593,
+ "grad_norm": 1.265625,
+ "learning_rate": 2.4796172575537934e-07,
+ "loss": 1.7068,
+ "step": 840
+ },
+ {
+ "epoch": 2.327823691460055,
+ "grad_norm": 1.3203125,
+ "learning_rate": 2.3851580872968435e-07,
+ "loss": 1.6993,
+ "step": 845
+ },
+ {
+ "epoch": 2.3415977961432506,
+ "grad_norm": 1.3046875,
+ "learning_rate": 2.292289052051224e-07,
+ "loss": 1.6992,
+ "step": 850
+ },
+ {
+ "epoch": 2.355371900826446,
+ "grad_norm": 1.2578125,
+ "learning_rate": 2.2010295447742743e-07,
+ "loss": 1.6891,
+ "step": 855
+ },
+ {
+ "epoch": 2.369146005509642,
+ "grad_norm": 1.296875,
+ "learning_rate": 2.111398622320927e-07,
+ "loss": 1.6968,
+ "step": 860
+ },
+ {
+ "epoch": 2.3829201101928374,
+ "grad_norm": 1.296875,
+ "learning_rate": 2.0234150014642305e-07,
+ "loss": 1.6946,
+ "step": 865
+ },
+ {
+ "epoch": 2.396694214876033,
+ "grad_norm": 1.265625,
+ "learning_rate": 1.937097054986915e-07,
+ "loss": 1.6892,
+ "step": 870
+ },
+ {
+ "epoch": 2.4104683195592287,
+ "grad_norm": 1.2578125,
+ "learning_rate": 1.8524628078447602e-07,
+ "loss": 1.6915,
+ "step": 875
+ },
+ {
+ "epoch": 2.4242424242424243,
+ "grad_norm": 1.2890625,
+ "learning_rate": 1.769529933402637e-07,
+ "loss": 1.6946,
+ "step": 880
+ },
+ {
+ "epoch": 2.43801652892562,
+ "grad_norm": 1.2578125,
+ "learning_rate": 1.6883157497439349e-07,
+ "loss": 1.6975,
+ "step": 885
+ },
+ {
+ "epoch": 2.4517906336088156,
+ "grad_norm": 1.2734375,
+ "learning_rate": 1.6088372160541962e-07,
+ "loss": 1.6871,
+ "step": 890
+ },
+ {
+ "epoch": 2.465564738292011,
+ "grad_norm": 1.2734375,
+ "learning_rate": 1.531110929079681e-07,
+ "loss": 1.6909,
+ "step": 895
+ },
+ {
+ "epoch": 2.479338842975207,
+ "grad_norm": 1.28125,
+ "learning_rate": 1.4551531196616396e-07,
+ "loss": 1.6908,
+ "step": 900
+ },
+ {
+ "epoch": 2.479338842975207,
+ "eval_loss": 1.7025996446609497,
+ "eval_runtime": 8.3873,
+ "eval_samples_per_second": 83.936,
+ "eval_steps_per_second": 2.623,
+ "step": 900
+ },
+ {
+ "epoch": 2.4931129476584024,
+ "grad_norm": 1.265625,
+ "learning_rate": 1.3809796493469728e-07,
+ "loss": 1.6981,
+ "step": 905
+ },
+ {
+ "epoch": 2.5068870523415976,
+ "grad_norm": 1.3046875,
+ "learning_rate": 1.3086060070760196e-07,
+ "loss": 1.6902,
+ "step": 910
+ },
+ {
+ "epoch": 2.5206611570247937,
+ "grad_norm": 1.3046875,
+ "learning_rate": 1.23804730594814e-07,
+ "loss": 1.6964,
+ "step": 915
+ },
+ {
+ "epoch": 2.534435261707989,
+ "grad_norm": 1.2890625,
+ "learning_rate": 1.1693182800658042e-07,
+ "loss": 1.6884,
+ "step": 920
+ },
+ {
+ "epoch": 2.5482093663911844,
+ "grad_norm": 1.2890625,
+ "learning_rate": 1.102433281457802e-07,
+ "loss": 1.6969,
+ "step": 925
+ },
+ {
+ "epoch": 2.56198347107438,
+ "grad_norm": 1.265625,
+ "learning_rate": 1.0374062770822411e-07,
+ "loss": 1.7003,
+ "step": 930
+ },
+ {
+ "epoch": 2.5757575757575757,
+ "grad_norm": 1.2578125,
+ "learning_rate": 9.742508459099707e-08,
+ "loss": 1.7095,
+ "step": 935
+ },
+ {
+ "epoch": 2.5895316804407713,
+ "grad_norm": 1.2890625,
+ "learning_rate": 9.129801760890076e-08,
+ "loss": 1.7026,
+ "step": 940
+ },
+ {
+ "epoch": 2.603305785123967,
+ "grad_norm": 1.2578125,
+ "learning_rate": 8.536070621905811e-08,
+ "loss": 1.6964,
+ "step": 945
+ },
+ {
+ "epoch": 2.6170798898071626,
+ "grad_norm": 1.3046875,
+ "learning_rate": 7.961439025373617e-08,
+ "loss": 1.6984,
+ "step": 950
+ },
+ {
+ "epoch": 2.630853994490358,
+ "grad_norm": 1.28125,
+ "learning_rate": 7.40602696614444e-08,
+ "loss": 1.7022,
+ "step": 955
+ },
+ {
+ "epoch": 2.644628099173554,
+ "grad_norm": 1.2734375,
+ "learning_rate": 6.869950425636095e-08,
+ "loss": 1.6955,
+ "step": 960
+ },
+ {
+ "epoch": 2.6584022038567494,
+ "grad_norm": 1.265625,
+ "learning_rate": 6.353321347613815e-08,
+ "loss": 1.6962,
+ "step": 965
+ },
+ {
+ "epoch": 2.672176308539945,
+ "grad_norm": 1.3828125,
+ "learning_rate": 5.856247614814292e-08,
+ "loss": 1.6914,
+ "step": 970
+ },
+ {
+ "epoch": 2.6859504132231407,
+ "grad_norm": 1.296875,
+ "learning_rate": 5.3788330264174506e-08,
+ "loss": 1.6934,
+ "step": 975
+ },
+ {
+ "epoch": 2.6997245179063363,
+ "grad_norm": 1.28125,
+ "learning_rate": 4.921177276371069e-08,
+ "loss": 1.6947,
+ "step": 980
+ },
+ {
+ "epoch": 2.7134986225895315,
+ "grad_norm": 1.2890625,
+ "learning_rate": 4.483375932572597e-08,
+ "loss": 1.6929,
+ "step": 985
+ },
+ {
+ "epoch": 2.7272727272727275,
+ "grad_norm": 1.2890625,
+ "learning_rate": 4.0655204169127156e-08,
+ "loss": 1.6944,
+ "step": 990
+ },
+ {
+ "epoch": 2.7410468319559227,
+ "grad_norm": 1.2734375,
+ "learning_rate": 3.667697986184526e-08,
+ "loss": 1.6898,
+ "step": 995
+ },
+ {
+ "epoch": 2.7548209366391183,
+ "grad_norm": 1.2578125,
+ "learning_rate": 3.2899917138625055e-08,
+ "loss": 1.7061,
+ "step": 1000
+ },
+ {
+ "epoch": 2.768595041322314,
+ "grad_norm": 1.3125,
+ "learning_rate": 2.9324804727551055e-08,
+ "loss": 1.6974,
+ "step": 1005
+ },
+ {
+ "epoch": 2.7823691460055096,
+ "grad_norm": 1.2734375,
+ "learning_rate": 2.5952389185344925e-08,
+ "loss": 1.6892,
+ "step": 1010
+ },
+ {
+ "epoch": 2.796143250688705,
+ "grad_norm": 1.2421875,
+ "learning_rate": 2.2783374741469186e-08,
+ "loss": 1.696,
+ "step": 1015
+ },
+ {
+ "epoch": 2.809917355371901,
+ "grad_norm": 1.28125,
+ "learning_rate": 1.9818423151069406e-08,
+ "loss": 1.6879,
+ "step": 1020
+ },
+ {
+ "epoch": 2.8236914600550964,
+ "grad_norm": 1.3125,
+ "learning_rate": 1.705815355678619e-08,
+ "loss": 1.6943,
+ "step": 1025
+ },
+ {
+ "epoch": 2.837465564738292,
+ "grad_norm": 1.3046875,
+ "learning_rate": 1.4503142359465925e-08,
+ "loss": 1.6919,
+ "step": 1030
+ },
+ {
+ "epoch": 2.8512396694214877,
+ "grad_norm": 1.2421875,
+ "learning_rate": 1.215392309779617e-08,
+ "loss": 1.6907,
+ "step": 1035
+ },
+ {
+ "epoch": 2.8650137741046833,
+ "grad_norm": 1.2734375,
+ "learning_rate": 1.0010986336891458e-08,
+ "loss": 1.704,
+ "step": 1040
+ },
+ {
+ "epoch": 2.878787878787879,
+ "grad_norm": 1.2578125,
+ "learning_rate": 8.074779565854117e-09,
+ "loss": 1.691,
+ "step": 1045
+ },
+ {
+ "epoch": 2.8925619834710745,
+ "grad_norm": 1.3203125,
+ "learning_rate": 6.34570710432869e-09,
+ "loss": 1.6975,
+ "step": 1050
+ },
+ {
+ "epoch": 2.90633608815427,
+ "grad_norm": 1.2734375,
+ "learning_rate": 4.824130018072026e-09,
+ "loss": 1.6918,
+ "step": 1055
+ },
+ {
+ "epoch": 2.9201101928374653,
+ "grad_norm": 1.2890625,
+ "learning_rate": 3.5103660435551465e-09,
+ "loss": 1.6933,
+ "step": 1060
+ },
+ {
+ "epoch": 2.9338842975206614,
+ "grad_norm": 1.3046875,
+ "learning_rate": 2.4046895216136563e-09,
+ "loss": 1.6872,
+ "step": 1065
+ },
+ {
+ "epoch": 2.9476584022038566,
+ "grad_norm": 1.28125,
+ "learning_rate": 1.5073313401594568e-09,
+ "loss": 1.696,
+ "step": 1070
+ },
+ {
+ "epoch": 2.9614325068870526,
+ "grad_norm": 1.234375,
+ "learning_rate": 8.184788859667557e-10,
+ "loss": 1.6964,
+ "step": 1075
+ },
+ {
+ "epoch": 2.975206611570248,
+ "grad_norm": 1.3203125,
+ "learning_rate": 3.3827600554170444e-10,
+ "loss": 1.6941,
+ "step": 1080
+ },
+ {
+ "epoch": 2.9889807162534434,
+ "grad_norm": 1.3125,
+ "learning_rate": 6.682297508464608e-11,
+ "loss": 1.6993,
+ "step": 1085
  }
  ],
  "logging_steps": 5,
@@ -1059,12 +1571,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 4.2849341332112015e+18,
+ "total_flos": 6.427401199279931e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null