shulijia commited on
Commit
1dc352d
·
verified ·
1 Parent(s): 3641622

Training in progress, step 1459, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:330b09269164b35b0f3c54a9cf983afbfeb081ef2b5b0170118f717863e196a1
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9831bb5e1e35eb5ed4ecb1b927fa6851453cf5a06fc1cc4390a9a213b31e41bc
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:552d37d315f2eaac7d9ad9bb5d2d51f32d6d568481f2f1bdd97301c2e32568de
3
  size 4768662910
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:643c1fb2bb7ea8916b25d0d480037eb4d301934ce55538f041d5e3fe49515bbd
3
  size 4768662910
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4dbcc143299cdd7d867d2d634f760dd811a54474128b1f304d7494dc68139ed8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66e08749a49f64992fd7ac7288e0b5870a058f2e4c931055f9ea11441a8f2414
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6849901532665468,
6
  "eval_steps": 100,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -908,6 +908,411 @@
908
  "mean_token_accuracy": 0.974119370430708,
909
  "num_tokens": 4096000.0,
910
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911
  }
912
  ],
913
  "logging_steps": 10,
@@ -922,12 +1327,12 @@
922
  "should_evaluate": false,
923
  "should_log": false,
924
  "should_save": true,
925
- "should_training_stop": false
926
  },
927
  "attributes": {}
928
  }
929
  },
930
- "total_flos": 1.0824928198656e+16,
931
  "train_batch_size": 1,
932
  "trial_name": null,
933
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9994006336158918,
6
  "eval_steps": 100,
7
+ "global_step": 1459,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
908
  "mean_token_accuracy": 0.974119370430708,
909
  "num_tokens": 4096000.0,
910
  "step": 1000
911
+ },
912
+ {
913
+ "epoch": 0.6918400547992123,
914
+ "grad_norm": 0.7098048329353333,
915
+ "learning_rate": 1.7136329017517136e-05,
916
+ "loss": 0.1468,
917
+ "mean_token_accuracy": 0.9731898210942745,
918
+ "num_tokens": 4136960.0,
919
+ "step": 1010
920
+ },
921
+ {
922
+ "epoch": 0.6986899563318777,
923
+ "grad_norm": 0.784039318561554,
924
+ "learning_rate": 1.6755521706016754e-05,
925
+ "loss": 0.1281,
926
+ "mean_token_accuracy": 0.9749999955296517,
927
+ "num_tokens": 4177920.0,
928
+ "step": 1020
929
+ },
930
+ {
931
+ "epoch": 0.7055398578645432,
932
+ "grad_norm": 0.7651719450950623,
933
+ "learning_rate": 1.6374714394516375e-05,
934
+ "loss": 0.1362,
935
+ "mean_token_accuracy": 0.9742906041443348,
936
+ "num_tokens": 4218880.0,
937
+ "step": 1030
938
+ },
939
+ {
940
+ "epoch": 0.7123897593972086,
941
+ "grad_norm": 0.6100867390632629,
942
+ "learning_rate": 1.5993907083015994e-05,
943
+ "loss": 0.132,
944
+ "mean_token_accuracy": 0.9753180012106896,
945
+ "num_tokens": 4259840.0,
946
+ "step": 1040
947
+ },
948
+ {
949
+ "epoch": 0.7192396609298741,
950
+ "grad_norm": 0.789665162563324,
951
+ "learning_rate": 1.561309977151561e-05,
952
+ "loss": 0.1362,
953
+ "mean_token_accuracy": 0.974975535273552,
954
+ "num_tokens": 4300800.0,
955
+ "step": 1050
956
+ },
957
+ {
958
+ "epoch": 0.7260895624625396,
959
+ "grad_norm": 0.7343530058860779,
960
+ "learning_rate": 1.5232292460015233e-05,
961
+ "loss": 0.1338,
962
+ "mean_token_accuracy": 0.9751712307333946,
963
+ "num_tokens": 4341760.0,
964
+ "step": 1060
965
+ },
966
+ {
967
+ "epoch": 0.732939463995205,
968
+ "grad_norm": 0.6612673997879028,
969
+ "learning_rate": 1.4851485148514851e-05,
970
+ "loss": 0.1346,
971
+ "mean_token_accuracy": 0.9744862951338291,
972
+ "num_tokens": 4382720.0,
973
+ "step": 1070
974
+ },
975
+ {
976
+ "epoch": 0.7397893655278706,
977
+ "grad_norm": 0.6594858765602112,
978
+ "learning_rate": 1.4470677837014471e-05,
979
+ "loss": 0.1239,
980
+ "mean_token_accuracy": 0.9765900149941444,
981
+ "num_tokens": 4423680.0,
982
+ "step": 1080
983
+ },
984
+ {
985
+ "epoch": 0.746639267060536,
986
+ "grad_norm": 0.8898121118545532,
987
+ "learning_rate": 1.408987052551409e-05,
988
+ "loss": 0.1309,
989
+ "mean_token_accuracy": 0.9749021507799626,
990
+ "num_tokens": 4464640.0,
991
+ "step": 1090
992
+ },
993
+ {
994
+ "epoch": 0.7534891685932015,
995
+ "grad_norm": 1.0296205282211304,
996
+ "learning_rate": 1.3709063214013709e-05,
997
+ "loss": 0.1333,
998
+ "mean_token_accuracy": 0.9747553780674935,
999
+ "num_tokens": 4505600.0,
1000
+ "step": 1100
1001
+ },
1002
+ {
1003
+ "epoch": 0.760339070125867,
1004
+ "grad_norm": 0.7691386938095093,
1005
+ "learning_rate": 1.3328255902513328e-05,
1006
+ "loss": 0.1371,
1007
+ "mean_token_accuracy": 0.9743639886379242,
1008
+ "num_tokens": 4546560.0,
1009
+ "step": 1110
1010
+ },
1011
+ {
1012
+ "epoch": 0.7671889716585324,
1013
+ "grad_norm": 0.6772127151489258,
1014
+ "learning_rate": 1.2947448591012948e-05,
1015
+ "loss": 0.1272,
1016
+ "mean_token_accuracy": 0.9763209350407124,
1017
+ "num_tokens": 4587520.0,
1018
+ "step": 1120
1019
+ },
1020
+ {
1021
+ "epoch": 0.7740388731911979,
1022
+ "grad_norm": 0.8715652227401733,
1023
+ "learning_rate": 1.2566641279512566e-05,
1024
+ "loss": 0.1355,
1025
+ "mean_token_accuracy": 0.9732142828404904,
1026
+ "num_tokens": 4628480.0,
1027
+ "step": 1130
1028
+ },
1029
+ {
1030
+ "epoch": 0.7808887747238633,
1031
+ "grad_norm": 0.723171055316925,
1032
+ "learning_rate": 1.2185833968012186e-05,
1033
+ "loss": 0.1303,
1034
+ "mean_token_accuracy": 0.9756360031664372,
1035
+ "num_tokens": 4669440.0,
1036
+ "step": 1140
1037
+ },
1038
+ {
1039
+ "epoch": 0.7877386762565288,
1040
+ "grad_norm": 0.7655256986618042,
1041
+ "learning_rate": 1.1805026656511806e-05,
1042
+ "loss": 0.1344,
1043
+ "mean_token_accuracy": 0.9737279817461968,
1044
+ "num_tokens": 4710400.0,
1045
+ "step": 1150
1046
+ },
1047
+ {
1048
+ "epoch": 0.7945885777891942,
1049
+ "grad_norm": 0.7253991365432739,
1050
+ "learning_rate": 1.1424219345011426e-05,
1051
+ "loss": 0.1303,
1052
+ "mean_token_accuracy": 0.9757093906402587,
1053
+ "num_tokens": 4751360.0,
1054
+ "step": 1160
1055
+ },
1056
+ {
1057
+ "epoch": 0.8014384793218597,
1058
+ "grad_norm": 0.8294143080711365,
1059
+ "learning_rate": 1.1043412033511044e-05,
1060
+ "loss": 0.1349,
1061
+ "mean_token_accuracy": 0.9750489175319672,
1062
+ "num_tokens": 4792320.0,
1063
+ "step": 1170
1064
+ },
1065
+ {
1066
+ "epoch": 0.8082883808545253,
1067
+ "grad_norm": 0.7206155061721802,
1068
+ "learning_rate": 1.0662604722010663e-05,
1069
+ "loss": 0.1298,
1070
+ "mean_token_accuracy": 0.975220151245594,
1071
+ "num_tokens": 4833280.0,
1072
+ "step": 1180
1073
+ },
1074
+ {
1075
+ "epoch": 0.8151382823871907,
1076
+ "grad_norm": 0.6955848336219788,
1077
+ "learning_rate": 1.0281797410510283e-05,
1078
+ "loss": 0.1245,
1079
+ "mean_token_accuracy": 0.9763453997671604,
1080
+ "num_tokens": 4874240.0,
1081
+ "step": 1190
1082
+ },
1083
+ {
1084
+ "epoch": 0.8219881839198562,
1085
+ "grad_norm": 0.9156601428985596,
1086
+ "learning_rate": 9.900990099009901e-06,
1087
+ "loss": 0.1289,
1088
+ "mean_token_accuracy": 0.9753913819789887,
1089
+ "num_tokens": 4915200.0,
1090
+ "step": 1200
1091
+ },
1092
+ {
1093
+ "epoch": 0.8288380854525216,
1094
+ "grad_norm": 0.7370653748512268,
1095
+ "learning_rate": 9.520182787509521e-06,
1096
+ "loss": 0.1268,
1097
+ "mean_token_accuracy": 0.9757827781140804,
1098
+ "num_tokens": 4956160.0,
1099
+ "step": 1210
1100
+ },
1101
+ {
1102
+ "epoch": 0.8356879869851871,
1103
+ "grad_norm": 0.8186970353126526,
1104
+ "learning_rate": 9.13937547600914e-06,
1105
+ "loss": 0.139,
1106
+ "mean_token_accuracy": 0.9738747522234916,
1107
+ "num_tokens": 4997120.0,
1108
+ "step": 1220
1109
+ },
1110
+ {
1111
+ "epoch": 0.8425378885178526,
1112
+ "grad_norm": 0.7266510128974915,
1113
+ "learning_rate": 8.758568164508759e-06,
1114
+ "loss": 0.1344,
1115
+ "mean_token_accuracy": 0.9744373761117459,
1116
+ "num_tokens": 5038080.0,
1117
+ "step": 1230
1118
+ },
1119
+ {
1120
+ "epoch": 0.849387790050518,
1121
+ "grad_norm": 0.7429907321929932,
1122
+ "learning_rate": 8.377760853008377e-06,
1123
+ "loss": 0.136,
1124
+ "mean_token_accuracy": 0.9745841458439827,
1125
+ "num_tokens": 5079040.0,
1126
+ "step": 1240
1127
+ },
1128
+ {
1129
+ "epoch": 0.8562376915831835,
1130
+ "grad_norm": 0.6797091364860535,
1131
+ "learning_rate": 7.996953541507997e-06,
1132
+ "loss": 0.1364,
1133
+ "mean_token_accuracy": 0.9743395261466503,
1134
+ "num_tokens": 5120000.0,
1135
+ "step": 1250
1136
+ },
1137
+ {
1138
+ "epoch": 0.8630875931158489,
1139
+ "grad_norm": 0.6236211061477661,
1140
+ "learning_rate": 7.6161462300076165e-06,
1141
+ "loss": 0.1247,
1142
+ "mean_token_accuracy": 0.9762230902910233,
1143
+ "num_tokens": 5160960.0,
1144
+ "step": 1260
1145
+ },
1146
+ {
1147
+ "epoch": 0.8699374946485144,
1148
+ "grad_norm": 1.0793285369873047,
1149
+ "learning_rate": 7.2353389185072354e-06,
1150
+ "loss": 0.1384,
1151
+ "mean_token_accuracy": 0.9733121328055858,
1152
+ "num_tokens": 5201920.0,
1153
+ "step": 1270
1154
+ },
1155
+ {
1156
+ "epoch": 0.8767873961811798,
1157
+ "grad_norm": 0.7861872911453247,
1158
+ "learning_rate": 6.854531607006854e-06,
1159
+ "loss": 0.1303,
1160
+ "mean_token_accuracy": 0.9757093906402587,
1161
+ "num_tokens": 5242880.0,
1162
+ "step": 1280
1163
+ },
1164
+ {
1165
+ "epoch": 0.8836372977138454,
1166
+ "grad_norm": 0.6387534141540527,
1167
+ "learning_rate": 6.473724295506474e-06,
1168
+ "loss": 0.1276,
1169
+ "mean_token_accuracy": 0.9762964725494385,
1170
+ "num_tokens": 5283840.0,
1171
+ "step": 1290
1172
+ },
1173
+ {
1174
+ "epoch": 0.8904871992465109,
1175
+ "grad_norm": 0.8344358801841736,
1176
+ "learning_rate": 6.092916984006093e-06,
1177
+ "loss": 0.1265,
1178
+ "mean_token_accuracy": 0.9753424629569054,
1179
+ "num_tokens": 5324800.0,
1180
+ "step": 1300
1181
+ },
1182
+ {
1183
+ "epoch": 0.8973371007791763,
1184
+ "grad_norm": 0.847582995891571,
1185
+ "learning_rate": 5.712109672505713e-06,
1186
+ "loss": 0.1279,
1187
+ "mean_token_accuracy": 0.9758561603724957,
1188
+ "num_tokens": 5365760.0,
1189
+ "step": 1310
1190
+ },
1191
+ {
1192
+ "epoch": 0.9041870023118418,
1193
+ "grad_norm": 0.9168363809585571,
1194
+ "learning_rate": 5.331302361005332e-06,
1195
+ "loss": 0.1372,
1196
+ "mean_token_accuracy": 0.9740215204656124,
1197
+ "num_tokens": 5406720.0,
1198
+ "step": 1320
1199
+ },
1200
+ {
1201
+ "epoch": 0.9110369038445072,
1202
+ "grad_norm": 0.8992748856544495,
1203
+ "learning_rate": 4.950495049504951e-06,
1204
+ "loss": 0.1267,
1205
+ "mean_token_accuracy": 0.9755381584167481,
1206
+ "num_tokens": 5447680.0,
1207
+ "step": 1330
1208
+ },
1209
+ {
1210
+ "epoch": 0.9178868053771727,
1211
+ "grad_norm": 0.8741844296455383,
1212
+ "learning_rate": 4.56968773800457e-06,
1213
+ "loss": 0.1305,
1214
+ "mean_token_accuracy": 0.974828764051199,
1215
+ "num_tokens": 5488640.0,
1216
+ "step": 1340
1217
+ },
1218
+ {
1219
+ "epoch": 0.9247367069098382,
1220
+ "grad_norm": 0.8191506862640381,
1221
+ "learning_rate": 4.1888804265041885e-06,
1222
+ "loss": 0.1298,
1223
+ "mean_token_accuracy": 0.9747309193015099,
1224
+ "num_tokens": 5529600.0,
1225
+ "step": 1350
1226
+ },
1227
+ {
1228
+ "epoch": 0.9315866084425036,
1229
+ "grad_norm": 0.7575434446334839,
1230
+ "learning_rate": 3.8080731150038083e-06,
1231
+ "loss": 0.1357,
1232
+ "mean_token_accuracy": 0.9738013677299022,
1233
+ "num_tokens": 5570560.0,
1234
+ "step": 1360
1235
+ },
1236
+ {
1237
+ "epoch": 0.9384365099751691,
1238
+ "grad_norm": 0.8960770964622498,
1239
+ "learning_rate": 3.427265803503427e-06,
1240
+ "loss": 0.1213,
1241
+ "mean_token_accuracy": 0.9772749491035938,
1242
+ "num_tokens": 5611520.0,
1243
+ "step": 1370
1244
+ },
1245
+ {
1246
+ "epoch": 0.9452864115078345,
1247
+ "grad_norm": 0.6886873841285706,
1248
+ "learning_rate": 3.0464584920030465e-06,
1249
+ "loss": 0.1228,
1250
+ "mean_token_accuracy": 0.9769324824213982,
1251
+ "num_tokens": 5652480.0,
1252
+ "step": 1380
1253
+ },
1254
+ {
1255
+ "epoch": 0.9521363130405001,
1256
+ "grad_norm": 0.606661319732666,
1257
+ "learning_rate": 2.665651180502666e-06,
1258
+ "loss": 0.1297,
1259
+ "mean_token_accuracy": 0.9750244595110417,
1260
+ "num_tokens": 5693440.0,
1261
+ "step": 1390
1262
+ },
1263
+ {
1264
+ "epoch": 0.9589862145731655,
1265
+ "grad_norm": 0.7276999950408936,
1266
+ "learning_rate": 2.284843869002285e-06,
1267
+ "loss": 0.1398,
1268
+ "mean_token_accuracy": 0.9740215227007866,
1269
+ "num_tokens": 5734400.0,
1270
+ "step": 1400
1271
+ },
1272
+ {
1273
+ "epoch": 0.965836116105831,
1274
+ "grad_norm": 0.7728492021560669,
1275
+ "learning_rate": 1.9040365575019041e-06,
1276
+ "loss": 0.1288,
1277
+ "mean_token_accuracy": 0.9757827736437321,
1278
+ "num_tokens": 5775360.0,
1279
+ "step": 1410
1280
+ },
1281
+ {
1282
+ "epoch": 0.9726860176384965,
1283
+ "grad_norm": 0.6705936193466187,
1284
+ "learning_rate": 1.5232292460015233e-06,
1285
+ "loss": 0.1246,
1286
+ "mean_token_accuracy": 0.9765166334807873,
1287
+ "num_tokens": 5816320.0,
1288
+ "step": 1420
1289
+ },
1290
+ {
1291
+ "epoch": 0.9795359191711619,
1292
+ "grad_norm": 1.092812418937683,
1293
+ "learning_rate": 1.1424219345011426e-06,
1294
+ "loss": 0.1269,
1295
+ "mean_token_accuracy": 0.9758806228637695,
1296
+ "num_tokens": 5857280.0,
1297
+ "step": 1430
1298
+ },
1299
+ {
1300
+ "epoch": 0.9863858207038274,
1301
+ "grad_norm": 0.7121983170509338,
1302
+ "learning_rate": 7.616146230007616e-07,
1303
+ "loss": 0.1245,
1304
+ "mean_token_accuracy": 0.9758806228637695,
1305
+ "num_tokens": 5898240.0,
1306
+ "step": 1440
1307
+ },
1308
+ {
1309
+ "epoch": 0.9932357222364928,
1310
+ "grad_norm": 0.6971803307533264,
1311
+ "learning_rate": 3.808073115003808e-07,
1312
+ "loss": 0.1298,
1313
+ "mean_token_accuracy": 0.9753179982304573,
1314
+ "num_tokens": 5939200.0,
1315
+ "step": 1450
1316
  }
1317
  ],
1318
  "logging_steps": 10,
 
1327
  "should_evaluate": false,
1328
  "should_log": false,
1329
  "should_save": true,
1330
+ "should_training_stop": true
1331
  },
1332
  "attributes": {}
1333
  }
1334
  },
1335
+ "total_flos": 1.5793570241839104e+16,
1336
  "train_batch_size": 1,
1337
  "trial_name": null,
1338
  "trial_params": null