Ehsanl committed on
Commit
56b01d7
·
verified ·
1 Parent(s): b9094ae

Checkpoint 2032

Browse files
Files changed (3) hide show
  1. config.json +1 -1
  2. model.safetensors +2 -2
  3. trainer_state.json +359 -2
config.json CHANGED
@@ -7,7 +7,7 @@
7
  "bos_token_id": 0,
8
  "classifier_dropout": null,
9
  "cls_token_id": 0,
10
- "dtype": "bfloat16",
11
  "eos_token_id": 1,
12
  "hidden_act": "gelu",
13
  "hidden_dropout_prob": 0.1,
 
7
  "bos_token_id": 0,
8
  "classifier_dropout": null,
9
  "cls_token_id": 0,
10
+ "dtype": "float32",
11
  "eos_token_id": 1,
12
  "hidden_act": "gelu",
13
  "hidden_dropout_prob": 0.1,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16a91e00f07692e4bc2d08429911bc4fda978b089a21c0e77a8ccf1e257412a5
3
- size 688600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d962f5f2ab34269a713b2159215c844a6d7e6bb9da5a159e26d3a164845de2c2
3
+ size 1420398464
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.4955839057899902,
6
  "eval_steps": 500,
7
- "global_step": 1524,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1072,6 +1072,363 @@
1072
  "learning_rate": 2e-05,
1073
  "loss": 1.8683,
1074
  "step": 1520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1075
  }
1076
  ],
1077
  "logging_steps": 10,
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.9941118743866535,
6
  "eval_steps": 500,
7
+ "global_step": 2032,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1072
  "learning_rate": 2e-05,
1073
  "loss": 1.8683,
1074
  "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 1.5014720314033365,
1078
+ "grad_norm": 24.893234252929688,
1079
+ "learning_rate": 2e-05,
1080
+ "loss": 2.7447,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 1.5112855740922473,
1085
+ "grad_norm": 20.632429122924805,
1086
+ "learning_rate": 2e-05,
1087
+ "loss": 2.0633,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 1.521099116781158,
1092
+ "grad_norm": 11.803605079650879,
1093
+ "learning_rate": 2e-05,
1094
+ "loss": 1.4992,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 1.5309126594700686,
1099
+ "grad_norm": 11.409942626953125,
1100
+ "learning_rate": 2e-05,
1101
+ "loss": 1.9619,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 1.5407262021589794,
1106
+ "grad_norm": 19.285724639892578,
1107
+ "learning_rate": 2e-05,
1108
+ "loss": 2.4316,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 1.5505397448478901,
1113
+ "grad_norm": 3.482271909713745,
1114
+ "learning_rate": 2e-05,
1115
+ "loss": 1.8275,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 1.5603532875368007,
1120
+ "grad_norm": 8.602056503295898,
1121
+ "learning_rate": 2e-05,
1122
+ "loss": 1.8139,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 1.5701668302257115,
1127
+ "grad_norm": 6.880626201629639,
1128
+ "learning_rate": 2e-05,
1129
+ "loss": 1.162,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 1.5799803729146222,
1134
+ "grad_norm": 10.55600643157959,
1135
+ "learning_rate": 2e-05,
1136
+ "loss": 1.8477,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 1.5897939156035328,
1141
+ "grad_norm": 5.864988327026367,
1142
+ "learning_rate": 2e-05,
1143
+ "loss": 1.0593,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 1.5996074582924436,
1148
+ "grad_norm": 11.867835998535156,
1149
+ "learning_rate": 2e-05,
1150
+ "loss": 1.8334,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 1.6094210009813543,
1155
+ "grad_norm": 8.558449745178223,
1156
+ "learning_rate": 2e-05,
1157
+ "loss": 1.7685,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 1.6192345436702649,
1162
+ "grad_norm": 7.745001792907715,
1163
+ "learning_rate": 2e-05,
1164
+ "loss": 1.1716,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 1.6290480863591756,
1169
+ "grad_norm": 25.266897201538086,
1170
+ "learning_rate": 2e-05,
1171
+ "loss": 1.1115,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 1.6388616290480864,
1176
+ "grad_norm": 2.910959243774414,
1177
+ "learning_rate": 2e-05,
1178
+ "loss": 1.5551,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 1.648675171736997,
1183
+ "grad_norm": 3.1996586322784424,
1184
+ "learning_rate": 2e-05,
1185
+ "loss": 2.0908,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 1.6584887144259077,
1190
+ "grad_norm": 7.940663814544678,
1191
+ "learning_rate": 2e-05,
1192
+ "loss": 2.1272,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 1.6683022571148185,
1197
+ "grad_norm": 10.518881797790527,
1198
+ "learning_rate": 2e-05,
1199
+ "loss": 2.2504,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 1.678115799803729,
1204
+ "grad_norm": 13.522726058959961,
1205
+ "learning_rate": 2e-05,
1206
+ "loss": 3.0516,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 1.6879293424926398,
1211
+ "grad_norm": 72.19231414794922,
1212
+ "learning_rate": 2e-05,
1213
+ "loss": 1.6259,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 1.6977428851815506,
1218
+ "grad_norm": 16.836326599121094,
1219
+ "learning_rate": 2e-05,
1220
+ "loss": 1.5319,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 1.7075564278704611,
1225
+ "grad_norm": 5.992958068847656,
1226
+ "learning_rate": 2e-05,
1227
+ "loss": 1.9591,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 1.717369970559372,
1232
+ "grad_norm": 16.606359481811523,
1233
+ "learning_rate": 2e-05,
1234
+ "loss": 2.8002,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 1.7271835132482827,
1239
+ "grad_norm": 13.355613708496094,
1240
+ "learning_rate": 2e-05,
1241
+ "loss": 1.202,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 1.7369970559371932,
1246
+ "grad_norm": 22.692291259765625,
1247
+ "learning_rate": 2e-05,
1248
+ "loss": 1.5828,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 1.746810598626104,
1253
+ "grad_norm": 4.382213592529297,
1254
+ "learning_rate": 2e-05,
1255
+ "loss": 1.3851,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 1.7566241413150148,
1260
+ "grad_norm": 18.94695281982422,
1261
+ "learning_rate": 2e-05,
1262
+ "loss": 1.6604,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 1.7664376840039253,
1267
+ "grad_norm": 4.105762958526611,
1268
+ "learning_rate": 2e-05,
1269
+ "loss": 2.3467,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 1.776251226692836,
1274
+ "grad_norm": 17.720151901245117,
1275
+ "learning_rate": 2e-05,
1276
+ "loss": 1.0744,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 1.7860647693817469,
1281
+ "grad_norm": 23.243711471557617,
1282
+ "learning_rate": 2e-05,
1283
+ "loss": 1.5227,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 1.7958783120706574,
1288
+ "grad_norm": 9.93882942199707,
1289
+ "learning_rate": 2e-05,
1290
+ "loss": 2.278,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 1.8056918547595682,
1295
+ "grad_norm": 47.31532287597656,
1296
+ "learning_rate": 2e-05,
1297
+ "loss": 2.4806,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 1.815505397448479,
1302
+ "grad_norm": 18.85761260986328,
1303
+ "learning_rate": 2e-05,
1304
+ "loss": 1.2861,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 1.8253189401373895,
1309
+ "grad_norm": 12.545721054077148,
1310
+ "learning_rate": 2e-05,
1311
+ "loss": 1.8076,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 1.8351324828263003,
1316
+ "grad_norm": 3.5376434326171875,
1317
+ "learning_rate": 2e-05,
1318
+ "loss": 1.8752,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 1.844946025515211,
1323
+ "grad_norm": 7.608773708343506,
1324
+ "learning_rate": 2e-05,
1325
+ "loss": 1.2668,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 1.8547595682041216,
1330
+ "grad_norm": 15.594606399536133,
1331
+ "learning_rate": 2e-05,
1332
+ "loss": 2.4341,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 1.8645731108930323,
1337
+ "grad_norm": 10.585665702819824,
1338
+ "learning_rate": 2e-05,
1339
+ "loss": 1.6737,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 1.8743866535819431,
1344
+ "grad_norm": 3.565300703048706,
1345
+ "learning_rate": 2e-05,
1346
+ "loss": 1.1942,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 1.8842001962708537,
1351
+ "grad_norm": 3.813704490661621,
1352
+ "learning_rate": 2e-05,
1353
+ "loss": 1.5228,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 1.8940137389597644,
1358
+ "grad_norm": 11.792035102844238,
1359
+ "learning_rate": 2e-05,
1360
+ "loss": 1.3769,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 1.9038272816486752,
1365
+ "grad_norm": 6.388332366943359,
1366
+ "learning_rate": 2e-05,
1367
+ "loss": 2.39,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 1.9136408243375858,
1372
+ "grad_norm": 6.338537216186523,
1373
+ "learning_rate": 2e-05,
1374
+ "loss": 1.3043,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 1.9234543670264965,
1379
+ "grad_norm": 6.711911201477051,
1380
+ "learning_rate": 2e-05,
1381
+ "loss": 1.2017,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 1.9332679097154073,
1386
+ "grad_norm": 8.049627304077148,
1387
+ "learning_rate": 2e-05,
1388
+ "loss": 0.9924,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 1.9430814524043178,
1393
+ "grad_norm": 15.855047225952148,
1394
+ "learning_rate": 2e-05,
1395
+ "loss": 1.5458,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 1.9528949950932286,
1400
+ "grad_norm": 27.410795211791992,
1401
+ "learning_rate": 2e-05,
1402
+ "loss": 1.5845,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 1.9627085377821394,
1407
+ "grad_norm": 10.094039916992188,
1408
+ "learning_rate": 2e-05,
1409
+ "loss": 2.6201,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 1.97252208047105,
1414
+ "grad_norm": 7.608443260192871,
1415
+ "learning_rate": 2e-05,
1416
+ "loss": 2.2552,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 1.9823356231599607,
1421
+ "grad_norm": 6.73370885848999,
1422
+ "learning_rate": 2e-05,
1423
+ "loss": 1.299,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 1.9921491658488715,
1428
+ "grad_norm": 13.77723217010498,
1429
+ "learning_rate": 2e-05,
1430
+ "loss": 1.3727,
1431
+ "step": 2030
1432
  }
1433
  ],
1434
  "logging_steps": 10,