{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.901098901098901,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 5716.10205078125,
      "learning_rate": 3.0000000000000004e-08,
      "loss": 1383.1007,
      "step": 5
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 12343.9111328125,
      "learning_rate": 6.75e-08,
      "loss": 1349.0135,
      "step": 10
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 10210.48046875,
      "learning_rate": 1.05e-07,
      "loss": 1340.3351,
      "step": 15
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 7633.39599609375,
      "learning_rate": 1.425e-07,
      "loss": 1369.3844,
      "step": 20
    },
    {
      "epoch": 0.5494505494505495,
      "grad_norm": 12217.7294921875,
      "learning_rate": 1.8e-07,
      "loss": 1362.663,
      "step": 25
    },
    {
      "epoch": 0.6593406593406593,
      "grad_norm": 31348.806640625,
      "learning_rate": 2.175e-07,
      "loss": 1361.9012,
      "step": 30
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 7385.05322265625,
      "learning_rate": 2.5500000000000005e-07,
      "loss": 1365.5517,
      "step": 35
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 15594.4677734375,
      "learning_rate": 2.925e-07,
      "loss": 1341.6402,
      "step": 40
    },
    {
      "epoch": 0.989010989010989,
      "grad_norm": 6404.94775390625,
      "learning_rate": 3.2999999999999996e-07,
      "loss": 1343.9832,
      "step": 45
    },
    {
      "epoch": 1.10989010989011,
      "grad_norm": 6927.478515625,
      "learning_rate": 3.6750000000000003e-07,
      "loss": 1611.4768,
      "step": 50
    },
    {
      "epoch": 1.2197802197802199,
      "grad_norm": 12092.9287109375,
      "learning_rate": 4.05e-07,
      "loss": 1330.508,
      "step": 55
    },
    {
      "epoch": 1.3296703296703296,
      "grad_norm": 5830.10693359375,
      "learning_rate": 4.425e-07,
      "loss": 1333.5215,
      "step": 60
    },
    {
      "epoch": 1.4395604395604396,
      "grad_norm": 5059.5302734375,
      "learning_rate": 4.800000000000001e-07,
      "loss": 1305.5257,
      "step": 65
    },
    {
      "epoch": 1.5494505494505495,
      "grad_norm": 14086.837890625,
      "learning_rate": 5.175e-07,
      "loss": 1293.7212,
      "step": 70
    },
    {
      "epoch": 1.6593406593406592,
      "grad_norm": 4606.49462890625,
      "learning_rate": 5.55e-07,
      "loss": 1293.7292,
      "step": 75
    },
    {
      "epoch": 1.7692307692307692,
      "grad_norm": 7120.25244140625,
      "learning_rate": 5.925e-07,
      "loss": 1283.8888,
      "step": 80
    },
    {
      "epoch": 1.879120879120879,
      "grad_norm": 4402.51513671875,
      "learning_rate": 6.3e-07,
      "loss": 1270.6494,
      "step": 85
    },
    {
      "epoch": 1.989010989010989,
      "grad_norm": 4826.724609375,
      "learning_rate": 6.675e-07,
      "loss": 1263.9209,
      "step": 90
    },
    {
      "epoch": 2.087912087912088,
      "grad_norm": 4356.83056640625,
      "learning_rate": 7.05e-07,
      "loss": 1255.111,
      "step": 95
    },
    {
      "epoch": 2.197802197802198,
      "grad_norm": 6895.61962890625,
      "learning_rate": 7.425000000000001e-07,
      "loss": 1224.4364,
      "step": 100
    },
    {
      "epoch": 2.3076923076923075,
      "grad_norm": 4793.59375,
      "learning_rate": 7.799999999999999e-07,
      "loss": 1211.3252,
      "step": 105
    },
    {
      "epoch": 2.4175824175824174,
      "grad_norm": 6701.6357421875,
      "learning_rate": 8.175e-07,
      "loss": 1183.1627,
      "step": 110
    },
    {
      "epoch": 2.5274725274725274,
      "grad_norm": 4253.81005859375,
      "learning_rate": 8.550000000000001e-07,
      "loss": 1179.29,
      "step": 115
    },
    {
      "epoch": 2.6373626373626373,
      "grad_norm": 4965.86181640625,
      "learning_rate": 8.925e-07,
      "loss": 1168.3469,
      "step": 120
    },
    {
      "epoch": 2.7472527472527473,
      "grad_norm": 3376.816650390625,
      "learning_rate": 9.3e-07,
      "loss": 1139.6433,
      "step": 125
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 3226.073486328125,
      "learning_rate": 9.675e-07,
      "loss": 1133.0777,
      "step": 130
    },
    {
      "epoch": 2.967032967032967,
      "grad_norm": 3560.005126953125,
      "learning_rate": 1.0050000000000001e-06,
      "loss": 1113.3471,
      "step": 135
    },
    {
      "epoch": 3.065934065934066,
      "grad_norm": 2917.787109375,
      "learning_rate": 1.0425000000000002e-06,
      "loss": 1094.2038,
      "step": 140
    },
    {
      "epoch": 3.1758241758241756,
      "grad_norm": 3291.990478515625,
      "learning_rate": 1.08e-06,
      "loss": 1069.3087,
      "step": 145
    },
    {
      "epoch": 3.2857142857142856,
      "grad_norm": 3082.956298828125,
      "learning_rate": 1.1174999999999999e-06,
      "loss": 1043.3319,
      "step": 150
    },
    {
      "epoch": 3.3956043956043955,
      "grad_norm": 2947.577392578125,
      "learning_rate": 1.155e-06,
      "loss": 1040.2479,
      "step": 155
    },
    {
      "epoch": 3.5054945054945055,
      "grad_norm": 2917.072021484375,
      "learning_rate": 1.1925e-06,
      "loss": 1014.5039,
      "step": 160
    },
    {
      "epoch": 3.6153846153846154,
      "grad_norm": 2572.1064453125,
      "learning_rate": 1.23e-06,
      "loss": 980.6475,
      "step": 165
    },
    {
      "epoch": 3.7252747252747254,
      "grad_norm": 3038.7041015625,
      "learning_rate": 1.2675000000000001e-06,
      "loss": 969.8687,
      "step": 170
    },
    {
      "epoch": 3.8351648351648353,
      "grad_norm": 2459.19482421875,
      "learning_rate": 1.305e-06,
      "loss": 946.8183,
      "step": 175
    },
    {
      "epoch": 3.9450549450549453,
      "grad_norm": 2485.954345703125,
      "learning_rate": 1.3425e-06,
      "loss": 923.6708,
      "step": 180
    },
    {
      "epoch": 4.043956043956044,
      "grad_norm": 2574.14599609375,
      "learning_rate": 1.38e-06,
      "loss": 906.8975,
      "step": 185
    },
    {
      "epoch": 4.153846153846154,
      "grad_norm": 2498.670166015625,
      "learning_rate": 1.4175e-06,
      "loss": 879.6545,
      "step": 190
    },
    {
      "epoch": 4.263736263736264,
      "grad_norm": 2621.924072265625,
      "learning_rate": 1.455e-06,
      "loss": 859.6516,
      "step": 195
    },
    {
      "epoch": 4.373626373626374,
      "grad_norm": 2298.13671875,
      "learning_rate": 1.4925000000000001e-06,
      "loss": 835.168,
      "step": 200
    },
    {
      "epoch": 4.483516483516484,
      "grad_norm": 2228.95458984375,
      "learning_rate": 1.53e-06,
      "loss": 817.8176,
      "step": 205
    },
    {
      "epoch": 4.593406593406593,
      "grad_norm": 2075.57568359375,
      "learning_rate": 1.5675e-06,
      "loss": 795.5578,
      "step": 210
    },
    {
      "epoch": 4.7032967032967035,
      "grad_norm": 2308.193603515625,
      "learning_rate": 1.605e-06,
      "loss": 769.8123,
      "step": 215
    },
    {
      "epoch": 4.813186813186813,
      "grad_norm": 2495.91259765625,
      "learning_rate": 1.6425e-06,
      "loss": 745.4971,
      "step": 220
    },
    {
      "epoch": 4.923076923076923,
      "grad_norm": 2490.6796875,
      "learning_rate": 1.68e-06,
      "loss": 724.5927,
      "step": 225
    },
    {
      "epoch": 5.021978021978022,
      "grad_norm": 1986.7320556640625,
      "learning_rate": 1.7175e-06,
      "loss": 711.5865,
      "step": 230
    },
    {
      "epoch": 5.131868131868132,
      "grad_norm": 1901.4664306640625,
      "learning_rate": 1.7550000000000001e-06,
      "loss": 692.5993,
      "step": 235
    },
    {
      "epoch": 5.241758241758242,
      "grad_norm": 2567.854248046875,
      "learning_rate": 1.7925e-06,
      "loss": 665.2945,
      "step": 240
    },
    {
      "epoch": 5.351648351648351,
      "grad_norm": 1668.3482666015625,
      "learning_rate": 1.83e-06,
      "loss": 648.977,
      "step": 245
    },
    {
      "epoch": 5.461538461538462,
      "grad_norm": 1845.254150390625,
      "learning_rate": 1.8675000000000001e-06,
      "loss": 627.8005,
      "step": 250
    },
    {
      "epoch": 5.571428571428571,
      "grad_norm": 1811.66845703125,
      "learning_rate": 1.905e-06,
      "loss": 606.1213,
      "step": 255
    },
    {
      "epoch": 5.681318681318682,
      "grad_norm": 1720.802734375,
      "learning_rate": 1.9425e-06,
      "loss": 587.5198,
      "step": 260
    },
    {
      "epoch": 5.791208791208791,
      "grad_norm": 1535.228759765625,
      "learning_rate": 1.98e-06,
      "loss": 570.1765,
      "step": 265
    },
    {
      "epoch": 5.9010989010989015,
      "grad_norm": 1655.658447265625,
      "learning_rate": 2.0175e-06,
      "loss": 551.0042,
      "step": 270
    },
    {
      "epoch": 6.0,
      "grad_norm": 3979.0732421875,
      "learning_rate": 2.0550000000000002e-06,
      "loss": 535.4148,
      "step": 275
    },
    {
      "epoch": 6.1098901098901095,
      "grad_norm": 1727.4771728515625,
      "learning_rate": 2.0925000000000003e-06,
      "loss": 519.8219,
      "step": 280
    },
    {
      "epoch": 6.21978021978022,
      "grad_norm": 1350.4666748046875,
      "learning_rate": 2.13e-06,
      "loss": 510.5004,
      "step": 285
    },
    {
      "epoch": 6.329670329670329,
      "grad_norm": 2242.578857421875,
      "learning_rate": 2.1675e-06,
      "loss": 497.497,
      "step": 290
    },
    {
      "epoch": 6.43956043956044,
      "grad_norm": 1353.7908935546875,
      "learning_rate": 2.205e-06,
      "loss": 476.5748,
      "step": 295
    },
    {
      "epoch": 6.549450549450549,
      "grad_norm": 1527.2796630859375,
      "learning_rate": 2.2425e-06,
      "loss": 468.7679,
      "step": 300
    },
    {
      "epoch": 6.65934065934066,
      "grad_norm": 1145.3853759765625,
      "learning_rate": 2.28e-06,
      "loss": 454.7695,
      "step": 305
    },
    {
      "epoch": 6.769230769230769,
      "grad_norm": 1233.435302734375,
      "learning_rate": 2.3175e-06,
      "loss": 438.3474,
      "step": 310
    },
    {
      "epoch": 6.8791208791208796,
      "grad_norm": 1669.4859619140625,
      "learning_rate": 2.355e-06,
      "loss": 424.2438,
      "step": 315
    },
    {
      "epoch": 6.989010989010989,
      "grad_norm": 1667.7239990234375,
      "learning_rate": 2.3925e-06,
      "loss": 412.884,
      "step": 320
    },
    {
      "epoch": 7.087912087912088,
      "grad_norm": 1087.6927490234375,
      "learning_rate": 2.43e-06,
      "loss": 399.3542,
      "step": 325
    },
    {
      "epoch": 7.197802197802198,
      "grad_norm": 1415.59765625,
      "learning_rate": 2.4675e-06,
      "loss": 387.3436,
      "step": 330
    },
    {
      "epoch": 7.3076923076923075,
      "grad_norm": 1952.5645751953125,
      "learning_rate": 2.505e-06,
      "loss": 377.6118,
      "step": 335
    },
    {
      "epoch": 7.417582417582418,
      "grad_norm": 1404.5712890625,
      "learning_rate": 2.5425000000000002e-06,
      "loss": 371.8316,
      "step": 340
    },
    {
      "epoch": 7.527472527472527,
      "grad_norm": 1185.5135498046875,
      "learning_rate": 2.58e-06,
      "loss": 363.4571,
      "step": 345
    },
    {
      "epoch": 7.637362637362637,
      "grad_norm": 4727.9638671875,
      "learning_rate": 2.6175e-06,
      "loss": 354.3981,
      "step": 350
    },
    {
      "epoch": 7.747252747252747,
      "grad_norm": 937.0252075195312,
      "learning_rate": 2.655e-06,
      "loss": 344.7615,
      "step": 355
    },
    {
      "epoch": 7.857142857142857,
      "grad_norm": 1180.9298095703125,
      "learning_rate": 2.6925e-06,
      "loss": 340.5888,
      "step": 360
    },
    {
      "epoch": 7.967032967032967,
      "grad_norm": 1405.458984375,
      "learning_rate": 2.73e-06,
      "loss": 326.6432,
      "step": 365
    },
    {
      "epoch": 8.065934065934066,
      "grad_norm": 936.8318481445312,
      "learning_rate": 2.7675e-06,
      "loss": 317.4954,
      "step": 370
    },
    {
      "epoch": 8.175824175824175,
      "grad_norm": 888.236328125,
      "learning_rate": 2.8050000000000002e-06,
      "loss": 309.4422,
      "step": 375
    },
    {
      "epoch": 8.285714285714286,
      "grad_norm": 970.7135620117188,
      "learning_rate": 2.8425e-06,
      "loss": 301.2726,
      "step": 380
    },
    {
      "epoch": 8.395604395604396,
      "grad_norm": 1607.8035888671875,
      "learning_rate": 2.88e-06,
      "loss": 295.7361,
      "step": 385
    },
    {
      "epoch": 8.505494505494505,
      "grad_norm": 814.48486328125,
      "learning_rate": 2.9175e-06,
      "loss": 290.0745,
      "step": 390
    },
    {
      "epoch": 8.615384615384615,
      "grad_norm": 803.1909790039062,
      "learning_rate": 2.955e-06,
      "loss": 279.9774,
      "step": 395
    },
    {
      "epoch": 8.725274725274724,
      "grad_norm": 1034.652099609375,
      "learning_rate": 2.9925e-06,
      "loss": 277.7123,
      "step": 400
    },
    {
      "epoch": 8.835164835164836,
      "grad_norm": 993.9649658203125,
      "learning_rate": 3.0300000000000002e-06,
      "loss": 267.6938,
      "step": 405
    },
    {
      "epoch": 8.945054945054945,
      "grad_norm": 1010.8787841796875,
      "learning_rate": 3.0675e-06,
      "loss": 266.7134,
      "step": 410
    },
    {
      "epoch": 9.043956043956044,
      "grad_norm": 762.2285766601562,
      "learning_rate": 3.105e-06,
      "loss": 264.4567,
      "step": 415
    },
    {
      "epoch": 9.153846153846153,
      "grad_norm": 5807.85791015625,
      "learning_rate": 3.1425e-06,
      "loss": 254.0208,
      "step": 420
    },
    {
      "epoch": 9.263736263736265,
      "grad_norm": 717.2619018554688,
      "learning_rate": 3.18e-06,
      "loss": 249.0201,
      "step": 425
    },
    {
      "epoch": 9.373626373626374,
      "grad_norm": 725.6996459960938,
      "learning_rate": 3.2175e-06,
      "loss": 243.3289,
      "step": 430
    },
    {
      "epoch": 9.483516483516484,
      "grad_norm": 761.1790161132812,
      "learning_rate": 3.255e-06,
      "loss": 238.8994,
      "step": 435
    },
    {
      "epoch": 9.593406593406593,
      "grad_norm": 792.3602905273438,
      "learning_rate": 3.2925000000000002e-06,
      "loss": 231.8852,
      "step": 440
    },
    {
      "epoch": 9.703296703296703,
      "grad_norm": 744.7413330078125,
      "learning_rate": 3.3300000000000003e-06,
      "loss": 229.136,
      "step": 445
    },
    {
      "epoch": 9.813186813186814,
      "grad_norm": 673.6207885742188,
      "learning_rate": 3.3675000000000004e-06,
      "loss": 221.0368,
      "step": 450
    },
    {
      "epoch": 9.923076923076923,
      "grad_norm": 966.46630859375,
      "learning_rate": 3.405e-06,
      "loss": 217.8422,
      "step": 455
    },
    {
      "epoch": 10.021978021978022,
      "grad_norm": 740.294921875,
      "learning_rate": 3.4425e-06,
      "loss": 228.7085,
      "step": 460
    },
    {
      "epoch": 10.131868131868131,
      "grad_norm": 629.9981689453125,
      "learning_rate": 3.48e-06,
      "loss": 211.0439,
      "step": 465
    },
    {
      "epoch": 10.241758241758241,
      "grad_norm": 809.6885375976562,
      "learning_rate": 3.5174999999999998e-06,
      "loss": 204.4649,
      "step": 470
    },
    {
      "epoch": 10.351648351648352,
      "grad_norm": 1631.996337890625,
      "learning_rate": 3.555e-06,
      "loss": 204.0642,
      "step": 475
    },
    {
      "epoch": 10.461538461538462,
      "grad_norm": 958.5594482421875,
      "learning_rate": 3.5925e-06,
      "loss": 199.6854,
      "step": 480
    },
    {
      "epoch": 10.571428571428571,
      "grad_norm": 588.5241088867188,
      "learning_rate": 3.63e-06,
      "loss": 196.1547,
      "step": 485
    },
    {
      "epoch": 10.68131868131868,
      "grad_norm": 1439.742919921875,
      "learning_rate": 3.6675e-06,
      "loss": 194.8075,
      "step": 490
    },
    {
      "epoch": 10.791208791208792,
      "grad_norm": 547.2682495117188,
      "learning_rate": 3.705e-06,
      "loss": 190.5422,
      "step": 495
    },
    {
      "epoch": 10.901098901098901,
      "grad_norm": 550.932373046875,
      "learning_rate": 3.7425e-06,
      "loss": 188.0797,
      "step": 500
    }
  ],
  "logging_steps": 5,
  "max_steps": 2300,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.326589641008087e+19,
  "train_batch_size": 24,
  "trial_name": null,
  "trial_params": null
}