|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 408, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0024554941682013503, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.2500000000000002e-07, |
|
"loss": 1.3745, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004910988336402701, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.5000000000000004e-07, |
|
"loss": 1.5178, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.007366482504604052, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.7500000000000006e-07, |
|
"loss": 1.5165, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.009821976672805401, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 1.5066, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.012277470841006752, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 6.25e-07, |
|
"loss": 1.4756, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.014732965009208104, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 7.500000000000001e-07, |
|
"loss": 1.6926, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.017188459177409455, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 8.750000000000001e-07, |
|
"loss": 1.4304, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.019643953345610803, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.5449, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.022099447513812154, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.125e-06, |
|
"loss": 1.4239, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.024554941682013505, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.25e-06, |
|
"loss": 1.5942, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.027010435850214856, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.375e-06, |
|
"loss": 1.3251, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.029465930018416207, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.5000000000000002e-06, |
|
"loss": 1.398, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03192142418661756, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.6250000000000001e-06, |
|
"loss": 1.4457, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03437691835481891, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.7500000000000002e-06, |
|
"loss": 1.7422, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03683241252302026, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.8750000000000003e-06, |
|
"loss": 1.403, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.039287906691221605, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.5114, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.041743400859422956, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 2.125e-06, |
|
"loss": 1.4917, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04419889502762431, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 2.25e-06, |
|
"loss": 1.6311, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.04665438919582566, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 2.375e-06, |
|
"loss": 1.5155, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.04910988336402701, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.3574, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05156537753222836, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 2.6250000000000003e-06, |
|
"loss": 1.356, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.05402087170042971, |
|
"grad_norm": 4.125, |
|
"learning_rate": 2.75e-06, |
|
"loss": 1.3505, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.056476365868631064, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.875e-06, |
|
"loss": 1.4953, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.058931860036832415, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 3.0000000000000005e-06, |
|
"loss": 1.3245, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.061387354205033766, |
|
"grad_norm": 4.625, |
|
"learning_rate": 3.125e-06, |
|
"loss": 1.7409, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06384284837323512, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 3.2500000000000002e-06, |
|
"loss": 1.4082, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.06629834254143646, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 3.375e-06, |
|
"loss": 1.4205, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.06875383670963782, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 3.5000000000000004e-06, |
|
"loss": 1.4518, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07120933087783916, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 3.625e-06, |
|
"loss": 1.5376, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.07366482504604052, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 1.3659, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07612031921424187, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 3.875e-06, |
|
"loss": 1.5217, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.07857581338244321, |
|
"grad_norm": 4.75, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.6483, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08103130755064457, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 4.125e-06, |
|
"loss": 1.4399, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08348680171884591, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 4.25e-06, |
|
"loss": 1.4659, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.08594229588704727, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 1.3418, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.08839779005524862, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 4.5e-06, |
|
"loss": 1.5274, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09085328422344997, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 4.625000000000001e-06, |
|
"loss": 1.4488, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.09330877839165132, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 4.75e-06, |
|
"loss": 1.3321, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.09576427255985268, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 4.875000000000001e-06, |
|
"loss": 1.6221, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.09821976672805402, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 5e-06, |
|
"loss": 1.5383, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10067526089625538, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 4.99941792782305e-06, |
|
"loss": 1.4365, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.10313075506445672, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4.998834498834499e-06, |
|
"loss": 1.403, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.10558624923265807, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 4.998249708284714e-06, |
|
"loss": 1.3352, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.10804174340085942, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 4.99766355140187e-06, |
|
"loss": 1.4499, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11049723756906077, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.997076023391814e-06, |
|
"loss": 1.2393, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.11295273173726213, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 4.99648711943794e-06, |
|
"loss": 1.318, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.11540822590546347, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4.995896834701055e-06, |
|
"loss": 1.302, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.11786372007366483, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4.995305164319249e-06, |
|
"loss": 1.3988, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12031921424186617, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.994712103407756e-06, |
|
"loss": 1.3306, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.12277470841006753, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.994117647058823e-06, |
|
"loss": 1.4521, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1252302025782689, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.993521790341579e-06, |
|
"loss": 1.2632, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.12768569674647023, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 4.992924528301888e-06, |
|
"loss": 1.3189, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.13014119091467158, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 4.99232585596222e-06, |
|
"loss": 1.2612, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.13259668508287292, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.991725768321514e-06, |
|
"loss": 1.4976, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.13505217925107427, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.9911242603550295e-06, |
|
"loss": 1.3946, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.13750767341927564, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 4.990521327014218e-06, |
|
"loss": 1.517, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.13996316758747698, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.989916963226572e-06, |
|
"loss": 1.3417, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.14241866175567833, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.989311163895487e-06, |
|
"loss": 1.5581, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.14487415592387967, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.988703923900119e-06, |
|
"loss": 1.4299, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.14732965009208104, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.988095238095239e-06, |
|
"loss": 1.3143, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1497851442602824, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 4.987485101311085e-06, |
|
"loss": 1.3981, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.15224063842848373, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 4.986873508353222e-06, |
|
"loss": 1.2811, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.15469613259668508, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 4.98626045400239e-06, |
|
"loss": 1.3865, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.15715162676488642, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 4.9856459330143545e-06, |
|
"loss": 1.3941, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1596071209330878, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 4.985029940119761e-06, |
|
"loss": 1.4471, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.16206261510128914, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 4.984412470023982e-06, |
|
"loss": 1.1329, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.16451810926949048, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.9837935174069635e-06, |
|
"loss": 1.3922, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.16697360343769183, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.983173076923077e-06, |
|
"loss": 1.2263, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1694290976058932, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.982551143200963e-06, |
|
"loss": 1.2131, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.17188459177409454, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.981927710843374e-06, |
|
"loss": 1.368, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.17434008594229589, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.981302774427021e-06, |
|
"loss": 1.3914, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.17679558011049723, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.980676328502416e-06, |
|
"loss": 1.294, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.17925107427869857, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.980048367593713e-06, |
|
"loss": 1.2289, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.18170656844689995, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.979418886198547e-06, |
|
"loss": 1.26, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1841620626151013, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.9787878787878795e-06, |
|
"loss": 1.4256, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.18661755678330263, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.9781553398058256e-06, |
|
"loss": 1.3011, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.18907305095150398, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.9775212636695016e-06, |
|
"loss": 1.3333, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.19152854511970535, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.976885644768857e-06, |
|
"loss": 1.3712, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.1939840392879067, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.9762484774665045e-06, |
|
"loss": 1.2534, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.19643953345610804, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.975609756097562e-06, |
|
"loss": 1.4554, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.19889502762430938, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.974969474969476e-06, |
|
"loss": 1.2135, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.20135052179251076, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.974327628361858e-06, |
|
"loss": 1.206, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.2038060159607121, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 4.973684210526317e-06, |
|
"loss": 1.2847, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.20626151012891344, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.973039215686275e-06, |
|
"loss": 1.2353, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2087170042971148, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 4.97239263803681e-06, |
|
"loss": 1.2502, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.21117249846531613, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.971744471744472e-06, |
|
"loss": 1.3201, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.2136279926335175, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.97109471094711e-06, |
|
"loss": 1.2823, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.21608348680171885, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.970443349753695e-06, |
|
"loss": 1.2085, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2185389809699202, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 4.969790382244143e-06, |
|
"loss": 1.2188, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.22099447513812154, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.9691358024691365e-06, |
|
"loss": 1.1417, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2234499693063229, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.968479604449938e-06, |
|
"loss": 1.1576, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.22590546347452425, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.967821782178218e-06, |
|
"loss": 1.4177, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2283609576427256, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.967162329615861e-06, |
|
"loss": 1.3878, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.23081645181092694, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 4.966501240694789e-06, |
|
"loss": 1.3376, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2332719459791283, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 4.965838509316771e-06, |
|
"loss": 1.2629, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.23572744014732966, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 4.965174129353235e-06, |
|
"loss": 1.5579, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.238182934315531, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.964508094645081e-06, |
|
"loss": 1.4673, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.24063842848373235, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.963840399002494e-06, |
|
"loss": 1.1368, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2430939226519337, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 4.9631710362047445e-06, |
|
"loss": 1.2686, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.24554941682013506, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.9625e-06, |
|
"loss": 1.3491, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2480049109883364, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.961827284105132e-06, |
|
"loss": 1.3463, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.2504604051565378, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.961152882205514e-06, |
|
"loss": 1.3989, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2529158993247391, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.960476787954831e-06, |
|
"loss": 1.3649, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.25537139349294047, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.959798994974875e-06, |
|
"loss": 1.3352, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2578268876611418, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.959119496855347e-06, |
|
"loss": 1.2536, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.26028238182934316, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 4.958438287153652e-06, |
|
"loss": 1.2375, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2627378759975445, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 4.9577553593947035e-06, |
|
"loss": 1.3203, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.26519337016574585, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.957070707070708e-06, |
|
"loss": 1.2825, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2676488643339472, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.9563843236409605e-06, |
|
"loss": 1.4075, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.27010435850214853, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 4.955696202531646e-06, |
|
"loss": 1.2861, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.27255985267034993, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 4.955006337135615e-06, |
|
"loss": 1.3195, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2750153468385513, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.954314720812184e-06, |
|
"loss": 1.4529, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2774708410067526, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.9536213468869126e-06, |
|
"loss": 1.448, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.27992633517495397, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.9529262086513995e-06, |
|
"loss": 1.281, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.2823818293431553, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.952229299363058e-06, |
|
"loss": 1.2564, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.28483732351135665, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.951530612244899e-06, |
|
"loss": 1.2746, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.287292817679558, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.950830140485312e-06, |
|
"loss": 1.2465, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.28974831184775934, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.950127877237852e-06, |
|
"loss": 1.2232, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2922038060159607, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.949423815621e-06, |
|
"loss": 1.2338, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.2946593001841621, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 4.9487179487179486e-06, |
|
"loss": 1.3921, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.29711479435236343, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 4.94801026957638e-06, |
|
"loss": 1.2641, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.2995702885205648, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 4.9473007712082265e-06, |
|
"loss": 1.2663, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3020257826887661, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 4.946589446589447e-06, |
|
"loss": 1.365, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.30448127685696746, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.945876288659794e-06, |
|
"loss": 1.2591, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3069367710251688, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 4.945161290322581e-06, |
|
"loss": 1.2908, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.30939226519337015, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 4.944444444444445e-06, |
|
"loss": 1.0606, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3118477593615715, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 4.943725743855111e-06, |
|
"loss": 1.2429, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.31430325352977284, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 4.9430051813471505e-06, |
|
"loss": 1.1361, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.31675874769797424, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 4.942282749675746e-06, |
|
"loss": 1.3881, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.3192142418661756, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 4.941558441558442e-06, |
|
"loss": 1.4425, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.32166973603437693, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.940832249674903e-06, |
|
"loss": 1.4025, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.3241252302025783, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 4.940104166666667e-06, |
|
"loss": 1.3005, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3265807243707796, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 4.939374185136898e-06, |
|
"loss": 1.2197, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.32903621853898096, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.938642297650132e-06, |
|
"loss": 1.2671, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.3314917127071823, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 4.937908496732027e-06, |
|
"loss": 1.4586, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.33394720687538365, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 4.9371727748691105e-06, |
|
"loss": 1.411, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.336402701043585, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 4.936435124508519e-06, |
|
"loss": 1.2816, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.3388581952117864, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.935695538057743e-06, |
|
"loss": 1.387, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.34131368937998774, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 4.9349540078843624e-06, |
|
"loss": 1.4591, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3437691835481891, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.93421052631579e-06, |
|
"loss": 1.161, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3462246777163904, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.933465085639e-06, |
|
"loss": 1.2872, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.34868017188459177, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.932717678100264e-06, |
|
"loss": 1.3039, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3511356660527931, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.931968295904888e-06, |
|
"loss": 1.4027, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.35359116022099446, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 4.931216931216932e-06, |
|
"loss": 1.2858, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3560466543891958, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.93046357615894e-06, |
|
"loss": 1.2619, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.35850214855739715, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.929708222811671e-06, |
|
"loss": 1.2746, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.36095764272559855, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 4.928950863213812e-06, |
|
"loss": 1.3008, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.3634131368937999, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.928191489361703e-06, |
|
"loss": 1.3898, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.36586863106200124, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.927430093209056e-06, |
|
"loss": 1.3964, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.3683241252302026, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.926666666666667e-06, |
|
"loss": 1.2544, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3707796193984039, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.925901201602136e-06, |
|
"loss": 1.3839, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.37323511356660527, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 4.9251336898395725e-06, |
|
"loss": 1.3611, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3756906077348066, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.924364123159303e-06, |
|
"loss": 1.3025, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.37814610190300796, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 4.923592493297587e-06, |
|
"loss": 1.2579, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.38060159607120936, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 4.922818791946309e-06, |
|
"loss": 1.3892, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.3830570902394107, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.922043010752689e-06, |
|
"loss": 1.2966, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.38551258440761205, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.921265141318978e-06, |
|
"loss": 1.2942, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.3879680785758134, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 4.920485175202157e-06, |
|
"loss": 1.192, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.39042357274401474, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 4.9197031039136305e-06, |
|
"loss": 1.129, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.3928790669122161, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 4.918918918918919e-06, |
|
"loss": 1.2928, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3953345610804174, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.918132611637348e-06, |
|
"loss": 1.239, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.39779005524861877, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 4.917344173441735e-06, |
|
"loss": 1.1922, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4002455494168201, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 4.916553595658074e-06, |
|
"loss": 1.19, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.4027010435850215, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.915760869565218e-06, |
|
"loss": 1.2543, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.40515653775322286, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.914965986394558e-06, |
|
"loss": 1.2686, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.4076120319214242, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 4.9141689373297006e-06, |
|
"loss": 1.2605, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.41006752608962554, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.9133697135061394e-06, |
|
"loss": 1.1579, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.4125230202578269, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 4.9125683060109295e-06, |
|
"loss": 1.3781, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.41497851442602823, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.911764705882354e-06, |
|
"loss": 1.4131, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4174340085942296, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 4.910958904109589e-06, |
|
"loss": 1.3198, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4198895027624309, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 4.9101508916323735e-06, |
|
"loss": 1.4827, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.42234499693063227, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 4.90934065934066e-06, |
|
"loss": 1.2649, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.42480049109883367, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.9085281980742785e-06, |
|
"loss": 1.2458, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.427255985267035, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 4.90771349862259e-06, |
|
"loss": 1.0635, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.42971147943523635, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 4.906896551724138e-06, |
|
"loss": 1.3535, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4321669736034377, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.906077348066298e-06, |
|
"loss": 1.1425, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.43462246777163904, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 4.905255878284925e-06, |
|
"loss": 1.4151, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.4370779619398404, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 4.90443213296399e-06, |
|
"loss": 1.4322, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.43953345610804173, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.903606102635229e-06, |
|
"loss": 1.2474, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.4419889502762431, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.902777777777778e-06, |
|
"loss": 1.2191, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 4.901947148817803e-06, |
|
"loss": 1.32, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.4468999386126458, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 4.901114206128134e-06, |
|
"loss": 1.4193, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.44935543278084716, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.900278940027894e-06, |
|
"loss": 1.363, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.4518109269490485, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.899441340782124e-06, |
|
"loss": 1.2171, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.45426642111724985, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 4.8986013986013995e-06, |
|
"loss": 1.3648, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.4567219152854512, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.8977591036414575e-06, |
|
"loss": 1.3496, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.45917740945365254, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 4.896914446002806e-06, |
|
"loss": 1.1913, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.4616329036218539, |
|
"grad_norm": 0.75, |
|
"learning_rate": 4.896067415730337e-06, |
|
"loss": 1.443, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.46408839779005523, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 4.89521800281294e-06, |
|
"loss": 1.3634, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.4665438919582566, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 4.894366197183099e-06, |
|
"loss": 1.3341, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.468999386126458, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 4.893511988716502e-06, |
|
"loss": 1.2351, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.4714548802946593, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.892655367231639e-06, |
|
"loss": 1.1749, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.47391037446286066, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.891796322489392e-06, |
|
"loss": 1.3234, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.476365868631062, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.890934844192635e-06, |
|
"loss": 1.2423, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.47882136279926335, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.890070921985816e-06, |
|
"loss": 1.2254, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.4812768569674647, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 4.889204545454545e-06, |
|
"loss": 1.1784, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.48373235113566604, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 4.888335704125178e-06, |
|
"loss": 1.2662, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.4861878453038674, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.887464387464388e-06, |
|
"loss": 1.298, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.4886433394720687, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.8865905848787455e-06, |
|
"loss": 1.3095, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.4910988336402701, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 4.8857142857142865e-06, |
|
"loss": 1.5246, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.49355432780847147, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 4.884835479256081e-06, |
|
"loss": 1.3319, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.4960098219766728, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 4.883954154727794e-06, |
|
"loss": 1.4236, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.49846531614487416, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 4.8830703012912485e-06, |
|
"loss": 1.1756, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5009208103130756, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 4.882183908045977e-06, |
|
"loss": 1.3086, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5033763044812769, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 4.881294964028777e-06, |
|
"loss": 1.3479, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5058317986494782, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 4.880403458213257e-06, |
|
"loss": 1.3573, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5082872928176796, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.87950937950938e-06, |
|
"loss": 1.2731, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5107427869858809, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 4.878612716763007e-06, |
|
"loss": 1.3774, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5131982811540823, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 4.877713458755427e-06, |
|
"loss": 1.1831, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5156537753222836, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.876811594202899e-06, |
|
"loss": 1.2903, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.518109269490485, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.875907111756169e-06, |
|
"loss": 1.2286, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.5205647636586863, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 4.875e-06, |
|
"loss": 1.3615, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5230202578268877, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.874090247452693e-06, |
|
"loss": 1.2775, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.525475751995089, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 4.873177842565599e-06, |
|
"loss": 1.2464, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5279312461632903, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.872262773722629e-06, |
|
"loss": 1.4034, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5303867403314917, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 4.871345029239766e-06, |
|
"loss": 1.295, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.532842234499693, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 4.870424597364568e-06, |
|
"loss": 1.2426, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5352977286678944, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.86950146627566e-06, |
|
"loss": 1.2622, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5377532228360957, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.868575624082232e-06, |
|
"loss": 1.3219, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.5402087170042971, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 4.8676470588235295e-06, |
|
"loss": 1.5289, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5426642111724984, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.8667157584683365e-06, |
|
"loss": 1.2633, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.5451197053406999, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.865781710914455e-06, |
|
"loss": 1.2798, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5475751995089012, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 4.864844903988184e-06, |
|
"loss": 1.2477, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5500306936771026, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 4.8639053254437875e-06, |
|
"loss": 1.1598, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5524861878453039, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 4.862962962962963e-06, |
|
"loss": 1.2752, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5549416820135052, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.862017804154303e-06, |
|
"loss": 1.3412, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5573971761817066, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.861069836552749e-06, |
|
"loss": 1.2334, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.5598526703499079, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.860119047619048e-06, |
|
"loss": 1.3265, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5623081645181093, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 4.859165424739195e-06, |
|
"loss": 1.3923, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.5647636586863106, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.858208955223881e-06, |
|
"loss": 1.3351, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.567219152854512, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 4.857249626307923e-06, |
|
"loss": 1.187, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.5696746470227133, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.856287425149701e-06, |
|
"loss": 1.2872, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5721301411909147, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 4.855322338830585e-06, |
|
"loss": 1.3065, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.574585635359116, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 4.854354354354354e-06, |
|
"loss": 1.1766, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.5770411295273173, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 4.853383458646616e-06, |
|
"loss": 1.2015, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.5794966236955187, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 4.852409638554218e-06, |
|
"loss": 1.3997, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.58195211786372, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.851432880844646e-06, |
|
"loss": 1.4776, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.5844076120319214, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 4.850453172205439e-06, |
|
"loss": 1.4555, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.5868631062001227, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.849470499243571e-06, |
|
"loss": 1.1934, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.5893186003683242, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 4.848484848484849e-06, |
|
"loss": 1.2544, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5917740945365255, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 4.847496206373293e-06, |
|
"loss": 1.2578, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.5942295887047269, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 4.846504559270517e-06, |
|
"loss": 1.3459, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.5966850828729282, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 4.8455098934551e-06, |
|
"loss": 1.2658, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.5991405770411296, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.844512195121952e-06, |
|
"loss": 1.2501, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6015960712093309, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.84351145038168e-06, |
|
"loss": 1.3957, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6040515653775322, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 4.8425076452599395e-06, |
|
"loss": 1.4144, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6065070595457336, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.841500765696784e-06, |
|
"loss": 1.3007, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6089625537139349, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 4.840490797546013e-06, |
|
"loss": 1.171, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6114180478821363, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.839477726574502e-06, |
|
"loss": 1.4907, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6138735420503376, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 4.8384615384615385e-06, |
|
"loss": 1.2608, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.616329036218539, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.837442218798152e-06, |
|
"loss": 1.2279, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6187845303867403, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.83641975308642e-06, |
|
"loss": 1.1505, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6212400245549416, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 4.835394126738795e-06, |
|
"loss": 1.5144, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.623695518723143, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 4.8343653250773995e-06, |
|
"loss": 1.3644, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6261510128913443, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.833333333333333e-06, |
|
"loss": 1.2564, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6286065070595457, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 4.832298136645963e-06, |
|
"loss": 1.2714, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6310620012277471, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.831259720062209e-06, |
|
"loss": 1.2751, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6335174953959485, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 4.830218068535826e-06, |
|
"loss": 1.3451, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6359729895641498, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 4.8291731669266775e-06, |
|
"loss": 1.2426, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.6384284837323512, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 4.8281250000000005e-06, |
|
"loss": 1.2962, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6408839779005525, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.827073552425666e-06, |
|
"loss": 1.2006, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6433394720687539, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 4.826018808777429e-06, |
|
"loss": 1.345, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6457949662369552, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 4.824960753532182e-06, |
|
"loss": 1.2243, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.6482504604051565, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.823899371069182e-06, |
|
"loss": 1.447, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6507059545733579, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.822834645669292e-06, |
|
"loss": 1.2785, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.6531614487415592, |
|
"grad_norm": 0.625, |
|
"learning_rate": 4.821766561514196e-06, |
|
"loss": 1.1441, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6556169429097606, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.8206951026856246e-06, |
|
"loss": 1.248, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.6580724370779619, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.819620253164557e-06, |
|
"loss": 1.4042, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6605279312461633, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.818541996830428e-06, |
|
"loss": 1.1198, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.6629834254143646, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 4.8174603174603175e-06, |
|
"loss": 1.3123, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.665438919582566, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.81637519872814e-06, |
|
"loss": 1.2525, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.6678944137507673, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.815286624203822e-06, |
|
"loss": 1.2531, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6703499079189686, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.814194577352473e-06, |
|
"loss": 1.2536, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.67280540208717, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.813099041533547e-06, |
|
"loss": 1.3437, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.6752608962553714, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.812e-06, |
|
"loss": 1.2383, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6777163904235728, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.8108974358974366e-06, |
|
"loss": 1.1858, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6801718845917741, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.809791332263242e-06, |
|
"loss": 1.3243, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.6826273787599755, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.808681672025724e-06, |
|
"loss": 1.1714, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.6850828729281768, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 4.807568438003221e-06, |
|
"loss": 1.484, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.6875383670963782, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.806451612903227e-06, |
|
"loss": 1.2078, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6899938612645795, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.805331179321487e-06, |
|
"loss": 1.1594, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.6924493554327809, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.8042071197411e-06, |
|
"loss": 1.3314, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.6949048496009822, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.803079416531605e-06, |
|
"loss": 1.4602, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.6973603437691835, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.801948051948052e-06, |
|
"loss": 1.1564, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.6998158379373849, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.800813008130081e-06, |
|
"loss": 1.1491, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7022713321055862, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.799674267100978e-06, |
|
"loss": 1.3547, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7047268262737876, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.798531810766721e-06, |
|
"loss": 1.2468, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7071823204419889, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.7973856209150335e-06, |
|
"loss": 1.373, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7096378146101903, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.796235679214403e-06, |
|
"loss": 1.1777, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7120933087783916, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.7950819672131156e-06, |
|
"loss": 1.1952, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.714548802946593, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 4.79392446633826e-06, |
|
"loss": 1.1449, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7170042971147943, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.792763157894737e-06, |
|
"loss": 1.4823, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7194597912829958, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.791598023064251e-06, |
|
"loss": 1.3252, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7219152854511971, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 4.790429042904291e-06, |
|
"loss": 1.1599, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7243707796193984, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 4.789256198347108e-06, |
|
"loss": 1.3266, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.7268262737875998, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.788079470198676e-06, |
|
"loss": 1.2402, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7292817679558011, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.786898839137645e-06, |
|
"loss": 1.2549, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.7317372621240025, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.785714285714287e-06, |
|
"loss": 1.3053, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7341927562922038, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.784525790349418e-06, |
|
"loss": 1.2875, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7366482504604052, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.783333333333334e-06, |
|
"loss": 1.3651, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7391037446286065, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.782136894824708e-06, |
|
"loss": 1.2945, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.7415592387968079, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 4.780936454849499e-06, |
|
"loss": 1.2634, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7440147329650092, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.779731993299833e-06, |
|
"loss": 1.2937, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.7464702271332105, |
|
"grad_norm": 0.625, |
|
"learning_rate": 4.7785234899328866e-06, |
|
"loss": 1.2601, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7489257213014119, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 4.777310924369749e-06, |
|
"loss": 1.2282, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.7513812154696132, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.776094276094276e-06, |
|
"loss": 1.3313, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7538367096378146, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.774873524451939e-06, |
|
"loss": 1.1972, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.7562922038060159, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.773648648648649e-06, |
|
"loss": 1.1661, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7587476979742173, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.772419627749577e-06, |
|
"loss": 1.1749, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.7612031921424187, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.771186440677967e-06, |
|
"loss": 1.3634, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7636586863106201, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 4.769949066213922e-06, |
|
"loss": 1.2586, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.7661141804788214, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 4.768707482993198e-06, |
|
"loss": 1.0948, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.7685696746470227, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 4.767461669505963e-06, |
|
"loss": 1.2056, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.7710251688152241, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 4.7662116040955635e-06, |
|
"loss": 1.2017, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.7734806629834254, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 4.764957264957265e-06, |
|
"loss": 1.2027, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.7759361571516268, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 4.7636986301369865e-06, |
|
"loss": 1.3685, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7783916513198281, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.762435677530018e-06, |
|
"loss": 1.2291, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.7808471454880295, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 4.761168384879725e-06, |
|
"loss": 1.335, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.7833026396562308, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 4.759896729776248e-06, |
|
"loss": 1.1458, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.7857581338244322, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.758620689655173e-06, |
|
"loss": 1.3041, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7882136279926335, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 4.757340241796201e-06, |
|
"loss": 1.2781, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.7906691221608348, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 4.7560553633218e-06, |
|
"loss": 1.425, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.7931246163290362, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.754766031195841e-06, |
|
"loss": 1.2964, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.7955801104972375, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.753472222222224e-06, |
|
"loss": 1.2405, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.7980356046654389, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.752173913043479e-06, |
|
"loss": 1.2106, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8004910988336402, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.750871080139373e-06, |
|
"loss": 1.3867, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8029465930018416, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.74956369982548e-06, |
|
"loss": 1.2059, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.805402087170043, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 4.748251748251749e-06, |
|
"loss": 1.3675, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8078575813382444, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 4.746935201401051e-06, |
|
"loss": 1.2453, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.8103130755064457, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.74561403508772e-06, |
|
"loss": 1.2978, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8127685696746471, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.744288224956063e-06, |
|
"loss": 1.1665, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8152240638428484, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.742957746478874e-06, |
|
"loss": 1.3244, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8176795580110497, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.7416225749559084e-06, |
|
"loss": 1.4481, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8201350521792511, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.740282685512368e-06, |
|
"loss": 1.2474, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8225905463474524, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 4.738938053097346e-06, |
|
"loss": 1.2216, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.8250460405156538, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 4.73758865248227e-06, |
|
"loss": 1.3191, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8275015346838551, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.736234458259325e-06, |
|
"loss": 1.1618, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.8299570288520565, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.734875444839857e-06, |
|
"loss": 1.2559, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8324125230202578, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.733511586452764e-06, |
|
"loss": 1.3254, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8348680171884592, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 4.732142857142858e-06, |
|
"loss": 1.2175, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8373235113566605, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.730769230769231e-06, |
|
"loss": 1.2715, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.8397790055248618, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.729390681003584e-06, |
|
"loss": 1.2113, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8422344996930632, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.728007181328546e-06, |
|
"loss": 1.2132, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.8446899938612645, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.726618705035971e-06, |
|
"loss": 1.2593, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8471454880294659, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.725225225225225e-06, |
|
"loss": 1.2435, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.8496009821976673, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 4.723826714801444e-06, |
|
"loss": 1.2349, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.8520564763658687, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.72242314647378e-06, |
|
"loss": 1.2344, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.85451197053407, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 4.721014492753624e-06, |
|
"loss": 1.1971, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8569674647022714, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 4.7196007259528135e-06, |
|
"loss": 1.2314, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.8594229588704727, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.718181818181819e-06, |
|
"loss": 1.261, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.861878453038674, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.7167577413479055e-06, |
|
"loss": 1.2093, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.8643339472068754, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.7153284671532855e-06, |
|
"loss": 1.1604, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.8667894413750767, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.713893967093236e-06, |
|
"loss": 1.111, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.8692449355432781, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.712454212454213e-06, |
|
"loss": 1.2903, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.8717004297114794, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 4.711009174311927e-06, |
|
"loss": 1.2801, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.8741559238796808, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.709558823529412e-06, |
|
"loss": 1.3241, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.8766114180478821, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.708103130755065e-06, |
|
"loss": 1.2036, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.8790669122160835, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.706642066420664e-06, |
|
"loss": 1.2354, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.8815224063842848, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.7051756007393715e-06, |
|
"loss": 1.2554, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.8839779005524862, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 4.703703703703704e-06, |
|
"loss": 1.243, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8864333947206875, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 4.7022263450834885e-06, |
|
"loss": 1.1985, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 4.700743494423793e-06, |
|
"loss": 1.2429, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.8913443830570903, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.699255121042831e-06, |
|
"loss": 1.2472, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.8937998772252916, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 4.697761194029851e-06, |
|
"loss": 1.2172, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.896255371393493, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.696261682242991e-06, |
|
"loss": 1.4051, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.8987108655616943, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 4.6947565543071164e-06, |
|
"loss": 1.2184, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9011663597298957, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.693245778611632e-06, |
|
"loss": 1.222, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.903621853898097, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.691729323308271e-06, |
|
"loss": 1.4323, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9060773480662984, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.690207156308852e-06, |
|
"loss": 1.2835, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9085328422344997, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.68867924528302e-06, |
|
"loss": 1.327, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.910988336402701, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.6871455576559546e-06, |
|
"loss": 1.1185, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9134438305709024, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 4.6856060606060614e-06, |
|
"loss": 1.3439, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9158993247391037, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 4.6840607210626185e-06, |
|
"loss": 1.2999, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9183548189073051, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.682509505703422e-06, |
|
"loss": 1.5383, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9208103130755064, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 4.680952380952381e-06, |
|
"loss": 1.4458, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9232658072437078, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 4.6793893129771e-06, |
|
"loss": 1.3359, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9257213014119091, |
|
"grad_norm": 0.625, |
|
"learning_rate": 4.677820267686425e-06, |
|
"loss": 1.1187, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9281767955801105, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 4.67624521072797e-06, |
|
"loss": 1.2834, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9306322897483118, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.674664107485605e-06, |
|
"loss": 1.2328, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.9330877839165131, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 4.673076923076924e-06, |
|
"loss": 1.2323, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9355432780847146, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.671483622350675e-06, |
|
"loss": 1.1229, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.937998772252916, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.66988416988417e-06, |
|
"loss": 1.366, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9404542664211173, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 4.668278529980658e-06, |
|
"loss": 1.2033, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9429097605893186, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.666666666666668e-06, |
|
"loss": 1.2062, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.94536525475752, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.66504854368932e-06, |
|
"loss": 1.3482, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9478207489257213, |
|
"grad_norm": 0.625, |
|
"learning_rate": 4.6634241245136196e-06, |
|
"loss": 1.3237, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9502762430939227, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 4.661793372319688e-06, |
|
"loss": 1.1933, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.952731737262124, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.66015625e-06, |
|
"loss": 1.2976, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9551872314303254, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 4.658512720156556e-06, |
|
"loss": 1.1236, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.9576427255985267, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.65686274509804e-06, |
|
"loss": 1.3531, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.960098219766728, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.6552062868369355e-06, |
|
"loss": 1.3564, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.9625537139349294, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.653543307086615e-06, |
|
"loss": 1.289, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9650092081031307, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.651873767258383e-06, |
|
"loss": 1.4734, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.9674647022713321, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 4.650197628458498e-06, |
|
"loss": 1.2798, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.9699201964395334, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.648514851485149e-06, |
|
"loss": 1.2872, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.9723756906077348, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.646825396825397e-06, |
|
"loss": 1.315, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9748311847759361, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 4.645129224652087e-06, |
|
"loss": 1.2918, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.9772866789441375, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 4.643426294820717e-06, |
|
"loss": 1.4759, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9797421731123389, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 4.6417165668662675e-06, |
|
"loss": 1.3303, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.9821976672805403, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 4.6400000000000005e-06, |
|
"loss": 1.1404, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9846531614487416, |
|
"grad_norm": 0.625, |
|
"learning_rate": 4.638276553106213e-06, |
|
"loss": 1.2628, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.9871086556169429, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.636546184738957e-06, |
|
"loss": 1.336, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.9895641497851443, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.634808853118712e-06, |
|
"loss": 1.31, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.9920196439533456, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.633064516129032e-06, |
|
"loss": 1.3131, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.994475138121547, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.6313131313131315e-06, |
|
"loss": 1.5459, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.9969306322897483, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.629554655870445e-06, |
|
"loss": 1.4008, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.9993861264579497, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 4.627789046653145e-06, |
|
"loss": 1.2619, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.626016260162602e-06, |
|
"loss": 1.6531, |
|
"step": 408 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 814, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 204, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.3184366687420416e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|