|
{ |
|
"best_global_step": 26000, |
|
"best_metric": 74.62046528623556, |
|
"best_model_checkpoint": "./aramaic_diacritization_model_deep/checkpoint-26000", |
|
"epoch": 36.32760898282695, |
|
"eval_steps": 500, |
|
"global_step": 27500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.13210039630118892, |
|
"grad_norm": 49.535423278808594, |
|
"learning_rate": 9.400000000000001e-07, |
|
"loss": 13.2291, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26420079260237783, |
|
"grad_norm": 8.345534324645996, |
|
"learning_rate": 1.94e-06, |
|
"loss": 4.9005, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3963011889035667, |
|
"grad_norm": 6.039630889892578, |
|
"learning_rate": 2.9400000000000002e-06, |
|
"loss": 3.1719, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5284015852047557, |
|
"grad_norm": 5.1691412925720215, |
|
"learning_rate": 3.94e-06, |
|
"loss": 2.6108, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6605019815059445, |
|
"grad_norm": 4.929126262664795, |
|
"learning_rate": 4.94e-06, |
|
"loss": 2.2213, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6605019815059445, |
|
"eval_bleu": 0.19172411762066338, |
|
"eval_char_accuracy": 10.030946701760158, |
|
"eval_loss": 1.71331787109375, |
|
"eval_runtime": 308.7303, |
|
"eval_samples_per_second": 4.904, |
|
"eval_steps_per_second": 0.615, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7926023778071334, |
|
"grad_norm": 5.598935604095459, |
|
"learning_rate": 5.94e-06, |
|
"loss": 1.9171, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9247027741083224, |
|
"grad_norm": 5.103219032287598, |
|
"learning_rate": 6.9400000000000005e-06, |
|
"loss": 1.6624, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0568031704095113, |
|
"grad_norm": 4.852541923522949, |
|
"learning_rate": 7.94e-06, |
|
"loss": 1.473, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1889035667107002, |
|
"grad_norm": 4.4410624504089355, |
|
"learning_rate": 8.94e-06, |
|
"loss": 1.3081, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.321003963011889, |
|
"grad_norm": 3.9946470260620117, |
|
"learning_rate": 9.940000000000001e-06, |
|
"loss": 1.1709, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.321003963011889, |
|
"eval_bleu": 5.2707215480174545, |
|
"eval_char_accuracy": 18.391696825135714, |
|
"eval_loss": 0.8503363132476807, |
|
"eval_runtime": 326.5655, |
|
"eval_samples_per_second": 4.636, |
|
"eval_steps_per_second": 0.582, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.453104359313078, |
|
"grad_norm": 5.828360557556152, |
|
"learning_rate": 9.99996092907511e-06, |
|
"loss": 1.0774, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.5852047556142668, |
|
"grad_norm": 3.975123643875122, |
|
"learning_rate": 9.999833582267183e-06, |
|
"loss": 0.9803, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7173051519154559, |
|
"grad_norm": 4.110162258148193, |
|
"learning_rate": 9.999617802644021e-06, |
|
"loss": 0.9023, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.8494055482166445, |
|
"grad_norm": 4.341949462890625, |
|
"learning_rate": 9.999313594022158e-06, |
|
"loss": 0.8494, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.9815059445178336, |
|
"grad_norm": 3.7582991123199463, |
|
"learning_rate": 9.99892096178217e-06, |
|
"loss": 0.7841, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.9815059445178336, |
|
"eval_bleu": 13.790865530076871, |
|
"eval_char_accuracy": 25.990602895213026, |
|
"eval_loss": 0.5515339374542236, |
|
"eval_runtime": 310.3546, |
|
"eval_samples_per_second": 4.878, |
|
"eval_steps_per_second": 0.612, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.1136063408190227, |
|
"grad_norm": 4.066399574279785, |
|
"learning_rate": 9.998439912868608e-06, |
|
"loss": 0.7379, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.2457067371202113, |
|
"grad_norm": 3.739553928375244, |
|
"learning_rate": 9.997870455789855e-06, |
|
"loss": 0.6859, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.3778071334214004, |
|
"grad_norm": 4.019917964935303, |
|
"learning_rate": 9.997212600617986e-06, |
|
"loss": 0.6547, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.509907529722589, |
|
"grad_norm": 3.1273276805877686, |
|
"learning_rate": 9.99646635898858e-06, |
|
"loss": 0.6313, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.642007926023778, |
|
"grad_norm": 3.0856070518493652, |
|
"learning_rate": 9.995631744100536e-06, |
|
"loss": 0.6058, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.642007926023778, |
|
"eval_bleu": 22.141127198903924, |
|
"eval_char_accuracy": 30.873190491857212, |
|
"eval_loss": 0.4170660674571991, |
|
"eval_runtime": 310.921, |
|
"eval_samples_per_second": 4.869, |
|
"eval_steps_per_second": 0.611, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.7741083223249667, |
|
"grad_norm": 3.660649299621582, |
|
"learning_rate": 9.994708770715807e-06, |
|
"loss": 0.5758, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.906208718626156, |
|
"grad_norm": 3.3534188270568848, |
|
"learning_rate": 9.993697455159165e-06, |
|
"loss": 0.5507, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.038309114927345, |
|
"grad_norm": 2.831392526626587, |
|
"learning_rate": 9.992597815317901e-06, |
|
"loss": 0.5334, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.1704095112285335, |
|
"grad_norm": 3.2069804668426514, |
|
"learning_rate": 9.991409870641512e-06, |
|
"loss": 0.508, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.3025099075297226, |
|
"grad_norm": 3.3302793502807617, |
|
"learning_rate": 9.990133642141359e-06, |
|
"loss": 0.4816, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.3025099075297226, |
|
"eval_bleu": 28.8217714543364, |
|
"eval_char_accuracy": 34.30457312057904, |
|
"eval_loss": 0.34086647629737854, |
|
"eval_runtime": 312.1625, |
|
"eval_samples_per_second": 4.85, |
|
"eval_steps_per_second": 0.609, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.4346103038309117, |
|
"grad_norm": 3.0527114868164062, |
|
"learning_rate": 9.988769152390284e-06, |
|
"loss": 0.4779, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.5667107001321003, |
|
"grad_norm": 2.557722568511963, |
|
"learning_rate": 9.987316425522226e-06, |
|
"loss": 0.4626, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.6988110964332894, |
|
"grad_norm": 2.993014097213745, |
|
"learning_rate": 9.985775487231788e-06, |
|
"loss": 0.4452, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.830911492734478, |
|
"grad_norm": 2.7321043014526367, |
|
"learning_rate": 9.984146364773777e-06, |
|
"loss": 0.4408, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.963011889035667, |
|
"grad_norm": 2.8790836334228516, |
|
"learning_rate": 9.982429086962729e-06, |
|
"loss": 0.4108, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.963011889035667, |
|
"eval_bleu": 33.52667574407562, |
|
"eval_char_accuracy": 37.42289027800625, |
|
"eval_loss": 0.29226553440093994, |
|
"eval_runtime": 316.4732, |
|
"eval_samples_per_second": 4.784, |
|
"eval_steps_per_second": 0.6, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.095112285336856, |
|
"grad_norm": 2.8862805366516113, |
|
"learning_rate": 9.980623684172396e-06, |
|
"loss": 0.4134, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 4.227212681638045, |
|
"grad_norm": 2.4621527194976807, |
|
"learning_rate": 9.978730188335215e-06, |
|
"loss": 0.3919, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.359313077939234, |
|
"grad_norm": 2.822957992553711, |
|
"learning_rate": 9.976748632941733e-06, |
|
"loss": 0.384, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 4.491413474240423, |
|
"grad_norm": 2.448110818862915, |
|
"learning_rate": 9.974679053040018e-06, |
|
"loss": 0.3735, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 4.623513870541611, |
|
"grad_norm": 2.2914109230041504, |
|
"learning_rate": 9.972521485235045e-06, |
|
"loss": 0.3604, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 4.623513870541611, |
|
"eval_bleu": 37.45556816331904, |
|
"eval_char_accuracy": 40.02354416844876, |
|
"eval_loss": 0.25823718309402466, |
|
"eval_runtime": 320.7507, |
|
"eval_samples_per_second": 4.72, |
|
"eval_steps_per_second": 0.592, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 4.755614266842801, |
|
"grad_norm": 2.5924477577209473, |
|
"learning_rate": 9.970275967688047e-06, |
|
"loss": 0.3624, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 4.887714663143989, |
|
"grad_norm": 2.49125075340271, |
|
"learning_rate": 9.967942540115829e-06, |
|
"loss": 0.3508, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 5.019815059445178, |
|
"grad_norm": 2.464569330215454, |
|
"learning_rate": 9.965521243790079e-06, |
|
"loss": 0.3355, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 5.1519154557463676, |
|
"grad_norm": 2.6026785373687744, |
|
"learning_rate": 9.963012121536635e-06, |
|
"loss": 0.3284, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 5.284015852047556, |
|
"grad_norm": 2.351313591003418, |
|
"learning_rate": 9.96041521773472e-06, |
|
"loss": 0.328, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 5.284015852047556, |
|
"eval_bleu": 40.737027833366824, |
|
"eval_char_accuracy": 42.49156933706202, |
|
"eval_loss": 0.23189863562583923, |
|
"eval_runtime": 320.7889, |
|
"eval_samples_per_second": 4.72, |
|
"eval_steps_per_second": 0.592, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 5.416116248348745, |
|
"grad_norm": 3.0100066661834717, |
|
"learning_rate": 9.95773057831617e-06, |
|
"loss": 0.311, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 5.5482166446499335, |
|
"grad_norm": 2.049722671508789, |
|
"learning_rate": 9.954958250764604e-06, |
|
"loss": 0.3136, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 5.680317040951123, |
|
"grad_norm": 2.1132702827453613, |
|
"learning_rate": 9.952098284114604e-06, |
|
"loss": 0.3, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 5.812417437252312, |
|
"grad_norm": 2.174574613571167, |
|
"learning_rate": 9.949150728950833e-06, |
|
"loss": 0.3093, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 5.9445178335535, |
|
"grad_norm": 2.8350446224212646, |
|
"learning_rate": 9.946115637407145e-06, |
|
"loss": 0.2988, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 5.9445178335535, |
|
"eval_bleu": 43.8559984358133, |
|
"eval_char_accuracy": 44.56273646981411, |
|
"eval_loss": 0.21132107079029083, |
|
"eval_runtime": 319.4845, |
|
"eval_samples_per_second": 4.739, |
|
"eval_steps_per_second": 0.595, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 6.07661822985469, |
|
"grad_norm": 2.289716958999634, |
|
"learning_rate": 9.94299306316567e-06, |
|
"loss": 0.2938, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 6.208718626155878, |
|
"grad_norm": 2.9307034015655518, |
|
"learning_rate": 9.939783061455845e-06, |
|
"loss": 0.2814, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 6.340819022457067, |
|
"grad_norm": 2.3431613445281982, |
|
"learning_rate": 9.936485689053462e-06, |
|
"loss": 0.2782, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 6.472919418758257, |
|
"grad_norm": 2.2339768409729004, |
|
"learning_rate": 9.933101004279647e-06, |
|
"loss": 0.2752, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 6.605019815059445, |
|
"grad_norm": 2.076145887374878, |
|
"learning_rate": 9.92962906699983e-06, |
|
"loss": 0.265, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 6.605019815059445, |
|
"eval_bleu": 47.09645003475602, |
|
"eval_char_accuracy": 46.29205050172725, |
|
"eval_loss": 0.1954895406961441, |
|
"eval_runtime": 322.9917, |
|
"eval_samples_per_second": 4.687, |
|
"eval_steps_per_second": 0.588, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 6.737120211360634, |
|
"grad_norm": 1.715720295906067, |
|
"learning_rate": 9.926069938622698e-06, |
|
"loss": 0.266, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 6.869220607661823, |
|
"grad_norm": 2.501234531402588, |
|
"learning_rate": 9.922423682099088e-06, |
|
"loss": 0.2633, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 7.001321003963012, |
|
"grad_norm": 1.929123044013977, |
|
"learning_rate": 9.918690361920898e-06, |
|
"loss": 0.2584, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 7.133421400264201, |
|
"grad_norm": 2.0370264053344727, |
|
"learning_rate": 9.914870044119924e-06, |
|
"loss": 0.2451, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 7.265521796565389, |
|
"grad_norm": 2.3562278747558594, |
|
"learning_rate": 9.91096279626671e-06, |
|
"loss": 0.2476, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 7.265521796565389, |
|
"eval_bleu": 48.90758910970442, |
|
"eval_char_accuracy": 46.93308932390195, |
|
"eval_loss": 0.18339309096336365, |
|
"eval_runtime": 328.302, |
|
"eval_samples_per_second": 4.612, |
|
"eval_steps_per_second": 0.579, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 7.397622192866579, |
|
"grad_norm": 2.410529613494873, |
|
"learning_rate": 9.90696868746934e-06, |
|
"loss": 0.2419, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 7.5297225891677675, |
|
"grad_norm": 1.685939908027649, |
|
"learning_rate": 9.902887788372223e-06, |
|
"loss": 0.2448, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 7.661822985468956, |
|
"grad_norm": 2.3180549144744873, |
|
"learning_rate": 9.89872017115484e-06, |
|
"loss": 0.2379, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 7.793923381770146, |
|
"grad_norm": 2.4159021377563477, |
|
"learning_rate": 9.894465909530471e-06, |
|
"loss": 0.2339, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 7.926023778071334, |
|
"grad_norm": 2.0757477283477783, |
|
"learning_rate": 9.890125078744884e-06, |
|
"loss": 0.2356, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 7.926023778071334, |
|
"eval_bleu": 51.273473767746786, |
|
"eval_char_accuracy": 49.088563086033886, |
|
"eval_loss": 0.1728673279285431, |
|
"eval_runtime": 317.183, |
|
"eval_samples_per_second": 4.773, |
|
"eval_steps_per_second": 0.599, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 8.058124174372523, |
|
"grad_norm": 1.9405860900878906, |
|
"learning_rate": 9.885697755575015e-06, |
|
"loss": 0.2251, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 8.190224570673712, |
|
"grad_norm": 1.7473342418670654, |
|
"learning_rate": 9.881184018327597e-06, |
|
"loss": 0.2195, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 8.3223249669749, |
|
"grad_norm": 1.7633724212646484, |
|
"learning_rate": 9.876583946837787e-06, |
|
"loss": 0.219, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 8.45442536327609, |
|
"grad_norm": 2.1117053031921387, |
|
"learning_rate": 9.871897622467748e-06, |
|
"loss": 0.2148, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 8.58652575957728, |
|
"grad_norm": 2.114854574203491, |
|
"learning_rate": 9.867125128105211e-06, |
|
"loss": 0.2222, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 8.58652575957728, |
|
"eval_bleu": 53.15738681385049, |
|
"eval_char_accuracy": 49.680251686132586, |
|
"eval_loss": 0.1632871925830841, |
|
"eval_runtime": 325.9103, |
|
"eval_samples_per_second": 4.645, |
|
"eval_steps_per_second": 0.583, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 8.718626155878468, |
|
"grad_norm": 2.6163322925567627, |
|
"learning_rate": 9.862266548162008e-06, |
|
"loss": 0.2141, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 8.850726552179657, |
|
"grad_norm": 2.1705501079559326, |
|
"learning_rate": 9.857321968572577e-06, |
|
"loss": 0.2126, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 8.982826948480845, |
|
"grad_norm": 2.4352259635925293, |
|
"learning_rate": 9.85229147679245e-06, |
|
"loss": 0.2124, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 9.114927344782034, |
|
"grad_norm": 1.912975788116455, |
|
"learning_rate": 9.847175161796696e-06, |
|
"loss": 0.2032, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 9.247027741083222, |
|
"grad_norm": 1.7575359344482422, |
|
"learning_rate": 9.841973114078358e-06, |
|
"loss": 0.2005, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 9.247027741083222, |
|
"eval_bleu": 54.81436388056976, |
|
"eval_char_accuracy": 50.20562592531667, |
|
"eval_loss": 0.1546466052532196, |
|
"eval_runtime": 319.5902, |
|
"eval_samples_per_second": 4.737, |
|
"eval_steps_per_second": 0.595, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 9.379128137384413, |
|
"grad_norm": 1.6994798183441162, |
|
"learning_rate": 9.836685425646842e-06, |
|
"loss": 0.1929, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 9.511228533685602, |
|
"grad_norm": 1.8375500440597534, |
|
"learning_rate": 9.831312190026295e-06, |
|
"loss": 0.1954, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 9.64332892998679, |
|
"grad_norm": 2.735320568084717, |
|
"learning_rate": 9.825853502253951e-06, |
|
"loss": 0.1949, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 9.775429326287979, |
|
"grad_norm": 1.9880143404006958, |
|
"learning_rate": 9.820309458878447e-06, |
|
"loss": 0.196, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 9.907529722589167, |
|
"grad_norm": 3.1160881519317627, |
|
"learning_rate": 9.814680157958122e-06, |
|
"loss": 0.1957, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 9.907529722589167, |
|
"eval_bleu": 56.684180257446236, |
|
"eval_char_accuracy": 51.796656522454356, |
|
"eval_loss": 0.14744216203689575, |
|
"eval_runtime": 318.7305, |
|
"eval_samples_per_second": 4.75, |
|
"eval_steps_per_second": 0.596, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 10.039630118890356, |
|
"grad_norm": 1.975994348526001, |
|
"learning_rate": 9.808965699059276e-06, |
|
"loss": 0.1964, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 10.171730515191545, |
|
"grad_norm": 1.6857510805130005, |
|
"learning_rate": 9.80316618325441e-06, |
|
"loss": 0.1832, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 10.303830911492735, |
|
"grad_norm": 1.6473827362060547, |
|
"learning_rate": 9.797281713120438e-06, |
|
"loss": 0.1846, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 10.435931307793924, |
|
"grad_norm": 2.1330363750457764, |
|
"learning_rate": 9.79131239273688e-06, |
|
"loss": 0.1783, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 10.568031704095112, |
|
"grad_norm": 2.0598771572113037, |
|
"learning_rate": 9.785258327684007e-06, |
|
"loss": 0.183, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 10.568031704095112, |
|
"eval_bleu": 57.92928420103779, |
|
"eval_char_accuracy": 51.580235236058556, |
|
"eval_loss": 0.1413801610469818, |
|
"eval_runtime": 318.3694, |
|
"eval_samples_per_second": 4.755, |
|
"eval_steps_per_second": 0.597, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 10.700132100396301, |
|
"grad_norm": 1.8669029474258423, |
|
"learning_rate": 9.779119625040988e-06, |
|
"loss": 0.1801, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 10.83223249669749, |
|
"grad_norm": 1.967623233795166, |
|
"learning_rate": 9.772896393383991e-06, |
|
"loss": 0.1772, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 10.964332892998678, |
|
"grad_norm": 2.0888118743896484, |
|
"learning_rate": 9.766588742784255e-06, |
|
"loss": 0.1741, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 11.096433289299869, |
|
"grad_norm": 1.8615264892578125, |
|
"learning_rate": 9.760196784806155e-06, |
|
"loss": 0.1733, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 11.228533685601057, |
|
"grad_norm": 1.786023736000061, |
|
"learning_rate": 9.753720632505219e-06, |
|
"loss": 0.171, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 11.228533685601057, |
|
"eval_bleu": 58.97794662773473, |
|
"eval_char_accuracy": 53.53111120250041, |
|
"eval_loss": 0.13721999526023865, |
|
"eval_runtime": 310.3442, |
|
"eval_samples_per_second": 4.878, |
|
"eval_steps_per_second": 0.612, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 11.360634081902246, |
|
"grad_norm": 2.22037935256958, |
|
"learning_rate": 9.74716040042614e-06, |
|
"loss": 0.169, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 11.492734478203435, |
|
"grad_norm": 2.2769389152526855, |
|
"learning_rate": 9.740516204600734e-06, |
|
"loss": 0.1631, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 11.624834874504623, |
|
"grad_norm": 2.1513566970825195, |
|
"learning_rate": 9.733788162545902e-06, |
|
"loss": 0.1669, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 11.756935270805812, |
|
"grad_norm": 1.640358328819275, |
|
"learning_rate": 9.726976393261547e-06, |
|
"loss": 0.1674, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 11.889035667107, |
|
"grad_norm": 1.6976934671401978, |
|
"learning_rate": 9.720081017228462e-06, |
|
"loss": 0.1646, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 11.889035667107, |
|
"eval_bleu": 60.38328284328657, |
|
"eval_char_accuracy": 54.70215084717881, |
|
"eval_loss": 0.13088105618953705, |
|
"eval_runtime": 316.5706, |
|
"eval_samples_per_second": 4.783, |
|
"eval_steps_per_second": 0.6, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 12.021136063408191, |
|
"grad_norm": 1.7420779466629028, |
|
"learning_rate": 9.713102156406213e-06, |
|
"loss": 0.1629, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 12.15323645970938, |
|
"grad_norm": 1.9723796844482422, |
|
"learning_rate": 9.706039934230967e-06, |
|
"loss": 0.1578, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 12.285336856010568, |
|
"grad_norm": 2.3517324924468994, |
|
"learning_rate": 9.698894475613323e-06, |
|
"loss": 0.1561, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 12.417437252311757, |
|
"grad_norm": 1.5132865905761719, |
|
"learning_rate": 9.691665906936088e-06, |
|
"loss": 0.157, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 12.549537648612946, |
|
"grad_norm": 2.435624599456787, |
|
"learning_rate": 9.684354356052055e-06, |
|
"loss": 0.1538, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 12.549537648612946, |
|
"eval_bleu": 61.08870144349316, |
|
"eval_char_accuracy": 55.10055107747986, |
|
"eval_loss": 0.12675440311431885, |
|
"eval_runtime": 314.3152, |
|
"eval_samples_per_second": 4.817, |
|
"eval_steps_per_second": 0.604, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 12.681638044914134, |
|
"grad_norm": 2.092256784439087, |
|
"learning_rate": 9.676959952281733e-06, |
|
"loss": 0.1518, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 12.813738441215325, |
|
"grad_norm": 1.7985179424285889, |
|
"learning_rate": 9.669482826411065e-06, |
|
"loss": 0.158, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 12.945838837516513, |
|
"grad_norm": 1.7571889162063599, |
|
"learning_rate": 9.66192311068911e-06, |
|
"loss": 0.152, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 13.077939233817702, |
|
"grad_norm": 1.3526843786239624, |
|
"learning_rate": 9.654280938825705e-06, |
|
"loss": 0.1426, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 13.21003963011889, |
|
"grad_norm": 1.6651784181594849, |
|
"learning_rate": 9.646556445989106e-06, |
|
"loss": 0.1476, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 13.21003963011889, |
|
"eval_bleu": 61.91608356246592, |
|
"eval_char_accuracy": 55.006477216647475, |
|
"eval_loss": 0.12389995902776718, |
|
"eval_runtime": 315.2745, |
|
"eval_samples_per_second": 4.802, |
|
"eval_steps_per_second": 0.603, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 13.34214002642008, |
|
"grad_norm": 1.5739635229110718, |
|
"learning_rate": 9.63874976880359e-06, |
|
"loss": 0.148, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 13.474240422721268, |
|
"grad_norm": 1.779477834701538, |
|
"learning_rate": 9.63086104534704e-06, |
|
"loss": 0.1469, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 13.606340819022456, |
|
"grad_norm": 1.5422449111938477, |
|
"learning_rate": 9.622890415148505e-06, |
|
"loss": 0.143, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 13.738441215323647, |
|
"grad_norm": 1.80446457862854, |
|
"learning_rate": 9.61483801918573e-06, |
|
"loss": 0.1424, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 13.870541611624835, |
|
"grad_norm": 1.7641818523406982, |
|
"learning_rate": 9.606703999882667e-06, |
|
"loss": 0.1406, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 13.870541611624835, |
|
"eval_bleu": 62.97532147351367, |
|
"eval_char_accuracy": 56.364636453364035, |
|
"eval_loss": 0.12048687040805817, |
|
"eval_runtime": 315.4459, |
|
"eval_samples_per_second": 4.8, |
|
"eval_steps_per_second": 0.602, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 14.002642007926024, |
|
"grad_norm": 1.5506337881088257, |
|
"learning_rate": 9.598488501106947e-06, |
|
"loss": 0.1436, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 14.134742404227213, |
|
"grad_norm": 1.338537335395813, |
|
"learning_rate": 9.590191668167343e-06, |
|
"loss": 0.1396, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 14.266842800528401, |
|
"grad_norm": 1.9432475566864014, |
|
"learning_rate": 9.581813647811199e-06, |
|
"loss": 0.1427, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 14.39894319682959, |
|
"grad_norm": 1.8222265243530273, |
|
"learning_rate": 9.573354588221833e-06, |
|
"loss": 0.1352, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 14.531043593130779, |
|
"grad_norm": 1.7141766548156738, |
|
"learning_rate": 9.564814639015915e-06, |
|
"loss": 0.1361, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 14.531043593130779, |
|
"eval_bleu": 63.45481969831487, |
|
"eval_char_accuracy": 56.61755634150354, |
|
"eval_loss": 0.11702162027359009, |
|
"eval_runtime": 318.0259, |
|
"eval_samples_per_second": 4.761, |
|
"eval_steps_per_second": 0.597, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 14.663143989431969, |
|
"grad_norm": 1.473866581916809, |
|
"learning_rate": 9.556193951240821e-06, |
|
"loss": 0.1302, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 14.795244385733158, |
|
"grad_norm": 1.3840774297714233, |
|
"learning_rate": 9.547492677371968e-06, |
|
"loss": 0.1355, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 14.927344782034346, |
|
"grad_norm": 1.8060030937194824, |
|
"learning_rate": 9.538710971310104e-06, |
|
"loss": 0.1332, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 15.059445178335535, |
|
"grad_norm": 1.8242031335830688, |
|
"learning_rate": 9.529848988378597e-06, |
|
"loss": 0.1247, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 15.191545574636724, |
|
"grad_norm": 1.3062950372695923, |
|
"learning_rate": 9.520906885320682e-06, |
|
"loss": 0.1295, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 15.191545574636724, |
|
"eval_bleu": 64.22910948855701, |
|
"eval_char_accuracy": 57.40099111696003, |
|
"eval_loss": 0.11363621801137924, |
|
"eval_runtime": 315.2122, |
|
"eval_samples_per_second": 4.803, |
|
"eval_steps_per_second": 0.603, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 15.323645970937912, |
|
"grad_norm": 1.5021114349365234, |
|
"learning_rate": 9.511884820296695e-06, |
|
"loss": 0.1292, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 15.455746367239101, |
|
"grad_norm": 1.8947999477386475, |
|
"learning_rate": 9.502782952881268e-06, |
|
"loss": 0.128, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 15.587846763540291, |
|
"grad_norm": 1.5116643905639648, |
|
"learning_rate": 9.493601444060514e-06, |
|
"loss": 0.1276, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 15.71994715984148, |
|
"grad_norm": 1.5134871006011963, |
|
"learning_rate": 9.48434045622917e-06, |
|
"loss": 0.132, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 15.852047556142669, |
|
"grad_norm": 1.7438267469406128, |
|
"learning_rate": 9.475000153187733e-06, |
|
"loss": 0.1243, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 15.852047556142669, |
|
"eval_bleu": 65.05289431113529, |
|
"eval_char_accuracy": 57.989081263365684, |
|
"eval_loss": 0.1115257665514946, |
|
"eval_runtime": 309.8437, |
|
"eval_samples_per_second": 4.886, |
|
"eval_steps_per_second": 0.613, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 15.984147952443857, |
|
"grad_norm": 1.913855791091919, |
|
"learning_rate": 9.46558070013956e-06, |
|
"loss": 0.1261, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 16.116248348745046, |
|
"grad_norm": 1.4257742166519165, |
|
"learning_rate": 9.456082263687946e-06, |
|
"loss": 0.117, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 16.248348745046236, |
|
"grad_norm": 1.49489164352417, |
|
"learning_rate": 9.44650501183318e-06, |
|
"loss": 0.1232, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 16.380449141347423, |
|
"grad_norm": 1.7977708578109741, |
|
"learning_rate": 9.436849113969567e-06, |
|
"loss": 0.1212, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 16.512549537648614, |
|
"grad_norm": 1.2978798151016235, |
|
"learning_rate": 9.427212472501483e-06, |
|
"loss": 0.122, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 16.512549537648614, |
|
"eval_bleu": 65.59844085659223, |
|
"eval_char_accuracy": 58.32784997532489, |
|
"eval_loss": 0.10947112739086151, |
|
"eval_runtime": 332.8226, |
|
"eval_samples_per_second": 4.549, |
|
"eval_steps_per_second": 0.571, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 16.6446499339498, |
|
"grad_norm": 1.5628472566604614, |
|
"learning_rate": 9.417400578537868e-06, |
|
"loss": 0.1219, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 16.77675033025099, |
|
"grad_norm": 1.5304769277572632, |
|
"learning_rate": 9.407510553339931e-06, |
|
"loss": 0.1192, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 16.90885072655218, |
|
"grad_norm": 1.6370787620544434, |
|
"learning_rate": 9.397542571834054e-06, |
|
"loss": 0.1181, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 17.040951122853368, |
|
"grad_norm": 1.5388678312301636, |
|
"learning_rate": 9.387496810325436e-06, |
|
"loss": 0.1137, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 17.17305151915456, |
|
"grad_norm": 1.8251888751983643, |
|
"learning_rate": 9.377373446494984e-06, |
|
"loss": 0.1122, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 17.17305151915456, |
|
"eval_bleu": 66.79852257149867, |
|
"eval_char_accuracy": 58.47692877117947, |
|
"eval_loss": 0.1069813147187233, |
|
"eval_runtime": 350.9901, |
|
"eval_samples_per_second": 4.314, |
|
"eval_steps_per_second": 0.541, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 17.305151915455745, |
|
"grad_norm": 1.6730031967163086, |
|
"learning_rate": 9.367172659396172e-06, |
|
"loss": 0.1123, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 17.437252311756936, |
|
"grad_norm": 1.2402830123901367, |
|
"learning_rate": 9.35689462945187e-06, |
|
"loss": 0.1125, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 17.569352708058123, |
|
"grad_norm": 1.9425466060638428, |
|
"learning_rate": 9.34653953845115e-06, |
|
"loss": 0.1109, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 17.701453104359313, |
|
"grad_norm": 3.105457067489624, |
|
"learning_rate": 9.33610756954608e-06, |
|
"loss": 0.1161, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 17.833553500660503, |
|
"grad_norm": 1.679442048072815, |
|
"learning_rate": 9.325598907248478e-06, |
|
"loss": 0.1131, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 17.833553500660503, |
|
"eval_bleu": 66.72469802657548, |
|
"eval_char_accuracy": 59.26910264846191, |
|
"eval_loss": 0.10426344722509384, |
|
"eval_runtime": 353.0152, |
|
"eval_samples_per_second": 4.289, |
|
"eval_steps_per_second": 0.538, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 17.96565389696169, |
|
"grad_norm": 1.1806349754333496, |
|
"learning_rate": 9.315013737426645e-06, |
|
"loss": 0.115, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 18.09775429326288, |
|
"grad_norm": 1.5437259674072266, |
|
"learning_rate": 9.304352247302091e-06, |
|
"loss": 0.1071, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 18.229854689564068, |
|
"grad_norm": 3.9584083557128906, |
|
"learning_rate": 9.293614625446205e-06, |
|
"loss": 0.11, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 18.361955085865258, |
|
"grad_norm": 1.4071292877197266, |
|
"learning_rate": 9.282801061776937e-06, |
|
"loss": 0.1093, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 18.494055482166445, |
|
"grad_norm": 1.599787950515747, |
|
"learning_rate": 9.271911747555425e-06, |
|
"loss": 0.1057, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 18.494055482166445, |
|
"eval_bleu": 67.26763967365643, |
|
"eval_char_accuracy": 58.966318473433134, |
|
"eval_loss": 0.10266197472810745, |
|
"eval_runtime": 318.6046, |
|
"eval_samples_per_second": 4.752, |
|
"eval_steps_per_second": 0.596, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 18.626155878467635, |
|
"grad_norm": 1.3243365287780762, |
|
"learning_rate": 9.260946875382624e-06, |
|
"loss": 0.1054, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 18.758256274768826, |
|
"grad_norm": 1.8453656435012817, |
|
"learning_rate": 9.249906639195894e-06, |
|
"loss": 0.1096, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 18.890356671070013, |
|
"grad_norm": 1.2308754920959473, |
|
"learning_rate": 9.238791234265565e-06, |
|
"loss": 0.1045, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 19.022457067371203, |
|
"grad_norm": 1.3015666007995605, |
|
"learning_rate": 9.22760085719149e-06, |
|
"loss": 0.1065, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 19.15455746367239, |
|
"grad_norm": 1.3568576574325562, |
|
"learning_rate": 9.21633570589957e-06, |
|
"loss": 0.1024, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 19.15455746367239, |
|
"eval_bleu": 67.65790078495372, |
|
"eval_char_accuracy": 59.99547622964303, |
|
"eval_loss": 0.10274580866098404, |
|
"eval_runtime": 317.6719, |
|
"eval_samples_per_second": 4.766, |
|
"eval_steps_per_second": 0.598, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 19.28665785997358, |
|
"grad_norm": 1.6260790824890137, |
|
"learning_rate": 9.204995979638241e-06, |
|
"loss": 0.1025, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 19.418758256274767, |
|
"grad_norm": 1.1852062940597534, |
|
"learning_rate": 9.193581878974964e-06, |
|
"loss": 0.101, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 19.550858652575958, |
|
"grad_norm": 2.5521767139434814, |
|
"learning_rate": 9.18209360579267e-06, |
|
"loss": 0.0999, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 19.682959048877148, |
|
"grad_norm": 1.0231155157089233, |
|
"learning_rate": 9.17053136328619e-06, |
|
"loss": 0.1055, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 19.815059445178335, |
|
"grad_norm": 1.3518396615982056, |
|
"learning_rate": 9.15889535595866e-06, |
|
"loss": 0.1014, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 19.815059445178335, |
|
"eval_bleu": 68.43528395628573, |
|
"eval_char_accuracy": 61.64870866918901, |
|
"eval_loss": 0.09970895200967789, |
|
"eval_runtime": 314.0763, |
|
"eval_samples_per_second": 4.82, |
|
"eval_steps_per_second": 0.605, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 19.947159841479525, |
|
"grad_norm": 1.2678903341293335, |
|
"learning_rate": 9.147185789617907e-06, |
|
"loss": 0.1005, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 20.079260237780712, |
|
"grad_norm": 1.2439242601394653, |
|
"learning_rate": 9.13540287137281e-06, |
|
"loss": 0.0989, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 20.211360634081903, |
|
"grad_norm": 1.3169798851013184, |
|
"learning_rate": 9.123546809629632e-06, |
|
"loss": 0.1006, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 20.34346103038309, |
|
"grad_norm": 1.3063526153564453, |
|
"learning_rate": 9.111617814088332e-06, |
|
"loss": 0.0966, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 20.47556142668428, |
|
"grad_norm": 1.8397212028503418, |
|
"learning_rate": 9.099616095738867e-06, |
|
"loss": 0.0965, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 20.47556142668428, |
|
"eval_bleu": 68.40618966106221, |
|
"eval_char_accuracy": 60.30751357131107, |
|
"eval_loss": 0.09965521842241287, |
|
"eval_runtime": 315.776, |
|
"eval_samples_per_second": 4.795, |
|
"eval_steps_per_second": 0.602, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 20.60766182298547, |
|
"grad_norm": 1.544118046760559, |
|
"learning_rate": 9.087541866857453e-06, |
|
"loss": 0.0954, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 20.739762219286657, |
|
"grad_norm": 1.5378237962722778, |
|
"learning_rate": 9.075395341002804e-06, |
|
"loss": 0.0975, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 20.871862615587848, |
|
"grad_norm": 1.0197510719299316, |
|
"learning_rate": 9.06317673301237e-06, |
|
"loss": 0.0964, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 21.003963011889034, |
|
"grad_norm": 0.9621543884277344, |
|
"learning_rate": 9.05088625899852e-06, |
|
"loss": 0.0925, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 21.136063408190225, |
|
"grad_norm": 1.356550931930542, |
|
"learning_rate": 9.038524136344736e-06, |
|
"loss": 0.0917, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 21.136063408190225, |
|
"eval_bleu": 69.05043210174456, |
|
"eval_char_accuracy": 60.879153643691396, |
|
"eval_loss": 0.0960288867354393, |
|
"eval_runtime": 317.6216, |
|
"eval_samples_per_second": 4.767, |
|
"eval_steps_per_second": 0.598, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 21.268163804491415, |
|
"grad_norm": 2.0014472007751465, |
|
"learning_rate": 9.026090583701755e-06, |
|
"loss": 0.0962, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 21.400264200792602, |
|
"grad_norm": 1.4762715101242065, |
|
"learning_rate": 9.013585820983713e-06, |
|
"loss": 0.0917, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 21.532364597093792, |
|
"grad_norm": 1.242245078086853, |
|
"learning_rate": 9.001010069364241e-06, |
|
"loss": 0.0907, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 21.66446499339498, |
|
"grad_norm": 1.9390721321105957, |
|
"learning_rate": 8.98836355127257e-06, |
|
"loss": 0.0918, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 21.79656538969617, |
|
"grad_norm": 1.070357322692871, |
|
"learning_rate": 8.975646490389581e-06, |
|
"loss": 0.0903, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 21.79656538969617, |
|
"eval_bleu": 69.68531857632678, |
|
"eval_char_accuracy": 62.48920463892087, |
|
"eval_loss": 0.09554192423820496, |
|
"eval_runtime": 335.0637, |
|
"eval_samples_per_second": 4.519, |
|
"eval_steps_per_second": 0.567, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 21.928665785997357, |
|
"grad_norm": 1.8990777730941772, |
|
"learning_rate": 8.962859111643862e-06, |
|
"loss": 0.0946, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 22.060766182298547, |
|
"grad_norm": 1.6244333982467651, |
|
"learning_rate": 8.950001641207719e-06, |
|
"loss": 0.0895, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 22.192866578599737, |
|
"grad_norm": 1.6511205434799194, |
|
"learning_rate": 8.937074306493187e-06, |
|
"loss": 0.0907, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 22.324966974900924, |
|
"grad_norm": 1.421342372894287, |
|
"learning_rate": 8.924077336147992e-06, |
|
"loss": 0.0864, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 22.457067371202115, |
|
"grad_norm": 1.572800874710083, |
|
"learning_rate": 8.911010960051522e-06, |
|
"loss": 0.088, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 22.457067371202115, |
|
"eval_bleu": 69.94746089002493, |
|
"eval_char_accuracy": 61.5572051324231, |
|
"eval_loss": 0.09443064033985138, |
|
"eval_runtime": 345.5738, |
|
"eval_samples_per_second": 4.381, |
|
"eval_steps_per_second": 0.55, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 22.5891677675033, |
|
"grad_norm": 1.2412383556365967, |
|
"learning_rate": 8.897875409310755e-06, |
|
"loss": 0.085, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 22.721268163804492, |
|
"grad_norm": 1.4429903030395508, |
|
"learning_rate": 8.884803301685314e-06, |
|
"loss": 0.0908, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 22.85336856010568, |
|
"grad_norm": 1.1933690309524536, |
|
"learning_rate": 8.871530785794356e-06, |
|
"loss": 0.092, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 22.98546895640687, |
|
"grad_norm": 1.399186611175537, |
|
"learning_rate": 8.85818979355093e-06, |
|
"loss": 0.0837, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 23.11756935270806, |
|
"grad_norm": 1.2939783334732056, |
|
"learning_rate": 8.844780560919194e-06, |
|
"loss": 0.0871, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 23.11756935270806, |
|
"eval_bleu": 70.43896205976631, |
|
"eval_char_accuracy": 61.825032900148045, |
|
"eval_loss": 0.0947960913181305, |
|
"eval_runtime": 324.1925, |
|
"eval_samples_per_second": 4.67, |
|
"eval_steps_per_second": 0.586, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 23.249669749009247, |
|
"grad_norm": 1.2098222970962524, |
|
"learning_rate": 8.831303325070279e-06, |
|
"loss": 0.0827, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 23.381770145310437, |
|
"grad_norm": 1.5045851469039917, |
|
"learning_rate": 8.8177583243781e-06, |
|
"loss": 0.0838, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 23.513870541611624, |
|
"grad_norm": 1.5295897722244263, |
|
"learning_rate": 8.80414579841514e-06, |
|
"loss": 0.0858, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 23.645970937912814, |
|
"grad_norm": 1.4860461950302124, |
|
"learning_rate": 8.790465987948212e-06, |
|
"loss": 0.0875, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 23.778071334214, |
|
"grad_norm": 1.4711731672286987, |
|
"learning_rate": 8.776719134934199e-06, |
|
"loss": 0.0828, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 23.778071334214, |
|
"eval_bleu": 70.5623767627479, |
|
"eval_char_accuracy": 62.505140648132915, |
|
"eval_loss": 0.0924154594540596, |
|
"eval_runtime": 342.2412, |
|
"eval_samples_per_second": 4.424, |
|
"eval_steps_per_second": 0.555, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 23.91017173051519, |
|
"grad_norm": 1.4447731971740723, |
|
"learning_rate": 8.762905482515775e-06, |
|
"loss": 0.0814, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 24.042272126816382, |
|
"grad_norm": 1.350907802581787, |
|
"learning_rate": 8.749025275017107e-06, |
|
"loss": 0.0806, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 24.17437252311757, |
|
"grad_norm": 1.7207551002502441, |
|
"learning_rate": 8.735078757939532e-06, |
|
"loss": 0.08, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 24.30647291941876, |
|
"grad_norm": 1.0851505994796753, |
|
"learning_rate": 8.721066177957213e-06, |
|
"loss": 0.0779, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 24.438573315719946, |
|
"grad_norm": 1.2182328701019287, |
|
"learning_rate": 8.70698778291278e-06, |
|
"loss": 0.0814, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 24.438573315719946, |
|
"eval_bleu": 70.99257164437572, |
|
"eval_char_accuracy": 62.22189093600922, |
|
"eval_loss": 0.09152651578187943, |
|
"eval_runtime": 323.7174, |
|
"eval_samples_per_second": 4.677, |
|
"eval_steps_per_second": 0.587, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 24.570673712021136, |
|
"grad_norm": 2.0363285541534424, |
|
"learning_rate": 8.69284382181294e-06, |
|
"loss": 0.0821, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 24.702774108322323, |
|
"grad_norm": 1.3864432573318481, |
|
"learning_rate": 8.67863454482408e-06, |
|
"loss": 0.0809, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 24.834874504623514, |
|
"grad_norm": 2.032351493835449, |
|
"learning_rate": 8.664360203267838e-06, |
|
"loss": 0.0819, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 24.966974900924704, |
|
"grad_norm": 1.2007182836532593, |
|
"learning_rate": 8.65002104961666e-06, |
|
"loss": 0.0819, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 25.09907529722589, |
|
"grad_norm": 1.167693853378296, |
|
"learning_rate": 8.635617337489331e-06, |
|
"loss": 0.0778, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 25.09907529722589, |
|
"eval_bleu": 70.99070971451881, |
|
"eval_char_accuracy": 63.15440450732028, |
|
"eval_loss": 0.09109245985746384, |
|
"eval_runtime": 331.1976, |
|
"eval_samples_per_second": 4.571, |
|
"eval_steps_per_second": 0.574, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 25.23117569352708, |
|
"grad_norm": 1.9939295053482056, |
|
"learning_rate": 8.621149321646495e-06, |
|
"loss": 0.076, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 25.36327608982827, |
|
"grad_norm": 1.148555874824524, |
|
"learning_rate": 8.60661725798614e-06, |
|
"loss": 0.078, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 25.49537648612946, |
|
"grad_norm": 1.159621238708496, |
|
"learning_rate": 8.592167677001219e-06, |
|
"loss": 0.0823, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 25.627476882430646, |
|
"grad_norm": 1.7136179208755493, |
|
"learning_rate": 8.57750892397125e-06, |
|
"loss": 0.0755, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 25.759577278731836, |
|
"grad_norm": 1.105714201927185, |
|
"learning_rate": 8.5627868949981e-06, |
|
"loss": 0.0756, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 25.759577278731836, |
|
"eval_bleu": 71.15813151741806, |
|
"eval_char_accuracy": 63.43405576575094, |
|
"eval_loss": 0.09030098468065262, |
|
"eval_runtime": 316.3492, |
|
"eval_samples_per_second": 4.786, |
|
"eval_steps_per_second": 0.601, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 25.891677675033026, |
|
"grad_norm": 1.6091099977493286, |
|
"learning_rate": 8.548001850472529e-06, |
|
"loss": 0.0778, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 26.023778071334213, |
|
"grad_norm": 1.1730881929397583, |
|
"learning_rate": 8.533154051899864e-06, |
|
"loss": 0.0787, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 26.155878467635404, |
|
"grad_norm": 1.2703460454940796, |
|
"learning_rate": 8.518243761895369e-06, |
|
"loss": 0.0711, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 26.28797886393659, |
|
"grad_norm": 1.3379662036895752, |
|
"learning_rate": 8.503271244179608e-06, |
|
"loss": 0.075, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 26.42007926023778, |
|
"grad_norm": 1.370871901512146, |
|
"learning_rate": 8.488236763573772e-06, |
|
"loss": 0.0717, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 26.42007926023778, |
|
"eval_bleu": 72.10017178272712, |
|
"eval_char_accuracy": 63.853018588583645, |
|
"eval_loss": 0.08871379494667053, |
|
"eval_runtime": 316.1038, |
|
"eval_samples_per_second": 4.79, |
|
"eval_steps_per_second": 0.601, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 26.552179656538968, |
|
"grad_norm": 1.3396387100219727, |
|
"learning_rate": 8.473140585995004e-06, |
|
"loss": 0.0726, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 26.68428005284016, |
|
"grad_norm": 1.1219794750213623, |
|
"learning_rate": 8.457982978451683e-06, |
|
"loss": 0.0754, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 26.81638044914135, |
|
"grad_norm": 1.0815324783325195, |
|
"learning_rate": 8.442764209038717e-06, |
|
"loss": 0.0745, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 26.948480845442536, |
|
"grad_norm": 1.4396206140518188, |
|
"learning_rate": 8.427484546932789e-06, |
|
"loss": 0.0749, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 27.080581241743726, |
|
"grad_norm": 1.2644987106323242, |
|
"learning_rate": 8.4121442623876e-06, |
|
"loss": 0.0731, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 27.080581241743726, |
|
"eval_bleu": 71.61514434619416, |
|
"eval_char_accuracy": 63.06290097055437, |
|
"eval_loss": 0.08903466165065765, |
|
"eval_runtime": 316.4287, |
|
"eval_samples_per_second": 4.785, |
|
"eval_steps_per_second": 0.6, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 27.212681638044913, |
|
"grad_norm": 1.3456913232803345, |
|
"learning_rate": 8.396743626729093e-06, |
|
"loss": 0.0728, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 27.344782034346103, |
|
"grad_norm": 1.1268248558044434, |
|
"learning_rate": 8.381282912350646e-06, |
|
"loss": 0.072, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 27.476882430647294, |
|
"grad_norm": 0.964856743812561, |
|
"learning_rate": 8.365762392708259e-06, |
|
"loss": 0.0711, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 27.60898282694848, |
|
"grad_norm": 1.2877197265625, |
|
"learning_rate": 8.350182342315719e-06, |
|
"loss": 0.0681, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 27.74108322324967, |
|
"grad_norm": 3.0796854496002197, |
|
"learning_rate": 8.334543036739743e-06, |
|
"loss": 0.0681, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 27.74108322324967, |
|
"eval_bleu": 72.44362762449254, |
|
"eval_char_accuracy": 64.56139990129955, |
|
"eval_loss": 0.08809462934732437, |
|
"eval_runtime": 319.538, |
|
"eval_samples_per_second": 4.738, |
|
"eval_steps_per_second": 0.595, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 27.873183619550858, |
|
"grad_norm": 1.445993185043335, |
|
"learning_rate": 8.3188447525951e-06, |
|
"loss": 0.0701, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 28.005284015852048, |
|
"grad_norm": 0.9414767622947693, |
|
"learning_rate": 8.303087767539723e-06, |
|
"loss": 0.0698, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 28.137384412153235, |
|
"grad_norm": 1.0644490718841553, |
|
"learning_rate": 8.28758923914531e-06, |
|
"loss": 0.0674, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 28.269484808454425, |
|
"grad_norm": 1.2708711624145508, |
|
"learning_rate": 8.27171684949204e-06, |
|
"loss": 0.0689, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 28.401585204755616, |
|
"grad_norm": 1.2177033424377441, |
|
"learning_rate": 8.25578659248641e-06, |
|
"loss": 0.0677, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 28.401585204755616, |
|
"eval_bleu": 72.7907604984095, |
|
"eval_char_accuracy": 64.78141964138838, |
|
"eval_loss": 0.0869230180978775, |
|
"eval_runtime": 315.3901, |
|
"eval_samples_per_second": 4.8, |
|
"eval_steps_per_second": 0.602, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 28.533685601056803, |
|
"grad_norm": 1.1179392337799072, |
|
"learning_rate": 8.239798749889293e-06, |
|
"loss": 0.0673, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 28.665785997357993, |
|
"grad_norm": 1.2472251653671265, |
|
"learning_rate": 8.223753604480086e-06, |
|
"loss": 0.0682, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 28.79788639365918, |
|
"grad_norm": 0.8336161375045776, |
|
"learning_rate": 8.207651440051714e-06, |
|
"loss": 0.0689, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 28.92998678996037, |
|
"grad_norm": 1.2652006149291992, |
|
"learning_rate": 8.1914925414056e-06, |
|
"loss": 0.0688, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 29.062087186261557, |
|
"grad_norm": 1.1424204111099243, |
|
"learning_rate": 8.175277194346636e-06, |
|
"loss": 0.0677, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 29.062087186261557, |
|
"eval_bleu": 72.84923297683062, |
|
"eval_char_accuracy": 64.63285491034709, |
|
"eval_loss": 0.08696427941322327, |
|
"eval_runtime": 316.7435, |
|
"eval_samples_per_second": 4.78, |
|
"eval_steps_per_second": 0.6, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 29.194187582562748, |
|
"grad_norm": 1.617885708808899, |
|
"learning_rate": 8.159005685678126e-06, |
|
"loss": 0.0638, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 29.326287978863938, |
|
"grad_norm": 1.2440978288650513, |
|
"learning_rate": 8.142678303196715e-06, |
|
"loss": 0.0606, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 29.458388375165125, |
|
"grad_norm": 1.1825751066207886, |
|
"learning_rate": 8.12629533568729e-06, |
|
"loss": 0.0661, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 29.590488771466315, |
|
"grad_norm": 1.5328364372253418, |
|
"learning_rate": 8.109857072917887e-06, |
|
"loss": 0.0647, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 29.722589167767502, |
|
"grad_norm": 1.1308438777923584, |
|
"learning_rate": 8.093363805634556e-06, |
|
"loss": 0.0666, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 29.722589167767502, |
|
"eval_bleu": 73.15661772633777, |
|
"eval_char_accuracy": 63.81086527389373, |
|
"eval_loss": 0.08648520708084106, |
|
"eval_runtime": 315.6708, |
|
"eval_samples_per_second": 4.796, |
|
"eval_steps_per_second": 0.602, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 29.854689564068693, |
|
"grad_norm": 1.0442135334014893, |
|
"learning_rate": 8.076815825556213e-06, |
|
"loss": 0.0648, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 29.98678996036988, |
|
"grad_norm": 1.363897681236267, |
|
"learning_rate": 8.060213425369492e-06, |
|
"loss": 0.0654, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 30.11889035667107, |
|
"grad_norm": 0.7751464247703552, |
|
"learning_rate": 8.043556898723568e-06, |
|
"loss": 0.0628, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 30.25099075297226, |
|
"grad_norm": 0.8685150742530823, |
|
"learning_rate": 8.026846540224956e-06, |
|
"loss": 0.0584, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 30.383091149273447, |
|
"grad_norm": 1.1484973430633545, |
|
"learning_rate": 8.0100826454323e-06, |
|
"loss": 0.0604, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 30.383091149273447, |
|
"eval_bleu": 73.25107285034876, |
|
"eval_char_accuracy": 65.01943164994243, |
|
"eval_loss": 0.08666232973337173, |
|
"eval_runtime": 323.7755, |
|
"eval_samples_per_second": 4.676, |
|
"eval_steps_per_second": 0.587, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 30.515191545574638, |
|
"grad_norm": 0.986289918422699, |
|
"learning_rate": 7.993265510851148e-06, |
|
"loss": 0.0688, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 30.647291941875825, |
|
"grad_norm": 1.0403423309326172, |
|
"learning_rate": 7.97639543392872e-06, |
|
"loss": 0.0638, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 30.779392338177015, |
|
"grad_norm": 1.517040729522705, |
|
"learning_rate": 7.959472713048617e-06, |
|
"loss": 0.0653, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 30.911492734478202, |
|
"grad_norm": 1.1347965002059937, |
|
"learning_rate": 7.942497647525576e-06, |
|
"loss": 0.0642, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 31.043593130779392, |
|
"grad_norm": 1.0789778232574463, |
|
"learning_rate": 7.925470537600155e-06, |
|
"loss": 0.0614, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 31.043593130779392, |
|
"eval_bleu": 73.23277401857816, |
|
"eval_char_accuracy": 65.0646693535121, |
|
"eval_loss": 0.08618722856044769, |
|
"eval_runtime": 313.4951, |
|
"eval_samples_per_second": 4.829, |
|
"eval_steps_per_second": 0.606, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 31.175693527080583, |
|
"grad_norm": 1.4853187799453735, |
|
"learning_rate": 7.908391684433432e-06, |
|
"loss": 0.0585, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 31.30779392338177, |
|
"grad_norm": 0.9656835794448853, |
|
"learning_rate": 7.891261390101675e-06, |
|
"loss": 0.0578, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 31.43989431968296, |
|
"grad_norm": 1.1521549224853516, |
|
"learning_rate": 7.874079957590997e-06, |
|
"loss": 0.0622, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 31.571994715984147, |
|
"grad_norm": 1.0636780261993408, |
|
"learning_rate": 7.856847690792002e-06, |
|
"loss": 0.0604, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 31.704095112285337, |
|
"grad_norm": 1.0833789110183716, |
|
"learning_rate": 7.839564894494409e-06, |
|
"loss": 0.0633, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 31.704095112285337, |
|
"eval_bleu": 73.62536575895216, |
|
"eval_char_accuracy": 65.15565882546471, |
|
"eval_loss": 0.08546082675457001, |
|
"eval_runtime": 318.0064, |
|
"eval_samples_per_second": 4.761, |
|
"eval_steps_per_second": 0.597, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 31.836195508586528, |
|
"grad_norm": 1.1620845794677734, |
|
"learning_rate": 7.822231874381658e-06, |
|
"loss": 0.0604, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 31.968295904887714, |
|
"grad_norm": 1.315012812614441, |
|
"learning_rate": 7.804848937025507e-06, |
|
"loss": 0.0593, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 32.1003963011889, |
|
"grad_norm": 0.8739562034606934, |
|
"learning_rate": 7.787416389880605e-06, |
|
"loss": 0.0608, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 32.23249669749009, |
|
"grad_norm": 0.9168538451194763, |
|
"learning_rate": 7.769934541279059e-06, |
|
"loss": 0.0577, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 32.36459709379128, |
|
"grad_norm": 0.9820032715797424, |
|
"learning_rate": 7.752403700424978e-06, |
|
"loss": 0.0569, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 32.36459709379128, |
|
"eval_bleu": 73.72149088930817, |
|
"eval_char_accuracy": 65.55457312057904, |
|
"eval_loss": 0.08627723157405853, |
|
"eval_runtime": 314.5801, |
|
"eval_samples_per_second": 4.813, |
|
"eval_steps_per_second": 0.604, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 32.49669749009247, |
|
"grad_norm": 1.0042686462402344, |
|
"learning_rate": 7.734824177389006e-06, |
|
"loss": 0.0582, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 32.628797886393656, |
|
"grad_norm": 1.251654863357544, |
|
"learning_rate": 7.71719628310283e-06, |
|
"loss": 0.0589, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 32.760898282694846, |
|
"grad_norm": 1.3150684833526611, |
|
"learning_rate": 7.699520329353694e-06, |
|
"loss": 0.0585, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 32.89299867899604, |
|
"grad_norm": 1.318556547164917, |
|
"learning_rate": 7.681796628778876e-06, |
|
"loss": 0.0588, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 33.02509907529723, |
|
"grad_norm": 1.2693874835968018, |
|
"learning_rate": 7.664025494860155e-06, |
|
"loss": 0.0605, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 33.02509907529723, |
|
"eval_bleu": 73.95631775899457, |
|
"eval_char_accuracy": 65.17056670505016, |
|
"eval_loss": 0.08447689563035965, |
|
"eval_runtime": 317.601, |
|
"eval_samples_per_second": 4.767, |
|
"eval_steps_per_second": 0.598, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 33.15719947159842, |
|
"grad_norm": 0.7866926193237305, |
|
"learning_rate": 7.646207241918272e-06, |
|
"loss": 0.055, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 33.2892998678996, |
|
"grad_norm": 1.0340533256530762, |
|
"learning_rate": 7.628342185107373e-06, |
|
"loss": 0.0563, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 33.42140026420079, |
|
"grad_norm": 1.6704190969467163, |
|
"learning_rate": 7.610430640409427e-06, |
|
"loss": 0.0568, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 33.55350066050198, |
|
"grad_norm": 1.4271676540374756, |
|
"learning_rate": 7.592472924628642e-06, |
|
"loss": 0.056, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 33.68560105680317, |
|
"grad_norm": 1.3886315822601318, |
|
"learning_rate": 7.574469355385865e-06, |
|
"loss": 0.0552, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 33.68560105680317, |
|
"eval_bleu": 73.67062952266826, |
|
"eval_char_accuracy": 65.04410676098043, |
|
"eval_loss": 0.08498267084360123, |
|
"eval_runtime": 314.7186, |
|
"eval_samples_per_second": 4.811, |
|
"eval_steps_per_second": 0.604, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 33.81770145310436, |
|
"grad_norm": 1.1037873029708862, |
|
"learning_rate": 7.556420251112956e-06, |
|
"loss": 0.0551, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 33.949801849405546, |
|
"grad_norm": 2.125624418258667, |
|
"learning_rate": 7.538325931047159e-06, |
|
"loss": 0.0591, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 34.081902245706736, |
|
"grad_norm": 1.674501895904541, |
|
"learning_rate": 7.52018671522546e-06, |
|
"loss": 0.0561, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 34.21400264200793, |
|
"grad_norm": 1.386206030845642, |
|
"learning_rate": 7.502002924478924e-06, |
|
"loss": 0.0509, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 34.34610303830912, |
|
"grad_norm": 1.0778214931488037, |
|
"learning_rate": 7.48377488042701e-06, |
|
"loss": 0.0544, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 34.34610303830912, |
|
"eval_bleu": 74.62046528623556, |
|
"eval_char_accuracy": 66.02442835992763, |
|
"eval_loss": 0.08464069664478302, |
|
"eval_runtime": 316.1374, |
|
"eval_samples_per_second": 4.789, |
|
"eval_steps_per_second": 0.601, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 34.4782034346103, |
|
"grad_norm": 0.8076276779174805, |
|
"learning_rate": 7.465502905471907e-06, |
|
"loss": 0.055, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 34.61030383091149, |
|
"grad_norm": 1.1508395671844482, |
|
"learning_rate": 7.447187322792806e-06, |
|
"loss": 0.057, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 34.74240422721268, |
|
"grad_norm": 1.2695698738098145, |
|
"learning_rate": 7.4288284563401945e-06, |
|
"loss": 0.055, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 34.87450462351387, |
|
"grad_norm": 1.166051983833313, |
|
"learning_rate": 7.410426630830131e-06, |
|
"loss": 0.0552, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 35.00660501981506, |
|
"grad_norm": 0.9517413973808289, |
|
"learning_rate": 7.391982171738496e-06, |
|
"loss": 0.0555, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 35.00660501981506, |
|
"eval_bleu": 74.18793581426324, |
|
"eval_char_accuracy": 65.63373910182597, |
|
"eval_loss": 0.08454510569572449, |
|
"eval_runtime": 313.1895, |
|
"eval_samples_per_second": 4.834, |
|
"eval_steps_per_second": 0.607, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 35.138705416116245, |
|
"grad_norm": 1.1265789270401, |
|
"learning_rate": 7.373495405295236e-06, |
|
"loss": 0.0529, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 35.270805812417436, |
|
"grad_norm": 1.0067466497421265, |
|
"learning_rate": 7.354966658478594e-06, |
|
"loss": 0.0502, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 35.402906208718626, |
|
"grad_norm": 0.9871610999107361, |
|
"learning_rate": 7.336396259009325e-06, |
|
"loss": 0.0508, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 35.53500660501982, |
|
"grad_norm": 1.4898390769958496, |
|
"learning_rate": 7.317784535344905e-06, |
|
"loss": 0.0544, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 35.66710700132101, |
|
"grad_norm": 1.1168763637542725, |
|
"learning_rate": 7.2991318166737126e-06, |
|
"loss": 0.0535, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 35.66710700132101, |
|
"eval_bleu": 74.4767569683976, |
|
"eval_char_accuracy": 65.44353512090805, |
|
"eval_loss": 0.08464961498975754, |
|
"eval_runtime": 313.4254, |
|
"eval_samples_per_second": 4.83, |
|
"eval_steps_per_second": 0.606, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 35.79920739762219, |
|
"grad_norm": 1.197704553604126, |
|
"learning_rate": 7.280625566954032e-06, |
|
"loss": 0.0547, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 35.93130779392338, |
|
"grad_norm": 1.6144860982894897, |
|
"learning_rate": 7.261892250434568e-06, |
|
"loss": 0.0516, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 36.06340819022457, |
|
"grad_norm": 1.1599304676055908, |
|
"learning_rate": 7.243118927483657e-06, |
|
"loss": 0.0502, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 36.19550858652576, |
|
"grad_norm": 1.043449878692627, |
|
"learning_rate": 7.22430593014791e-06, |
|
"loss": 0.0472, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 36.32760898282695, |
|
"grad_norm": 1.1489434242248535, |
|
"learning_rate": 7.205453591175666e-06, |
|
"loss": 0.0558, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 36.32760898282695, |
|
"eval_bleu": 74.20946250489693, |
|
"eval_char_accuracy": 65.89436996216483, |
|
"eval_loss": 0.08468983322381973, |
|
"eval_runtime": 314.7938, |
|
"eval_samples_per_second": 4.809, |
|
"eval_steps_per_second": 0.604, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 36.32760898282695, |
|
"step": 27500, |
|
"total_flos": 7035660725649408.0, |
|
"train_loss": 0.282735899699818, |
|
"train_runtime": 21727.9767, |
|
"train_samples_per_second": 55.735, |
|
"train_steps_per_second": 3.484 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 75700, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 3 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7035660725649408.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|