aramaic-diacritization-model / trainer_state.json
johnlockejrr's picture
Upload 15 files
50b10aa verified
{
"best_global_step": 26000,
"best_metric": 74.62046528623556,
"best_model_checkpoint": "./aramaic_diacritization_model_deep/checkpoint-26000",
"epoch": 36.32760898282695,
"eval_steps": 500,
"global_step": 27500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.13210039630118892,
"grad_norm": 49.535423278808594,
"learning_rate": 9.400000000000001e-07,
"loss": 13.2291,
"step": 100
},
{
"epoch": 0.26420079260237783,
"grad_norm": 8.345534324645996,
"learning_rate": 1.94e-06,
"loss": 4.9005,
"step": 200
},
{
"epoch": 0.3963011889035667,
"grad_norm": 6.039630889892578,
"learning_rate": 2.9400000000000002e-06,
"loss": 3.1719,
"step": 300
},
{
"epoch": 0.5284015852047557,
"grad_norm": 5.1691412925720215,
"learning_rate": 3.94e-06,
"loss": 2.6108,
"step": 400
},
{
"epoch": 0.6605019815059445,
"grad_norm": 4.929126262664795,
"learning_rate": 4.94e-06,
"loss": 2.2213,
"step": 500
},
{
"epoch": 0.6605019815059445,
"eval_bleu": 0.19172411762066338,
"eval_char_accuracy": 10.030946701760158,
"eval_loss": 1.71331787109375,
"eval_runtime": 308.7303,
"eval_samples_per_second": 4.904,
"eval_steps_per_second": 0.615,
"step": 500
},
{
"epoch": 0.7926023778071334,
"grad_norm": 5.598935604095459,
"learning_rate": 5.94e-06,
"loss": 1.9171,
"step": 600
},
{
"epoch": 0.9247027741083224,
"grad_norm": 5.103219032287598,
"learning_rate": 6.9400000000000005e-06,
"loss": 1.6624,
"step": 700
},
{
"epoch": 1.0568031704095113,
"grad_norm": 4.852541923522949,
"learning_rate": 7.94e-06,
"loss": 1.473,
"step": 800
},
{
"epoch": 1.1889035667107002,
"grad_norm": 4.4410624504089355,
"learning_rate": 8.94e-06,
"loss": 1.3081,
"step": 900
},
{
"epoch": 1.321003963011889,
"grad_norm": 3.9946470260620117,
"learning_rate": 9.940000000000001e-06,
"loss": 1.1709,
"step": 1000
},
{
"epoch": 1.321003963011889,
"eval_bleu": 5.2707215480174545,
"eval_char_accuracy": 18.391696825135714,
"eval_loss": 0.8503363132476807,
"eval_runtime": 326.5655,
"eval_samples_per_second": 4.636,
"eval_steps_per_second": 0.582,
"step": 1000
},
{
"epoch": 1.453104359313078,
"grad_norm": 5.828360557556152,
"learning_rate": 9.99996092907511e-06,
"loss": 1.0774,
"step": 1100
},
{
"epoch": 1.5852047556142668,
"grad_norm": 3.975123643875122,
"learning_rate": 9.999833582267183e-06,
"loss": 0.9803,
"step": 1200
},
{
"epoch": 1.7173051519154559,
"grad_norm": 4.110162258148193,
"learning_rate": 9.999617802644021e-06,
"loss": 0.9023,
"step": 1300
},
{
"epoch": 1.8494055482166445,
"grad_norm": 4.341949462890625,
"learning_rate": 9.999313594022158e-06,
"loss": 0.8494,
"step": 1400
},
{
"epoch": 1.9815059445178336,
"grad_norm": 3.7582991123199463,
"learning_rate": 9.99892096178217e-06,
"loss": 0.7841,
"step": 1500
},
{
"epoch": 1.9815059445178336,
"eval_bleu": 13.790865530076871,
"eval_char_accuracy": 25.990602895213026,
"eval_loss": 0.5515339374542236,
"eval_runtime": 310.3546,
"eval_samples_per_second": 4.878,
"eval_steps_per_second": 0.612,
"step": 1500
},
{
"epoch": 2.1136063408190227,
"grad_norm": 4.066399574279785,
"learning_rate": 9.998439912868608e-06,
"loss": 0.7379,
"step": 1600
},
{
"epoch": 2.2457067371202113,
"grad_norm": 3.739553928375244,
"learning_rate": 9.997870455789855e-06,
"loss": 0.6859,
"step": 1700
},
{
"epoch": 2.3778071334214004,
"grad_norm": 4.019917964935303,
"learning_rate": 9.997212600617986e-06,
"loss": 0.6547,
"step": 1800
},
{
"epoch": 2.509907529722589,
"grad_norm": 3.1273276805877686,
"learning_rate": 9.99646635898858e-06,
"loss": 0.6313,
"step": 1900
},
{
"epoch": 2.642007926023778,
"grad_norm": 3.0856070518493652,
"learning_rate": 9.995631744100536e-06,
"loss": 0.6058,
"step": 2000
},
{
"epoch": 2.642007926023778,
"eval_bleu": 22.141127198903924,
"eval_char_accuracy": 30.873190491857212,
"eval_loss": 0.4170660674571991,
"eval_runtime": 310.921,
"eval_samples_per_second": 4.869,
"eval_steps_per_second": 0.611,
"step": 2000
},
{
"epoch": 2.7741083223249667,
"grad_norm": 3.660649299621582,
"learning_rate": 9.994708770715807e-06,
"loss": 0.5758,
"step": 2100
},
{
"epoch": 2.906208718626156,
"grad_norm": 3.3534188270568848,
"learning_rate": 9.993697455159165e-06,
"loss": 0.5507,
"step": 2200
},
{
"epoch": 3.038309114927345,
"grad_norm": 2.831392526626587,
"learning_rate": 9.992597815317901e-06,
"loss": 0.5334,
"step": 2300
},
{
"epoch": 3.1704095112285335,
"grad_norm": 3.2069804668426514,
"learning_rate": 9.991409870641512e-06,
"loss": 0.508,
"step": 2400
},
{
"epoch": 3.3025099075297226,
"grad_norm": 3.3302793502807617,
"learning_rate": 9.990133642141359e-06,
"loss": 0.4816,
"step": 2500
},
{
"epoch": 3.3025099075297226,
"eval_bleu": 28.8217714543364,
"eval_char_accuracy": 34.30457312057904,
"eval_loss": 0.34086647629737854,
"eval_runtime": 312.1625,
"eval_samples_per_second": 4.85,
"eval_steps_per_second": 0.609,
"step": 2500
},
{
"epoch": 3.4346103038309117,
"grad_norm": 3.0527114868164062,
"learning_rate": 9.988769152390284e-06,
"loss": 0.4779,
"step": 2600
},
{
"epoch": 3.5667107001321003,
"grad_norm": 2.557722568511963,
"learning_rate": 9.987316425522226e-06,
"loss": 0.4626,
"step": 2700
},
{
"epoch": 3.6988110964332894,
"grad_norm": 2.993014097213745,
"learning_rate": 9.985775487231788e-06,
"loss": 0.4452,
"step": 2800
},
{
"epoch": 3.830911492734478,
"grad_norm": 2.7321043014526367,
"learning_rate": 9.984146364773777e-06,
"loss": 0.4408,
"step": 2900
},
{
"epoch": 3.963011889035667,
"grad_norm": 2.8790836334228516,
"learning_rate": 9.982429086962729e-06,
"loss": 0.4108,
"step": 3000
},
{
"epoch": 3.963011889035667,
"eval_bleu": 33.52667574407562,
"eval_char_accuracy": 37.42289027800625,
"eval_loss": 0.29226553440093994,
"eval_runtime": 316.4732,
"eval_samples_per_second": 4.784,
"eval_steps_per_second": 0.6,
"step": 3000
},
{
"epoch": 4.095112285336856,
"grad_norm": 2.8862805366516113,
"learning_rate": 9.980623684172396e-06,
"loss": 0.4134,
"step": 3100
},
{
"epoch": 4.227212681638045,
"grad_norm": 2.4621527194976807,
"learning_rate": 9.978730188335215e-06,
"loss": 0.3919,
"step": 3200
},
{
"epoch": 4.359313077939234,
"grad_norm": 2.822957992553711,
"learning_rate": 9.976748632941733e-06,
"loss": 0.384,
"step": 3300
},
{
"epoch": 4.491413474240423,
"grad_norm": 2.448110818862915,
"learning_rate": 9.974679053040018e-06,
"loss": 0.3735,
"step": 3400
},
{
"epoch": 4.623513870541611,
"grad_norm": 2.2914109230041504,
"learning_rate": 9.972521485235045e-06,
"loss": 0.3604,
"step": 3500
},
{
"epoch": 4.623513870541611,
"eval_bleu": 37.45556816331904,
"eval_char_accuracy": 40.02354416844876,
"eval_loss": 0.25823718309402466,
"eval_runtime": 320.7507,
"eval_samples_per_second": 4.72,
"eval_steps_per_second": 0.592,
"step": 3500
},
{
"epoch": 4.755614266842801,
"grad_norm": 2.5924477577209473,
"learning_rate": 9.970275967688047e-06,
"loss": 0.3624,
"step": 3600
},
{
"epoch": 4.887714663143989,
"grad_norm": 2.49125075340271,
"learning_rate": 9.967942540115829e-06,
"loss": 0.3508,
"step": 3700
},
{
"epoch": 5.019815059445178,
"grad_norm": 2.464569330215454,
"learning_rate": 9.965521243790079e-06,
"loss": 0.3355,
"step": 3800
},
{
"epoch": 5.1519154557463676,
"grad_norm": 2.6026785373687744,
"learning_rate": 9.963012121536635e-06,
"loss": 0.3284,
"step": 3900
},
{
"epoch": 5.284015852047556,
"grad_norm": 2.351313591003418,
"learning_rate": 9.96041521773472e-06,
"loss": 0.328,
"step": 4000
},
{
"epoch": 5.284015852047556,
"eval_bleu": 40.737027833366824,
"eval_char_accuracy": 42.49156933706202,
"eval_loss": 0.23189863562583923,
"eval_runtime": 320.7889,
"eval_samples_per_second": 4.72,
"eval_steps_per_second": 0.592,
"step": 4000
},
{
"epoch": 5.416116248348745,
"grad_norm": 3.0100066661834717,
"learning_rate": 9.95773057831617e-06,
"loss": 0.311,
"step": 4100
},
{
"epoch": 5.5482166446499335,
"grad_norm": 2.049722671508789,
"learning_rate": 9.954958250764604e-06,
"loss": 0.3136,
"step": 4200
},
{
"epoch": 5.680317040951123,
"grad_norm": 2.1132702827453613,
"learning_rate": 9.952098284114604e-06,
"loss": 0.3,
"step": 4300
},
{
"epoch": 5.812417437252312,
"grad_norm": 2.174574613571167,
"learning_rate": 9.949150728950833e-06,
"loss": 0.3093,
"step": 4400
},
{
"epoch": 5.9445178335535,
"grad_norm": 2.8350446224212646,
"learning_rate": 9.946115637407145e-06,
"loss": 0.2988,
"step": 4500
},
{
"epoch": 5.9445178335535,
"eval_bleu": 43.8559984358133,
"eval_char_accuracy": 44.56273646981411,
"eval_loss": 0.21132107079029083,
"eval_runtime": 319.4845,
"eval_samples_per_second": 4.739,
"eval_steps_per_second": 0.595,
"step": 4500
},
{
"epoch": 6.07661822985469,
"grad_norm": 2.289716958999634,
"learning_rate": 9.94299306316567e-06,
"loss": 0.2938,
"step": 4600
},
{
"epoch": 6.208718626155878,
"grad_norm": 2.9307034015655518,
"learning_rate": 9.939783061455845e-06,
"loss": 0.2814,
"step": 4700
},
{
"epoch": 6.340819022457067,
"grad_norm": 2.3431613445281982,
"learning_rate": 9.936485689053462e-06,
"loss": 0.2782,
"step": 4800
},
{
"epoch": 6.472919418758257,
"grad_norm": 2.2339768409729004,
"learning_rate": 9.933101004279647e-06,
"loss": 0.2752,
"step": 4900
},
{
"epoch": 6.605019815059445,
"grad_norm": 2.076145887374878,
"learning_rate": 9.92962906699983e-06,
"loss": 0.265,
"step": 5000
},
{
"epoch": 6.605019815059445,
"eval_bleu": 47.09645003475602,
"eval_char_accuracy": 46.29205050172725,
"eval_loss": 0.1954895406961441,
"eval_runtime": 322.9917,
"eval_samples_per_second": 4.687,
"eval_steps_per_second": 0.588,
"step": 5000
},
{
"epoch": 6.737120211360634,
"grad_norm": 1.715720295906067,
"learning_rate": 9.926069938622698e-06,
"loss": 0.266,
"step": 5100
},
{
"epoch": 6.869220607661823,
"grad_norm": 2.501234531402588,
"learning_rate": 9.922423682099088e-06,
"loss": 0.2633,
"step": 5200
},
{
"epoch": 7.001321003963012,
"grad_norm": 1.929123044013977,
"learning_rate": 9.918690361920898e-06,
"loss": 0.2584,
"step": 5300
},
{
"epoch": 7.133421400264201,
"grad_norm": 2.0370264053344727,
"learning_rate": 9.914870044119924e-06,
"loss": 0.2451,
"step": 5400
},
{
"epoch": 7.265521796565389,
"grad_norm": 2.3562278747558594,
"learning_rate": 9.91096279626671e-06,
"loss": 0.2476,
"step": 5500
},
{
"epoch": 7.265521796565389,
"eval_bleu": 48.90758910970442,
"eval_char_accuracy": 46.93308932390195,
"eval_loss": 0.18339309096336365,
"eval_runtime": 328.302,
"eval_samples_per_second": 4.612,
"eval_steps_per_second": 0.579,
"step": 5500
},
{
"epoch": 7.397622192866579,
"grad_norm": 2.410529613494873,
"learning_rate": 9.90696868746934e-06,
"loss": 0.2419,
"step": 5600
},
{
"epoch": 7.5297225891677675,
"grad_norm": 1.685939908027649,
"learning_rate": 9.902887788372223e-06,
"loss": 0.2448,
"step": 5700
},
{
"epoch": 7.661822985468956,
"grad_norm": 2.3180549144744873,
"learning_rate": 9.89872017115484e-06,
"loss": 0.2379,
"step": 5800
},
{
"epoch": 7.793923381770146,
"grad_norm": 2.4159021377563477,
"learning_rate": 9.894465909530471e-06,
"loss": 0.2339,
"step": 5900
},
{
"epoch": 7.926023778071334,
"grad_norm": 2.0757477283477783,
"learning_rate": 9.890125078744884e-06,
"loss": 0.2356,
"step": 6000
},
{
"epoch": 7.926023778071334,
"eval_bleu": 51.273473767746786,
"eval_char_accuracy": 49.088563086033886,
"eval_loss": 0.1728673279285431,
"eval_runtime": 317.183,
"eval_samples_per_second": 4.773,
"eval_steps_per_second": 0.599,
"step": 6000
},
{
"epoch": 8.058124174372523,
"grad_norm": 1.9405860900878906,
"learning_rate": 9.885697755575015e-06,
"loss": 0.2251,
"step": 6100
},
{
"epoch": 8.190224570673712,
"grad_norm": 1.7473342418670654,
"learning_rate": 9.881184018327597e-06,
"loss": 0.2195,
"step": 6200
},
{
"epoch": 8.3223249669749,
"grad_norm": 1.7633724212646484,
"learning_rate": 9.876583946837787e-06,
"loss": 0.219,
"step": 6300
},
{
"epoch": 8.45442536327609,
"grad_norm": 2.1117053031921387,
"learning_rate": 9.871897622467748e-06,
"loss": 0.2148,
"step": 6400
},
{
"epoch": 8.58652575957728,
"grad_norm": 2.114854574203491,
"learning_rate": 9.867125128105211e-06,
"loss": 0.2222,
"step": 6500
},
{
"epoch": 8.58652575957728,
"eval_bleu": 53.15738681385049,
"eval_char_accuracy": 49.680251686132586,
"eval_loss": 0.1632871925830841,
"eval_runtime": 325.9103,
"eval_samples_per_second": 4.645,
"eval_steps_per_second": 0.583,
"step": 6500
},
{
"epoch": 8.718626155878468,
"grad_norm": 2.6163322925567627,
"learning_rate": 9.862266548162008e-06,
"loss": 0.2141,
"step": 6600
},
{
"epoch": 8.850726552179657,
"grad_norm": 2.1705501079559326,
"learning_rate": 9.857321968572577e-06,
"loss": 0.2126,
"step": 6700
},
{
"epoch": 8.982826948480845,
"grad_norm": 2.4352259635925293,
"learning_rate": 9.85229147679245e-06,
"loss": 0.2124,
"step": 6800
},
{
"epoch": 9.114927344782034,
"grad_norm": 1.912975788116455,
"learning_rate": 9.847175161796696e-06,
"loss": 0.2032,
"step": 6900
},
{
"epoch": 9.247027741083222,
"grad_norm": 1.7575359344482422,
"learning_rate": 9.841973114078358e-06,
"loss": 0.2005,
"step": 7000
},
{
"epoch": 9.247027741083222,
"eval_bleu": 54.81436388056976,
"eval_char_accuracy": 50.20562592531667,
"eval_loss": 0.1546466052532196,
"eval_runtime": 319.5902,
"eval_samples_per_second": 4.737,
"eval_steps_per_second": 0.595,
"step": 7000
},
{
"epoch": 9.379128137384413,
"grad_norm": 1.6994798183441162,
"learning_rate": 9.836685425646842e-06,
"loss": 0.1929,
"step": 7100
},
{
"epoch": 9.511228533685602,
"grad_norm": 1.8375500440597534,
"learning_rate": 9.831312190026295e-06,
"loss": 0.1954,
"step": 7200
},
{
"epoch": 9.64332892998679,
"grad_norm": 2.735320568084717,
"learning_rate": 9.825853502253951e-06,
"loss": 0.1949,
"step": 7300
},
{
"epoch": 9.775429326287979,
"grad_norm": 1.9880143404006958,
"learning_rate": 9.820309458878447e-06,
"loss": 0.196,
"step": 7400
},
{
"epoch": 9.907529722589167,
"grad_norm": 3.1160881519317627,
"learning_rate": 9.814680157958122e-06,
"loss": 0.1957,
"step": 7500
},
{
"epoch": 9.907529722589167,
"eval_bleu": 56.684180257446236,
"eval_char_accuracy": 51.796656522454356,
"eval_loss": 0.14744216203689575,
"eval_runtime": 318.7305,
"eval_samples_per_second": 4.75,
"eval_steps_per_second": 0.596,
"step": 7500
},
{
"epoch": 10.039630118890356,
"grad_norm": 1.975994348526001,
"learning_rate": 9.808965699059276e-06,
"loss": 0.1964,
"step": 7600
},
{
"epoch": 10.171730515191545,
"grad_norm": 1.6857510805130005,
"learning_rate": 9.80316618325441e-06,
"loss": 0.1832,
"step": 7700
},
{
"epoch": 10.303830911492735,
"grad_norm": 1.6473827362060547,
"learning_rate": 9.797281713120438e-06,
"loss": 0.1846,
"step": 7800
},
{
"epoch": 10.435931307793924,
"grad_norm": 2.1330363750457764,
"learning_rate": 9.79131239273688e-06,
"loss": 0.1783,
"step": 7900
},
{
"epoch": 10.568031704095112,
"grad_norm": 2.0598771572113037,
"learning_rate": 9.785258327684007e-06,
"loss": 0.183,
"step": 8000
},
{
"epoch": 10.568031704095112,
"eval_bleu": 57.92928420103779,
"eval_char_accuracy": 51.580235236058556,
"eval_loss": 0.1413801610469818,
"eval_runtime": 318.3694,
"eval_samples_per_second": 4.755,
"eval_steps_per_second": 0.597,
"step": 8000
},
{
"epoch": 10.700132100396301,
"grad_norm": 1.8669029474258423,
"learning_rate": 9.779119625040988e-06,
"loss": 0.1801,
"step": 8100
},
{
"epoch": 10.83223249669749,
"grad_norm": 1.967623233795166,
"learning_rate": 9.772896393383991e-06,
"loss": 0.1772,
"step": 8200
},
{
"epoch": 10.964332892998678,
"grad_norm": 2.0888118743896484,
"learning_rate": 9.766588742784255e-06,
"loss": 0.1741,
"step": 8300
},
{
"epoch": 11.096433289299869,
"grad_norm": 1.8615264892578125,
"learning_rate": 9.760196784806155e-06,
"loss": 0.1733,
"step": 8400
},
{
"epoch": 11.228533685601057,
"grad_norm": 1.786023736000061,
"learning_rate": 9.753720632505219e-06,
"loss": 0.171,
"step": 8500
},
{
"epoch": 11.228533685601057,
"eval_bleu": 58.97794662773473,
"eval_char_accuracy": 53.53111120250041,
"eval_loss": 0.13721999526023865,
"eval_runtime": 310.3442,
"eval_samples_per_second": 4.878,
"eval_steps_per_second": 0.612,
"step": 8500
},
{
"epoch": 11.360634081902246,
"grad_norm": 2.22037935256958,
"learning_rate": 9.74716040042614e-06,
"loss": 0.169,
"step": 8600
},
{
"epoch": 11.492734478203435,
"grad_norm": 2.2769389152526855,
"learning_rate": 9.740516204600734e-06,
"loss": 0.1631,
"step": 8700
},
{
"epoch": 11.624834874504623,
"grad_norm": 2.1513566970825195,
"learning_rate": 9.733788162545902e-06,
"loss": 0.1669,
"step": 8800
},
{
"epoch": 11.756935270805812,
"grad_norm": 1.640358328819275,
"learning_rate": 9.726976393261547e-06,
"loss": 0.1674,
"step": 8900
},
{
"epoch": 11.889035667107,
"grad_norm": 1.6976934671401978,
"learning_rate": 9.720081017228462e-06,
"loss": 0.1646,
"step": 9000
},
{
"epoch": 11.889035667107,
"eval_bleu": 60.38328284328657,
"eval_char_accuracy": 54.70215084717881,
"eval_loss": 0.13088105618953705,
"eval_runtime": 316.5706,
"eval_samples_per_second": 4.783,
"eval_steps_per_second": 0.6,
"step": 9000
},
{
"epoch": 12.021136063408191,
"grad_norm": 1.7420779466629028,
"learning_rate": 9.713102156406213e-06,
"loss": 0.1629,
"step": 9100
},
{
"epoch": 12.15323645970938,
"grad_norm": 1.9723796844482422,
"learning_rate": 9.706039934230967e-06,
"loss": 0.1578,
"step": 9200
},
{
"epoch": 12.285336856010568,
"grad_norm": 2.3517324924468994,
"learning_rate": 9.698894475613323e-06,
"loss": 0.1561,
"step": 9300
},
{
"epoch": 12.417437252311757,
"grad_norm": 1.5132865905761719,
"learning_rate": 9.691665906936088e-06,
"loss": 0.157,
"step": 9400
},
{
"epoch": 12.549537648612946,
"grad_norm": 2.435624599456787,
"learning_rate": 9.684354356052055e-06,
"loss": 0.1538,
"step": 9500
},
{
"epoch": 12.549537648612946,
"eval_bleu": 61.08870144349316,
"eval_char_accuracy": 55.10055107747986,
"eval_loss": 0.12675440311431885,
"eval_runtime": 314.3152,
"eval_samples_per_second": 4.817,
"eval_steps_per_second": 0.604,
"step": 9500
},
{
"epoch": 12.681638044914134,
"grad_norm": 2.092256784439087,
"learning_rate": 9.676959952281733e-06,
"loss": 0.1518,
"step": 9600
},
{
"epoch": 12.813738441215325,
"grad_norm": 1.7985179424285889,
"learning_rate": 9.669482826411065e-06,
"loss": 0.158,
"step": 9700
},
{
"epoch": 12.945838837516513,
"grad_norm": 1.7571889162063599,
"learning_rate": 9.66192311068911e-06,
"loss": 0.152,
"step": 9800
},
{
"epoch": 13.077939233817702,
"grad_norm": 1.3526843786239624,
"learning_rate": 9.654280938825705e-06,
"loss": 0.1426,
"step": 9900
},
{
"epoch": 13.21003963011889,
"grad_norm": 1.6651784181594849,
"learning_rate": 9.646556445989106e-06,
"loss": 0.1476,
"step": 10000
},
{
"epoch": 13.21003963011889,
"eval_bleu": 61.91608356246592,
"eval_char_accuracy": 55.006477216647475,
"eval_loss": 0.12389995902776718,
"eval_runtime": 315.2745,
"eval_samples_per_second": 4.802,
"eval_steps_per_second": 0.603,
"step": 10000
},
{
"epoch": 13.34214002642008,
"grad_norm": 1.5739635229110718,
"learning_rate": 9.63874976880359e-06,
"loss": 0.148,
"step": 10100
},
{
"epoch": 13.474240422721268,
"grad_norm": 1.779477834701538,
"learning_rate": 9.63086104534704e-06,
"loss": 0.1469,
"step": 10200
},
{
"epoch": 13.606340819022456,
"grad_norm": 1.5422449111938477,
"learning_rate": 9.622890415148505e-06,
"loss": 0.143,
"step": 10300
},
{
"epoch": 13.738441215323647,
"grad_norm": 1.80446457862854,
"learning_rate": 9.61483801918573e-06,
"loss": 0.1424,
"step": 10400
},
{
"epoch": 13.870541611624835,
"grad_norm": 1.7641818523406982,
"learning_rate": 9.606703999882667e-06,
"loss": 0.1406,
"step": 10500
},
{
"epoch": 13.870541611624835,
"eval_bleu": 62.97532147351367,
"eval_char_accuracy": 56.364636453364035,
"eval_loss": 0.12048687040805817,
"eval_runtime": 315.4459,
"eval_samples_per_second": 4.8,
"eval_steps_per_second": 0.602,
"step": 10500
},
{
"epoch": 14.002642007926024,
"grad_norm": 1.5506337881088257,
"learning_rate": 9.598488501106947e-06,
"loss": 0.1436,
"step": 10600
},
{
"epoch": 14.134742404227213,
"grad_norm": 1.338537335395813,
"learning_rate": 9.590191668167343e-06,
"loss": 0.1396,
"step": 10700
},
{
"epoch": 14.266842800528401,
"grad_norm": 1.9432475566864014,
"learning_rate": 9.581813647811199e-06,
"loss": 0.1427,
"step": 10800
},
{
"epoch": 14.39894319682959,
"grad_norm": 1.8222265243530273,
"learning_rate": 9.573354588221833e-06,
"loss": 0.1352,
"step": 10900
},
{
"epoch": 14.531043593130779,
"grad_norm": 1.7141766548156738,
"learning_rate": 9.564814639015915e-06,
"loss": 0.1361,
"step": 11000
},
{
"epoch": 14.531043593130779,
"eval_bleu": 63.45481969831487,
"eval_char_accuracy": 56.61755634150354,
"eval_loss": 0.11702162027359009,
"eval_runtime": 318.0259,
"eval_samples_per_second": 4.761,
"eval_steps_per_second": 0.597,
"step": 11000
},
{
"epoch": 14.663143989431969,
"grad_norm": 1.473866581916809,
"learning_rate": 9.556193951240821e-06,
"loss": 0.1302,
"step": 11100
},
{
"epoch": 14.795244385733158,
"grad_norm": 1.3840774297714233,
"learning_rate": 9.547492677371968e-06,
"loss": 0.1355,
"step": 11200
},
{
"epoch": 14.927344782034346,
"grad_norm": 1.8060030937194824,
"learning_rate": 9.538710971310104e-06,
"loss": 0.1332,
"step": 11300
},
{
"epoch": 15.059445178335535,
"grad_norm": 1.8242031335830688,
"learning_rate": 9.529848988378597e-06,
"loss": 0.1247,
"step": 11400
},
{
"epoch": 15.191545574636724,
"grad_norm": 1.3062950372695923,
"learning_rate": 9.520906885320682e-06,
"loss": 0.1295,
"step": 11500
},
{
"epoch": 15.191545574636724,
"eval_bleu": 64.22910948855701,
"eval_char_accuracy": 57.40099111696003,
"eval_loss": 0.11363621801137924,
"eval_runtime": 315.2122,
"eval_samples_per_second": 4.803,
"eval_steps_per_second": 0.603,
"step": 11500
},
{
"epoch": 15.323645970937912,
"grad_norm": 1.5021114349365234,
"learning_rate": 9.511884820296695e-06,
"loss": 0.1292,
"step": 11600
},
{
"epoch": 15.455746367239101,
"grad_norm": 1.8947999477386475,
"learning_rate": 9.502782952881268e-06,
"loss": 0.128,
"step": 11700
},
{
"epoch": 15.587846763540291,
"grad_norm": 1.5116643905639648,
"learning_rate": 9.493601444060514e-06,
"loss": 0.1276,
"step": 11800
},
{
"epoch": 15.71994715984148,
"grad_norm": 1.5134871006011963,
"learning_rate": 9.48434045622917e-06,
"loss": 0.132,
"step": 11900
},
{
"epoch": 15.852047556142669,
"grad_norm": 1.7438267469406128,
"learning_rate": 9.475000153187733e-06,
"loss": 0.1243,
"step": 12000
},
{
"epoch": 15.852047556142669,
"eval_bleu": 65.05289431113529,
"eval_char_accuracy": 57.989081263365684,
"eval_loss": 0.1115257665514946,
"eval_runtime": 309.8437,
"eval_samples_per_second": 4.886,
"eval_steps_per_second": 0.613,
"step": 12000
},
{
"epoch": 15.984147952443857,
"grad_norm": 1.913855791091919,
"learning_rate": 9.46558070013956e-06,
"loss": 0.1261,
"step": 12100
},
{
"epoch": 16.116248348745046,
"grad_norm": 1.4257742166519165,
"learning_rate": 9.456082263687946e-06,
"loss": 0.117,
"step": 12200
},
{
"epoch": 16.248348745046236,
"grad_norm": 1.49489164352417,
"learning_rate": 9.44650501183318e-06,
"loss": 0.1232,
"step": 12300
},
{
"epoch": 16.380449141347423,
"grad_norm": 1.7977708578109741,
"learning_rate": 9.436849113969567e-06,
"loss": 0.1212,
"step": 12400
},
{
"epoch": 16.512549537648614,
"grad_norm": 1.2978798151016235,
"learning_rate": 9.427212472501483e-06,
"loss": 0.122,
"step": 12500
},
{
"epoch": 16.512549537648614,
"eval_bleu": 65.59844085659223,
"eval_char_accuracy": 58.32784997532489,
"eval_loss": 0.10947112739086151,
"eval_runtime": 332.8226,
"eval_samples_per_second": 4.549,
"eval_steps_per_second": 0.571,
"step": 12500
},
{
"epoch": 16.6446499339498,
"grad_norm": 1.5628472566604614,
"learning_rate": 9.417400578537868e-06,
"loss": 0.1219,
"step": 12600
},
{
"epoch": 16.77675033025099,
"grad_norm": 1.5304769277572632,
"learning_rate": 9.407510553339931e-06,
"loss": 0.1192,
"step": 12700
},
{
"epoch": 16.90885072655218,
"grad_norm": 1.6370787620544434,
"learning_rate": 9.397542571834054e-06,
"loss": 0.1181,
"step": 12800
},
{
"epoch": 17.040951122853368,
"grad_norm": 1.5388678312301636,
"learning_rate": 9.387496810325436e-06,
"loss": 0.1137,
"step": 12900
},
{
"epoch": 17.17305151915456,
"grad_norm": 1.8251888751983643,
"learning_rate": 9.377373446494984e-06,
"loss": 0.1122,
"step": 13000
},
{
"epoch": 17.17305151915456,
"eval_bleu": 66.79852257149867,
"eval_char_accuracy": 58.47692877117947,
"eval_loss": 0.1069813147187233,
"eval_runtime": 350.9901,
"eval_samples_per_second": 4.314,
"eval_steps_per_second": 0.541,
"step": 13000
},
{
"epoch": 17.305151915455745,
"grad_norm": 1.6730031967163086,
"learning_rate": 9.367172659396172e-06,
"loss": 0.1123,
"step": 13100
},
{
"epoch": 17.437252311756936,
"grad_norm": 1.2402830123901367,
"learning_rate": 9.35689462945187e-06,
"loss": 0.1125,
"step": 13200
},
{
"epoch": 17.569352708058123,
"grad_norm": 1.9425466060638428,
"learning_rate": 9.34653953845115e-06,
"loss": 0.1109,
"step": 13300
},
{
"epoch": 17.701453104359313,
"grad_norm": 3.105457067489624,
"learning_rate": 9.33610756954608e-06,
"loss": 0.1161,
"step": 13400
},
{
"epoch": 17.833553500660503,
"grad_norm": 1.679442048072815,
"learning_rate": 9.325598907248478e-06,
"loss": 0.1131,
"step": 13500
},
{
"epoch": 17.833553500660503,
"eval_bleu": 66.72469802657548,
"eval_char_accuracy": 59.26910264846191,
"eval_loss": 0.10426344722509384,
"eval_runtime": 353.0152,
"eval_samples_per_second": 4.289,
"eval_steps_per_second": 0.538,
"step": 13500
},
{
"epoch": 17.96565389696169,
"grad_norm": 1.1806349754333496,
"learning_rate": 9.315013737426645e-06,
"loss": 0.115,
"step": 13600
},
{
"epoch": 18.09775429326288,
"grad_norm": 1.5437259674072266,
"learning_rate": 9.304352247302091e-06,
"loss": 0.1071,
"step": 13700
},
{
"epoch": 18.229854689564068,
"grad_norm": 3.9584083557128906,
"learning_rate": 9.293614625446205e-06,
"loss": 0.11,
"step": 13800
},
{
"epoch": 18.361955085865258,
"grad_norm": 1.4071292877197266,
"learning_rate": 9.282801061776937e-06,
"loss": 0.1093,
"step": 13900
},
{
"epoch": 18.494055482166445,
"grad_norm": 1.599787950515747,
"learning_rate": 9.271911747555425e-06,
"loss": 0.1057,
"step": 14000
},
{
"epoch": 18.494055482166445,
"eval_bleu": 67.26763967365643,
"eval_char_accuracy": 58.966318473433134,
"eval_loss": 0.10266197472810745,
"eval_runtime": 318.6046,
"eval_samples_per_second": 4.752,
"eval_steps_per_second": 0.596,
"step": 14000
},
{
"epoch": 18.626155878467635,
"grad_norm": 1.3243365287780762,
"learning_rate": 9.260946875382624e-06,
"loss": 0.1054,
"step": 14100
},
{
"epoch": 18.758256274768826,
"grad_norm": 1.8453656435012817,
"learning_rate": 9.249906639195894e-06,
"loss": 0.1096,
"step": 14200
},
{
"epoch": 18.890356671070013,
"grad_norm": 1.2308754920959473,
"learning_rate": 9.238791234265565e-06,
"loss": 0.1045,
"step": 14300
},
{
"epoch": 19.022457067371203,
"grad_norm": 1.3015666007995605,
"learning_rate": 9.22760085719149e-06,
"loss": 0.1065,
"step": 14400
},
{
"epoch": 19.15455746367239,
"grad_norm": 1.3568576574325562,
"learning_rate": 9.21633570589957e-06,
"loss": 0.1024,
"step": 14500
},
{
"epoch": 19.15455746367239,
"eval_bleu": 67.65790078495372,
"eval_char_accuracy": 59.99547622964303,
"eval_loss": 0.10274580866098404,
"eval_runtime": 317.6719,
"eval_samples_per_second": 4.766,
"eval_steps_per_second": 0.598,
"step": 14500
},
{
"epoch": 19.28665785997358,
"grad_norm": 1.6260790824890137,
"learning_rate": 9.204995979638241e-06,
"loss": 0.1025,
"step": 14600
},
{
"epoch": 19.418758256274767,
"grad_norm": 1.1852062940597534,
"learning_rate": 9.193581878974964e-06,
"loss": 0.101,
"step": 14700
},
{
"epoch": 19.550858652575958,
"grad_norm": 2.5521767139434814,
"learning_rate": 9.18209360579267e-06,
"loss": 0.0999,
"step": 14800
},
{
"epoch": 19.682959048877148,
"grad_norm": 1.0231155157089233,
"learning_rate": 9.17053136328619e-06,
"loss": 0.1055,
"step": 14900
},
{
"epoch": 19.815059445178335,
"grad_norm": 1.3518396615982056,
"learning_rate": 9.15889535595866e-06,
"loss": 0.1014,
"step": 15000
},
{
"epoch": 19.815059445178335,
"eval_bleu": 68.43528395628573,
"eval_char_accuracy": 61.64870866918901,
"eval_loss": 0.09970895200967789,
"eval_runtime": 314.0763,
"eval_samples_per_second": 4.82,
"eval_steps_per_second": 0.605,
"step": 15000
},
{
"epoch": 19.947159841479525,
"grad_norm": 1.2678903341293335,
"learning_rate": 9.147185789617907e-06,
"loss": 0.1005,
"step": 15100
},
{
"epoch": 20.079260237780712,
"grad_norm": 1.2439242601394653,
"learning_rate": 9.13540287137281e-06,
"loss": 0.0989,
"step": 15200
},
{
"epoch": 20.211360634081903,
"grad_norm": 1.3169798851013184,
"learning_rate": 9.123546809629632e-06,
"loss": 0.1006,
"step": 15300
},
{
"epoch": 20.34346103038309,
"grad_norm": 1.3063526153564453,
"learning_rate": 9.111617814088332e-06,
"loss": 0.0966,
"step": 15400
},
{
"epoch": 20.47556142668428,
"grad_norm": 1.8397212028503418,
"learning_rate": 9.099616095738867e-06,
"loss": 0.0965,
"step": 15500
},
{
"epoch": 20.47556142668428,
"eval_bleu": 68.40618966106221,
"eval_char_accuracy": 60.30751357131107,
"eval_loss": 0.09965521842241287,
"eval_runtime": 315.776,
"eval_samples_per_second": 4.795,
"eval_steps_per_second": 0.602,
"step": 15500
},
{
"epoch": 20.60766182298547,
"grad_norm": 1.544118046760559,
"learning_rate": 9.087541866857453e-06,
"loss": 0.0954,
"step": 15600
},
{
"epoch": 20.739762219286657,
"grad_norm": 1.5378237962722778,
"learning_rate": 9.075395341002804e-06,
"loss": 0.0975,
"step": 15700
},
{
"epoch": 20.871862615587848,
"grad_norm": 1.0197510719299316,
"learning_rate": 9.06317673301237e-06,
"loss": 0.0964,
"step": 15800
},
{
"epoch": 21.003963011889034,
"grad_norm": 0.9621543884277344,
"learning_rate": 9.05088625899852e-06,
"loss": 0.0925,
"step": 15900
},
{
"epoch": 21.136063408190225,
"grad_norm": 1.356550931930542,
"learning_rate": 9.038524136344736e-06,
"loss": 0.0917,
"step": 16000
},
{
"epoch": 21.136063408190225,
"eval_bleu": 69.05043210174456,
"eval_char_accuracy": 60.879153643691396,
"eval_loss": 0.0960288867354393,
"eval_runtime": 317.6216,
"eval_samples_per_second": 4.767,
"eval_steps_per_second": 0.598,
"step": 16000
},
{
"epoch": 21.268163804491415,
"grad_norm": 2.0014472007751465,
"learning_rate": 9.026090583701755e-06,
"loss": 0.0962,
"step": 16100
},
{
"epoch": 21.400264200792602,
"grad_norm": 1.4762715101242065,
"learning_rate": 9.013585820983713e-06,
"loss": 0.0917,
"step": 16200
},
{
"epoch": 21.532364597093792,
"grad_norm": 1.242245078086853,
"learning_rate": 9.001010069364241e-06,
"loss": 0.0907,
"step": 16300
},
{
"epoch": 21.66446499339498,
"grad_norm": 1.9390721321105957,
"learning_rate": 8.98836355127257e-06,
"loss": 0.0918,
"step": 16400
},
{
"epoch": 21.79656538969617,
"grad_norm": 1.070357322692871,
"learning_rate": 8.975646490389581e-06,
"loss": 0.0903,
"step": 16500
},
{
"epoch": 21.79656538969617,
"eval_bleu": 69.68531857632678,
"eval_char_accuracy": 62.48920463892087,
"eval_loss": 0.09554192423820496,
"eval_runtime": 335.0637,
"eval_samples_per_second": 4.519,
"eval_steps_per_second": 0.567,
"step": 16500
},
{
"epoch": 21.928665785997357,
"grad_norm": 1.8990777730941772,
"learning_rate": 8.962859111643862e-06,
"loss": 0.0946,
"step": 16600
},
{
"epoch": 22.060766182298547,
"grad_norm": 1.6244333982467651,
"learning_rate": 8.950001641207719e-06,
"loss": 0.0895,
"step": 16700
},
{
"epoch": 22.192866578599737,
"grad_norm": 1.6511205434799194,
"learning_rate": 8.937074306493187e-06,
"loss": 0.0907,
"step": 16800
},
{
"epoch": 22.324966974900924,
"grad_norm": 1.421342372894287,
"learning_rate": 8.924077336147992e-06,
"loss": 0.0864,
"step": 16900
},
{
"epoch": 22.457067371202115,
"grad_norm": 1.572800874710083,
"learning_rate": 8.911010960051522e-06,
"loss": 0.088,
"step": 17000
},
{
"epoch": 22.457067371202115,
"eval_bleu": 69.94746089002493,
"eval_char_accuracy": 61.5572051324231,
"eval_loss": 0.09443064033985138,
"eval_runtime": 345.5738,
"eval_samples_per_second": 4.381,
"eval_steps_per_second": 0.55,
"step": 17000
},
{
"epoch": 22.5891677675033,
"grad_norm": 1.2412383556365967,
"learning_rate": 8.897875409310755e-06,
"loss": 0.085,
"step": 17100
},
{
"epoch": 22.721268163804492,
"grad_norm": 1.4429903030395508,
"learning_rate": 8.884803301685314e-06,
"loss": 0.0908,
"step": 17200
},
{
"epoch": 22.85336856010568,
"grad_norm": 1.1933690309524536,
"learning_rate": 8.871530785794356e-06,
"loss": 0.092,
"step": 17300
},
{
"epoch": 22.98546895640687,
"grad_norm": 1.399186611175537,
"learning_rate": 8.85818979355093e-06,
"loss": 0.0837,
"step": 17400
},
{
"epoch": 23.11756935270806,
"grad_norm": 1.2939783334732056,
"learning_rate": 8.844780560919194e-06,
"loss": 0.0871,
"step": 17500
},
{
"epoch": 23.11756935270806,
"eval_bleu": 70.43896205976631,
"eval_char_accuracy": 61.825032900148045,
"eval_loss": 0.0947960913181305,
"eval_runtime": 324.1925,
"eval_samples_per_second": 4.67,
"eval_steps_per_second": 0.586,
"step": 17500
},
{
"epoch": 23.249669749009247,
"grad_norm": 1.2098222970962524,
"learning_rate": 8.831303325070279e-06,
"loss": 0.0827,
"step": 17600
},
{
"epoch": 23.381770145310437,
"grad_norm": 1.5045851469039917,
"learning_rate": 8.8177583243781e-06,
"loss": 0.0838,
"step": 17700
},
{
"epoch": 23.513870541611624,
"grad_norm": 1.5295897722244263,
"learning_rate": 8.80414579841514e-06,
"loss": 0.0858,
"step": 17800
},
{
"epoch": 23.645970937912814,
"grad_norm": 1.4860461950302124,
"learning_rate": 8.790465987948212e-06,
"loss": 0.0875,
"step": 17900
},
{
"epoch": 23.778071334214,
"grad_norm": 1.4711731672286987,
"learning_rate": 8.776719134934199e-06,
"loss": 0.0828,
"step": 18000
},
{
"epoch": 23.778071334214,
"eval_bleu": 70.5623767627479,
"eval_char_accuracy": 62.505140648132915,
"eval_loss": 0.0924154594540596,
"eval_runtime": 342.2412,
"eval_samples_per_second": 4.424,
"eval_steps_per_second": 0.555,
"step": 18000
},
{
"epoch": 23.91017173051519,
"grad_norm": 1.4447731971740723,
"learning_rate": 8.762905482515775e-06,
"loss": 0.0814,
"step": 18100
},
{
"epoch": 24.042272126816382,
"grad_norm": 1.350907802581787,
"learning_rate": 8.749025275017107e-06,
"loss": 0.0806,
"step": 18200
},
{
"epoch": 24.17437252311757,
"grad_norm": 1.7207551002502441,
"learning_rate": 8.735078757939532e-06,
"loss": 0.08,
"step": 18300
},
{
"epoch": 24.30647291941876,
"grad_norm": 1.0851505994796753,
"learning_rate": 8.721066177957213e-06,
"loss": 0.0779,
"step": 18400
},
{
"epoch": 24.438573315719946,
"grad_norm": 1.2182328701019287,
"learning_rate": 8.70698778291278e-06,
"loss": 0.0814,
"step": 18500
},
{
"epoch": 24.438573315719946,
"eval_bleu": 70.99257164437572,
"eval_char_accuracy": 62.22189093600922,
"eval_loss": 0.09152651578187943,
"eval_runtime": 323.7174,
"eval_samples_per_second": 4.677,
"eval_steps_per_second": 0.587,
"step": 18500
},
{
"epoch": 24.570673712021136,
"grad_norm": 2.0363285541534424,
"learning_rate": 8.69284382181294e-06,
"loss": 0.0821,
"step": 18600
},
{
"epoch": 24.702774108322323,
"grad_norm": 1.3864432573318481,
"learning_rate": 8.67863454482408e-06,
"loss": 0.0809,
"step": 18700
},
{
"epoch": 24.834874504623514,
"grad_norm": 2.032351493835449,
"learning_rate": 8.664360203267838e-06,
"loss": 0.0819,
"step": 18800
},
{
"epoch": 24.966974900924704,
"grad_norm": 1.2007182836532593,
"learning_rate": 8.65002104961666e-06,
"loss": 0.0819,
"step": 18900
},
{
"epoch": 25.09907529722589,
"grad_norm": 1.167693853378296,
"learning_rate": 8.635617337489331e-06,
"loss": 0.0778,
"step": 19000
},
{
"epoch": 25.09907529722589,
"eval_bleu": 70.99070971451881,
"eval_char_accuracy": 63.15440450732028,
"eval_loss": 0.09109245985746384,
"eval_runtime": 331.1976,
"eval_samples_per_second": 4.571,
"eval_steps_per_second": 0.574,
"step": 19000
},
{
"epoch": 25.23117569352708,
"grad_norm": 1.9939295053482056,
"learning_rate": 8.621149321646495e-06,
"loss": 0.076,
"step": 19100
},
{
"epoch": 25.36327608982827,
"grad_norm": 1.148555874824524,
"learning_rate": 8.60661725798614e-06,
"loss": 0.078,
"step": 19200
},
{
"epoch": 25.49537648612946,
"grad_norm": 1.159621238708496,
"learning_rate": 8.592167677001219e-06,
"loss": 0.0823,
"step": 19300
},
{
"epoch": 25.627476882430646,
"grad_norm": 1.7136179208755493,
"learning_rate": 8.57750892397125e-06,
"loss": 0.0755,
"step": 19400
},
{
"epoch": 25.759577278731836,
"grad_norm": 1.105714201927185,
"learning_rate": 8.5627868949981e-06,
"loss": 0.0756,
"step": 19500
},
{
"epoch": 25.759577278731836,
"eval_bleu": 71.15813151741806,
"eval_char_accuracy": 63.43405576575094,
"eval_loss": 0.09030098468065262,
"eval_runtime": 316.3492,
"eval_samples_per_second": 4.786,
"eval_steps_per_second": 0.601,
"step": 19500
},
{
"epoch": 25.891677675033026,
"grad_norm": 1.6091099977493286,
"learning_rate": 8.548001850472529e-06,
"loss": 0.0778,
"step": 19600
},
{
"epoch": 26.023778071334213,
"grad_norm": 1.1730881929397583,
"learning_rate": 8.533154051899864e-06,
"loss": 0.0787,
"step": 19700
},
{
"epoch": 26.155878467635404,
"grad_norm": 1.2703460454940796,
"learning_rate": 8.518243761895369e-06,
"loss": 0.0711,
"step": 19800
},
{
"epoch": 26.28797886393659,
"grad_norm": 1.3379662036895752,
"learning_rate": 8.503271244179608e-06,
"loss": 0.075,
"step": 19900
},
{
"epoch": 26.42007926023778,
"grad_norm": 1.370871901512146,
"learning_rate": 8.488236763573772e-06,
"loss": 0.0717,
"step": 20000
},
{
"epoch": 26.42007926023778,
"eval_bleu": 72.10017178272712,
"eval_char_accuracy": 63.853018588583645,
"eval_loss": 0.08871379494667053,
"eval_runtime": 316.1038,
"eval_samples_per_second": 4.79,
"eval_steps_per_second": 0.601,
"step": 20000
},
{
"epoch": 26.552179656538968,
"grad_norm": 1.3396387100219727,
"learning_rate": 8.473140585995004e-06,
"loss": 0.0726,
"step": 20100
},
{
"epoch": 26.68428005284016,
"grad_norm": 1.1219794750213623,
"learning_rate": 8.457982978451683e-06,
"loss": 0.0754,
"step": 20200
},
{
"epoch": 26.81638044914135,
"grad_norm": 1.0815324783325195,
"learning_rate": 8.442764209038717e-06,
"loss": 0.0745,
"step": 20300
},
{
"epoch": 26.948480845442536,
"grad_norm": 1.4396206140518188,
"learning_rate": 8.427484546932789e-06,
"loss": 0.0749,
"step": 20400
},
{
"epoch": 27.080581241743726,
"grad_norm": 1.2644987106323242,
"learning_rate": 8.4121442623876e-06,
"loss": 0.0731,
"step": 20500
},
{
"epoch": 27.080581241743726,
"eval_bleu": 71.61514434619416,
"eval_char_accuracy": 63.06290097055437,
"eval_loss": 0.08903466165065765,
"eval_runtime": 316.4287,
"eval_samples_per_second": 4.785,
"eval_steps_per_second": 0.6,
"step": 20500
},
{
"epoch": 27.212681638044913,
"grad_norm": 1.3456913232803345,
"learning_rate": 8.396743626729093e-06,
"loss": 0.0728,
"step": 20600
},
{
"epoch": 27.344782034346103,
"grad_norm": 1.1268248558044434,
"learning_rate": 8.381282912350646e-06,
"loss": 0.072,
"step": 20700
},
{
"epoch": 27.476882430647294,
"grad_norm": 0.964856743812561,
"learning_rate": 8.365762392708259e-06,
"loss": 0.0711,
"step": 20800
},
{
"epoch": 27.60898282694848,
"grad_norm": 1.2877197265625,
"learning_rate": 8.350182342315719e-06,
"loss": 0.0681,
"step": 20900
},
{
"epoch": 27.74108322324967,
"grad_norm": 3.0796854496002197,
"learning_rate": 8.334543036739743e-06,
"loss": 0.0681,
"step": 21000
},
{
"epoch": 27.74108322324967,
"eval_bleu": 72.44362762449254,
"eval_char_accuracy": 64.56139990129955,
"eval_loss": 0.08809462934732437,
"eval_runtime": 319.538,
"eval_samples_per_second": 4.738,
"eval_steps_per_second": 0.595,
"step": 21000
},
{
"epoch": 27.873183619550858,
"grad_norm": 1.445993185043335,
"learning_rate": 8.3188447525951e-06,
"loss": 0.0701,
"step": 21100
},
{
"epoch": 28.005284015852048,
"grad_norm": 0.9414767622947693,
"learning_rate": 8.303087767539723e-06,
"loss": 0.0698,
"step": 21200
},
{
"epoch": 28.137384412153235,
"grad_norm": 1.0644490718841553,
"learning_rate": 8.28758923914531e-06,
"loss": 0.0674,
"step": 21300
},
{
"epoch": 28.269484808454425,
"grad_norm": 1.2708711624145508,
"learning_rate": 8.27171684949204e-06,
"loss": 0.0689,
"step": 21400
},
{
"epoch": 28.401585204755616,
"grad_norm": 1.2177033424377441,
"learning_rate": 8.25578659248641e-06,
"loss": 0.0677,
"step": 21500
},
{
"epoch": 28.401585204755616,
"eval_bleu": 72.7907604984095,
"eval_char_accuracy": 64.78141964138838,
"eval_loss": 0.0869230180978775,
"eval_runtime": 315.3901,
"eval_samples_per_second": 4.8,
"eval_steps_per_second": 0.602,
"step": 21500
},
{
"epoch": 28.533685601056803,
"grad_norm": 1.1179392337799072,
"learning_rate": 8.239798749889293e-06,
"loss": 0.0673,
"step": 21600
},
{
"epoch": 28.665785997357993,
"grad_norm": 1.2472251653671265,
"learning_rate": 8.223753604480086e-06,
"loss": 0.0682,
"step": 21700
},
{
"epoch": 28.79788639365918,
"grad_norm": 0.8336161375045776,
"learning_rate": 8.207651440051714e-06,
"loss": 0.0689,
"step": 21800
},
{
"epoch": 28.92998678996037,
"grad_norm": 1.2652006149291992,
"learning_rate": 8.1914925414056e-06,
"loss": 0.0688,
"step": 21900
},
{
"epoch": 29.062087186261557,
"grad_norm": 1.1424204111099243,
"learning_rate": 8.175277194346636e-06,
"loss": 0.0677,
"step": 22000
},
{
"epoch": 29.062087186261557,
"eval_bleu": 72.84923297683062,
"eval_char_accuracy": 64.63285491034709,
"eval_loss": 0.08696427941322327,
"eval_runtime": 316.7435,
"eval_samples_per_second": 4.78,
"eval_steps_per_second": 0.6,
"step": 22000
},
{
"epoch": 29.194187582562748,
"grad_norm": 1.617885708808899,
"learning_rate": 8.159005685678126e-06,
"loss": 0.0638,
"step": 22100
},
{
"epoch": 29.326287978863938,
"grad_norm": 1.2440978288650513,
"learning_rate": 8.142678303196715e-06,
"loss": 0.0606,
"step": 22200
},
{
"epoch": 29.458388375165125,
"grad_norm": 1.1825751066207886,
"learning_rate": 8.12629533568729e-06,
"loss": 0.0661,
"step": 22300
},
{
"epoch": 29.590488771466315,
"grad_norm": 1.5328364372253418,
"learning_rate": 8.109857072917887e-06,
"loss": 0.0647,
"step": 22400
},
{
"epoch": 29.722589167767502,
"grad_norm": 1.1308438777923584,
"learning_rate": 8.093363805634556e-06,
"loss": 0.0666,
"step": 22500
},
{
"epoch": 29.722589167767502,
"eval_bleu": 73.15661772633777,
"eval_char_accuracy": 63.81086527389373,
"eval_loss": 0.08648520708084106,
"eval_runtime": 315.6708,
"eval_samples_per_second": 4.796,
"eval_steps_per_second": 0.602,
"step": 22500
},
{
"epoch": 29.854689564068693,
"grad_norm": 1.0442135334014893,
"learning_rate": 8.076815825556213e-06,
"loss": 0.0648,
"step": 22600
},
{
"epoch": 29.98678996036988,
"grad_norm": 1.363897681236267,
"learning_rate": 8.060213425369492e-06,
"loss": 0.0654,
"step": 22700
},
{
"epoch": 30.11889035667107,
"grad_norm": 0.7751464247703552,
"learning_rate": 8.043556898723568e-06,
"loss": 0.0628,
"step": 22800
},
{
"epoch": 30.25099075297226,
"grad_norm": 0.8685150742530823,
"learning_rate": 8.026846540224956e-06,
"loss": 0.0584,
"step": 22900
},
{
"epoch": 30.383091149273447,
"grad_norm": 1.1484973430633545,
"learning_rate": 8.0100826454323e-06,
"loss": 0.0604,
"step": 23000
},
{
"epoch": 30.383091149273447,
"eval_bleu": 73.25107285034876,
"eval_char_accuracy": 65.01943164994243,
"eval_loss": 0.08666232973337173,
"eval_runtime": 323.7755,
"eval_samples_per_second": 4.676,
"eval_steps_per_second": 0.587,
"step": 23000
},
{
"epoch": 30.515191545574638,
"grad_norm": 0.986289918422699,
"learning_rate": 7.993265510851148e-06,
"loss": 0.0688,
"step": 23100
},
{
"epoch": 30.647291941875825,
"grad_norm": 1.0403423309326172,
"learning_rate": 7.97639543392872e-06,
"loss": 0.0638,
"step": 23200
},
{
"epoch": 30.779392338177015,
"grad_norm": 1.517040729522705,
"learning_rate": 7.959472713048617e-06,
"loss": 0.0653,
"step": 23300
},
{
"epoch": 30.911492734478202,
"grad_norm": 1.1347965002059937,
"learning_rate": 7.942497647525576e-06,
"loss": 0.0642,
"step": 23400
},
{
"epoch": 31.043593130779392,
"grad_norm": 1.0789778232574463,
"learning_rate": 7.925470537600155e-06,
"loss": 0.0614,
"step": 23500
},
{
"epoch": 31.043593130779392,
"eval_bleu": 73.23277401857816,
"eval_char_accuracy": 65.0646693535121,
"eval_loss": 0.08618722856044769,
"eval_runtime": 313.4951,
"eval_samples_per_second": 4.829,
"eval_steps_per_second": 0.606,
"step": 23500
},
{
"epoch": 31.175693527080583,
"grad_norm": 1.4853187799453735,
"learning_rate": 7.908391684433432e-06,
"loss": 0.0585,
"step": 23600
},
{
"epoch": 31.30779392338177,
"grad_norm": 0.9656835794448853,
"learning_rate": 7.891261390101675e-06,
"loss": 0.0578,
"step": 23700
},
{
"epoch": 31.43989431968296,
"grad_norm": 1.1521549224853516,
"learning_rate": 7.874079957590997e-06,
"loss": 0.0622,
"step": 23800
},
{
"epoch": 31.571994715984147,
"grad_norm": 1.0636780261993408,
"learning_rate": 7.856847690792002e-06,
"loss": 0.0604,
"step": 23900
},
{
"epoch": 31.704095112285337,
"grad_norm": 1.0833789110183716,
"learning_rate": 7.839564894494409e-06,
"loss": 0.0633,
"step": 24000
},
{
"epoch": 31.704095112285337,
"eval_bleu": 73.62536575895216,
"eval_char_accuracy": 65.15565882546471,
"eval_loss": 0.08546082675457001,
"eval_runtime": 318.0064,
"eval_samples_per_second": 4.761,
"eval_steps_per_second": 0.597,
"step": 24000
},
{
"epoch": 31.836195508586528,
"grad_norm": 1.1620845794677734,
"learning_rate": 7.822231874381658e-06,
"loss": 0.0604,
"step": 24100
},
{
"epoch": 31.968295904887714,
"grad_norm": 1.315012812614441,
"learning_rate": 7.804848937025507e-06,
"loss": 0.0593,
"step": 24200
},
{
"epoch": 32.1003963011889,
"grad_norm": 0.8739562034606934,
"learning_rate": 7.787416389880605e-06,
"loss": 0.0608,
"step": 24300
},
{
"epoch": 32.23249669749009,
"grad_norm": 0.9168538451194763,
"learning_rate": 7.769934541279059e-06,
"loss": 0.0577,
"step": 24400
},
{
"epoch": 32.36459709379128,
"grad_norm": 0.9820032715797424,
"learning_rate": 7.752403700424978e-06,
"loss": 0.0569,
"step": 24500
},
{
"epoch": 32.36459709379128,
"eval_bleu": 73.72149088930817,
"eval_char_accuracy": 65.55457312057904,
"eval_loss": 0.08627723157405853,
"eval_runtime": 314.5801,
"eval_samples_per_second": 4.813,
"eval_steps_per_second": 0.604,
"step": 24500
},
{
"epoch": 32.49669749009247,
"grad_norm": 1.0042686462402344,
"learning_rate": 7.734824177389006e-06,
"loss": 0.0582,
"step": 24600
},
{
"epoch": 32.628797886393656,
"grad_norm": 1.251654863357544,
"learning_rate": 7.71719628310283e-06,
"loss": 0.0589,
"step": 24700
},
{
"epoch": 32.760898282694846,
"grad_norm": 1.3150684833526611,
"learning_rate": 7.699520329353694e-06,
"loss": 0.0585,
"step": 24800
},
{
"epoch": 32.89299867899604,
"grad_norm": 1.318556547164917,
"learning_rate": 7.681796628778876e-06,
"loss": 0.0588,
"step": 24900
},
{
"epoch": 33.02509907529723,
"grad_norm": 1.2693874835968018,
"learning_rate": 7.664025494860155e-06,
"loss": 0.0605,
"step": 25000
},
{
"epoch": 33.02509907529723,
"eval_bleu": 73.95631775899457,
"eval_char_accuracy": 65.17056670505016,
"eval_loss": 0.08447689563035965,
"eval_runtime": 317.601,
"eval_samples_per_second": 4.767,
"eval_steps_per_second": 0.598,
"step": 25000
},
{
"epoch": 33.15719947159842,
"grad_norm": 0.7866926193237305,
"learning_rate": 7.646207241918272e-06,
"loss": 0.055,
"step": 25100
},
{
"epoch": 33.2892998678996,
"grad_norm": 1.0340533256530762,
"learning_rate": 7.628342185107373e-06,
"loss": 0.0563,
"step": 25200
},
{
"epoch": 33.42140026420079,
"grad_norm": 1.6704190969467163,
"learning_rate": 7.610430640409427e-06,
"loss": 0.0568,
"step": 25300
},
{
"epoch": 33.55350066050198,
"grad_norm": 1.4271676540374756,
"learning_rate": 7.592472924628642e-06,
"loss": 0.056,
"step": 25400
},
{
"epoch": 33.68560105680317,
"grad_norm": 1.3886315822601318,
"learning_rate": 7.574469355385865e-06,
"loss": 0.0552,
"step": 25500
},
{
"epoch": 33.68560105680317,
"eval_bleu": 73.67062952266826,
"eval_char_accuracy": 65.04410676098043,
"eval_loss": 0.08498267084360123,
"eval_runtime": 314.7186,
"eval_samples_per_second": 4.811,
"eval_steps_per_second": 0.604,
"step": 25500
},
{
"epoch": 33.81770145310436,
"grad_norm": 1.1037873029708862,
"learning_rate": 7.556420251112956e-06,
"loss": 0.0551,
"step": 25600
},
{
"epoch": 33.949801849405546,
"grad_norm": 2.125624418258667,
"learning_rate": 7.538325931047159e-06,
"loss": 0.0591,
"step": 25700
},
{
"epoch": 34.081902245706736,
"grad_norm": 1.674501895904541,
"learning_rate": 7.52018671522546e-06,
"loss": 0.0561,
"step": 25800
},
{
"epoch": 34.21400264200793,
"grad_norm": 1.386206030845642,
"learning_rate": 7.502002924478924e-06,
"loss": 0.0509,
"step": 25900
},
{
"epoch": 34.34610303830912,
"grad_norm": 1.0778214931488037,
"learning_rate": 7.48377488042701e-06,
"loss": 0.0544,
"step": 26000
},
{
"epoch": 34.34610303830912,
"eval_bleu": 74.62046528623556,
"eval_char_accuracy": 66.02442835992763,
"eval_loss": 0.08464069664478302,
"eval_runtime": 316.1374,
"eval_samples_per_second": 4.789,
"eval_steps_per_second": 0.601,
"step": 26000
},
{
"epoch": 34.4782034346103,
"grad_norm": 0.8076276779174805,
"learning_rate": 7.465502905471907e-06,
"loss": 0.055,
"step": 26100
},
{
"epoch": 34.61030383091149,
"grad_norm": 1.1508395671844482,
"learning_rate": 7.447187322792806e-06,
"loss": 0.057,
"step": 26200
},
{
"epoch": 34.74240422721268,
"grad_norm": 1.2695698738098145,
"learning_rate": 7.4288284563401945e-06,
"loss": 0.055,
"step": 26300
},
{
"epoch": 34.87450462351387,
"grad_norm": 1.166051983833313,
"learning_rate": 7.410426630830131e-06,
"loss": 0.0552,
"step": 26400
},
{
"epoch": 35.00660501981506,
"grad_norm": 0.9517413973808289,
"learning_rate": 7.391982171738496e-06,
"loss": 0.0555,
"step": 26500
},
{
"epoch": 35.00660501981506,
"eval_bleu": 74.18793581426324,
"eval_char_accuracy": 65.63373910182597,
"eval_loss": 0.08454510569572449,
"eval_runtime": 313.1895,
"eval_samples_per_second": 4.834,
"eval_steps_per_second": 0.607,
"step": 26500
},
{
"epoch": 35.138705416116245,
"grad_norm": 1.1265789270401,
"learning_rate": 7.373495405295236e-06,
"loss": 0.0529,
"step": 26600
},
{
"epoch": 35.270805812417436,
"grad_norm": 1.0067466497421265,
"learning_rate": 7.354966658478594e-06,
"loss": 0.0502,
"step": 26700
},
{
"epoch": 35.402906208718626,
"grad_norm": 0.9871610999107361,
"learning_rate": 7.336396259009325e-06,
"loss": 0.0508,
"step": 26800
},
{
"epoch": 35.53500660501982,
"grad_norm": 1.4898390769958496,
"learning_rate": 7.317784535344905e-06,
"loss": 0.0544,
"step": 26900
},
{
"epoch": 35.66710700132101,
"grad_norm": 1.1168763637542725,
"learning_rate": 7.2991318166737126e-06,
"loss": 0.0535,
"step": 27000
},
{
"epoch": 35.66710700132101,
"eval_bleu": 74.4767569683976,
"eval_char_accuracy": 65.44353512090805,
"eval_loss": 0.08464961498975754,
"eval_runtime": 313.4254,
"eval_samples_per_second": 4.83,
"eval_steps_per_second": 0.606,
"step": 27000
},
{
"epoch": 35.79920739762219,
"grad_norm": 1.197704553604126,
"learning_rate": 7.280625566954032e-06,
"loss": 0.0547,
"step": 27100
},
{
"epoch": 35.93130779392338,
"grad_norm": 1.6144860982894897,
"learning_rate": 7.261892250434568e-06,
"loss": 0.0516,
"step": 27200
},
{
"epoch": 36.06340819022457,
"grad_norm": 1.1599304676055908,
"learning_rate": 7.243118927483657e-06,
"loss": 0.0502,
"step": 27300
},
{
"epoch": 36.19550858652576,
"grad_norm": 1.043449878692627,
"learning_rate": 7.22430593014791e-06,
"loss": 0.0472,
"step": 27400
},
{
"epoch": 36.32760898282695,
"grad_norm": 1.1489434242248535,
"learning_rate": 7.205453591175666e-06,
"loss": 0.0558,
"step": 27500
},
{
"epoch": 36.32760898282695,
"eval_bleu": 74.20946250489693,
"eval_char_accuracy": 65.89436996216483,
"eval_loss": 0.08468983322381973,
"eval_runtime": 314.7938,
"eval_samples_per_second": 4.809,
"eval_steps_per_second": 0.604,
"step": 27500
},
{
"epoch": 36.32760898282695,
"step": 27500,
"total_flos": 7035660725649408.0,
"train_loss": 0.282735899699818,
"train_runtime": 21727.9767,
"train_samples_per_second": 55.735,
"train_steps_per_second": 3.484
}
],
"logging_steps": 100,
"max_steps": 75700,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7035660725649408.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}