{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 35.08849557522124,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.4424778761061947,
"grad_norm": 70.23552703857422,
"learning_rate": 5.000000000000001e-07,
"loss": 1.0554,
"step": 25
},
{
"epoch": 0.8849557522123894,
"grad_norm": 19.844127655029297,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.7061,
"step": 50
},
{
"epoch": 1.3185840707964602,
"grad_norm": 7.727048397064209,
"learning_rate": 1.5e-06,
"loss": 0.3636,
"step": 75
},
{
"epoch": 1.7610619469026547,
"grad_norm": 4.834627628326416,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.1976,
"step": 100
},
{
"epoch": 2.1946902654867255,
"grad_norm": 4.151981830596924,
"learning_rate": 2.5e-06,
"loss": 0.1237,
"step": 125
},
{
"epoch": 2.6371681415929205,
"grad_norm": 3.742462635040283,
"learning_rate": 3e-06,
"loss": 0.1117,
"step": 150
},
{
"epoch": 3.0707964601769913,
"grad_norm": 4.222021102905273,
"learning_rate": 3.5e-06,
"loss": 0.0927,
"step": 175
},
{
"epoch": 3.5132743362831858,
"grad_norm": 3.9585046768188477,
"learning_rate": 4.000000000000001e-06,
"loss": 0.067,
"step": 200
},
{
"epoch": 3.9557522123893807,
"grad_norm": 2.4528427124023438,
"learning_rate": 4.5e-06,
"loss": 0.0605,
"step": 225
},
{
"epoch": 4.389380530973451,
"grad_norm": 3.324159622192383,
"learning_rate": 5e-06,
"loss": 0.0598,
"step": 250
},
{
"epoch": 4.831858407079646,
"grad_norm": 3.3953816890716553,
"learning_rate": 5.500000000000001e-06,
"loss": 0.0543,
"step": 275
},
{
"epoch": 5.265486725663717,
"grad_norm": 4.006282329559326,
"learning_rate": 6e-06,
"loss": 0.0372,
"step": 300
},
{
"epoch": 5.707964601769912,
"grad_norm": 0.8992710113525391,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.0382,
"step": 325
},
{
"epoch": 6.1415929203539825,
"grad_norm": 1.6717145442962646,
"learning_rate": 7e-06,
"loss": 0.0379,
"step": 350
},
{
"epoch": 6.584070796460177,
"grad_norm": 1.683933973312378,
"learning_rate": 7.500000000000001e-06,
"loss": 0.0354,
"step": 375
},
{
"epoch": 7.017699115044247,
"grad_norm": 4.067246437072754,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0267,
"step": 400
},
{
"epoch": 7.460176991150442,
"grad_norm": 0.14960134029388428,
"learning_rate": 8.5e-06,
"loss": 0.0215,
"step": 425
},
{
"epoch": 7.902654867256637,
"grad_norm": 2.5055603981018066,
"learning_rate": 9e-06,
"loss": 0.0294,
"step": 450
},
{
"epoch": 8.336283185840708,
"grad_norm": 2.1897315979003906,
"learning_rate": 9.5e-06,
"loss": 0.0202,
"step": 475
},
{
"epoch": 8.778761061946902,
"grad_norm": 2.6399195194244385,
"learning_rate": 1e-05,
"loss": 0.0195,
"step": 500
},
{
"epoch": 8.778761061946902,
"eval_loss": 3.6442341804504395,
"eval_runtime": 98.5424,
"eval_samples_per_second": 1.005,
"eval_steps_per_second": 0.071,
"eval_wer": 1.2195824334053276,
"step": 500
},
{
"epoch": 9.212389380530974,
"grad_norm": 1.9207507371902466,
"learning_rate": 9.833333333333333e-06,
"loss": 0.0136,
"step": 525
},
{
"epoch": 9.654867256637168,
"grad_norm": 1.3496884107589722,
"learning_rate": 9.666666666666667e-06,
"loss": 0.0184,
"step": 550
},
{
"epoch": 10.08849557522124,
"grad_norm": 1.9232096672058105,
"learning_rate": 9.5e-06,
"loss": 0.0194,
"step": 575
},
{
"epoch": 10.530973451327434,
"grad_norm": 2.705425977706909,
"learning_rate": 9.333333333333334e-06,
"loss": 0.0139,
"step": 600
},
{
"epoch": 10.973451327433628,
"grad_norm": 0.18757027387619019,
"learning_rate": 9.166666666666666e-06,
"loss": 0.0075,
"step": 625
},
{
"epoch": 11.4070796460177,
"grad_norm": 2.2813608646392822,
"learning_rate": 9e-06,
"loss": 0.0088,
"step": 650
},
{
"epoch": 11.849557522123893,
"grad_norm": 0.2083648443222046,
"learning_rate": 8.833333333333334e-06,
"loss": 0.0108,
"step": 675
},
{
"epoch": 12.283185840707965,
"grad_norm": 3.0644192695617676,
"learning_rate": 8.666666666666668e-06,
"loss": 0.0088,
"step": 700
},
{
"epoch": 12.725663716814159,
"grad_norm": 0.33310991525650024,
"learning_rate": 8.5e-06,
"loss": 0.007,
"step": 725
},
{
"epoch": 13.15929203539823,
"grad_norm": 0.6558440923690796,
"learning_rate": 8.333333333333334e-06,
"loss": 0.0071,
"step": 750
},
{
"epoch": 13.601769911504425,
"grad_norm": 0.04615064710378647,
"learning_rate": 8.166666666666668e-06,
"loss": 0.0066,
"step": 775
},
{
"epoch": 14.035398230088495,
"grad_norm": 2.2992634773254395,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0051,
"step": 800
},
{
"epoch": 14.47787610619469,
"grad_norm": 1.2088443040847778,
"learning_rate": 7.833333333333333e-06,
"loss": 0.0035,
"step": 825
},
{
"epoch": 14.920353982300885,
"grad_norm": 1.7774810791015625,
"learning_rate": 7.666666666666667e-06,
"loss": 0.0059,
"step": 850
},
{
"epoch": 15.353982300884956,
"grad_norm": 0.07690909504890442,
"learning_rate": 7.500000000000001e-06,
"loss": 0.0047,
"step": 875
},
{
"epoch": 15.79646017699115,
"grad_norm": 1.2860054969787598,
"learning_rate": 7.333333333333333e-06,
"loss": 0.0053,
"step": 900
},
{
"epoch": 16.23008849557522,
"grad_norm": 0.6857427358627319,
"learning_rate": 7.166666666666667e-06,
"loss": 0.0023,
"step": 925
},
{
"epoch": 16.672566371681416,
"grad_norm": 0.7444378733634949,
"learning_rate": 7e-06,
"loss": 0.0043,
"step": 950
},
{
"epoch": 17.106194690265486,
"grad_norm": 0.0285570677369833,
"learning_rate": 6.833333333333334e-06,
"loss": 0.0033,
"step": 975
},
{
"epoch": 17.548672566371682,
"grad_norm": 1.755213737487793,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0044,
"step": 1000
},
{
"epoch": 17.548672566371682,
"eval_loss": 4.271320343017578,
"eval_runtime": 87.4509,
"eval_samples_per_second": 1.132,
"eval_steps_per_second": 0.08,
"eval_wer": 1.2203023758099352,
"step": 1000
},
{
"epoch": 17.991150442477878,
"grad_norm": 3.525615930557251,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.004,
"step": 1025
},
{
"epoch": 18.424778761061948,
"grad_norm": 1.9330180883407593,
"learning_rate": 6.333333333333333e-06,
"loss": 0.0026,
"step": 1050
},
{
"epoch": 18.86725663716814,
"grad_norm": 0.3188052475452423,
"learning_rate": 6.166666666666667e-06,
"loss": 0.0046,
"step": 1075
},
{
"epoch": 19.300884955752213,
"grad_norm": 1.5139490365982056,
"learning_rate": 6e-06,
"loss": 0.0042,
"step": 1100
},
{
"epoch": 19.743362831858406,
"grad_norm": 5.1269354820251465,
"learning_rate": 5.833333333333334e-06,
"loss": 0.0032,
"step": 1125
},
{
"epoch": 20.17699115044248,
"grad_norm": 0.024498550221323967,
"learning_rate": 5.666666666666667e-06,
"loss": 0.0029,
"step": 1150
},
{
"epoch": 20.61946902654867,
"grad_norm": 0.017963914200663567,
"learning_rate": 5.500000000000001e-06,
"loss": 0.0011,
"step": 1175
},
{
"epoch": 21.053097345132745,
"grad_norm": 0.10410400480031967,
"learning_rate": 5.333333333333334e-06,
"loss": 0.0008,
"step": 1200
},
{
"epoch": 21.495575221238937,
"grad_norm": 0.05556880682706833,
"learning_rate": 5.1666666666666675e-06,
"loss": 0.0003,
"step": 1225
},
{
"epoch": 21.938053097345133,
"grad_norm": 0.005813132505863905,
"learning_rate": 5e-06,
"loss": 0.0004,
"step": 1250
},
{
"epoch": 22.371681415929203,
"grad_norm": 0.008638879284262657,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0003,
"step": 1275
},
{
"epoch": 22.8141592920354,
"grad_norm": 0.033254146575927734,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0003,
"step": 1300
},
{
"epoch": 23.24778761061947,
"grad_norm": 0.018534550443291664,
"learning_rate": 4.5e-06,
"loss": 0.0002,
"step": 1325
},
{
"epoch": 23.690265486725664,
"grad_norm": 0.00659082131460309,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0003,
"step": 1350
},
{
"epoch": 24.123893805309734,
"grad_norm": 0.009523949585855007,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0002,
"step": 1375
},
{
"epoch": 24.56637168141593,
"grad_norm": 0.028162825852632523,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0002,
"step": 1400
},
{
"epoch": 25.0,
"grad_norm": 0.012596463784575462,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0001,
"step": 1425
},
{
"epoch": 25.442477876106196,
"grad_norm": 0.006055805366486311,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0002,
"step": 1450
},
{
"epoch": 25.884955752212388,
"grad_norm": 0.01260603778064251,
"learning_rate": 3.5e-06,
"loss": 0.0002,
"step": 1475
},
{
"epoch": 26.31858407079646,
"grad_norm": 0.02524421364068985,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0002,
"step": 1500
},
{
"epoch": 26.31858407079646,
"eval_loss": 4.261180877685547,
"eval_runtime": 92.5387,
"eval_samples_per_second": 1.07,
"eval_steps_per_second": 0.076,
"eval_wer": 1.2203023758099352,
"step": 1500
},
{
"epoch": 26.761061946902654,
"grad_norm": 0.04416137561202049,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.0002,
"step": 1525
},
{
"epoch": 27.194690265486727,
"grad_norm": 0.002192295156419277,
"learning_rate": 3e-06,
"loss": 0.0002,
"step": 1550
},
{
"epoch": 27.63716814159292,
"grad_norm": 0.005449674092233181,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0002,
"step": 1575
},
{
"epoch": 28.07079646017699,
"grad_norm": 0.02437576837837696,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0002,
"step": 1600
},
{
"epoch": 28.513274336283185,
"grad_norm": 0.042830660939216614,
"learning_rate": 2.5e-06,
"loss": 0.0002,
"step": 1625
},
{
"epoch": 28.95575221238938,
"grad_norm": 0.0039032117929309607,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0001,
"step": 1650
},
{
"epoch": 29.38938053097345,
"grad_norm": 0.008059768006205559,
"learning_rate": 2.166666666666667e-06,
"loss": 0.0002,
"step": 1675
},
{
"epoch": 29.831858407079647,
"grad_norm": 0.012063885107636452,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0002,
"step": 1700
},
{
"epoch": 30.265486725663717,
"grad_norm": 0.04550086334347725,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.0001,
"step": 1725
},
{
"epoch": 30.707964601769913,
"grad_norm": 0.0046273404732346535,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0001,
"step": 1750
},
{
"epoch": 31.141592920353983,
"grad_norm": 0.00888855941593647,
"learning_rate": 1.5e-06,
"loss": 0.0002,
"step": 1775
},
{
"epoch": 31.58407079646018,
"grad_norm": 0.011799165047705173,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0002,
"step": 1800
},
{
"epoch": 32.017699115044245,
"grad_norm": 0.033783987164497375,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0001,
"step": 1825
},
{
"epoch": 32.46017699115044,
"grad_norm": 0.002975311130285263,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0001,
"step": 1850
},
{
"epoch": 32.902654867256636,
"grad_norm": 0.009084297344088554,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0002,
"step": 1875
},
{
"epoch": 33.336283185840706,
"grad_norm": 0.011445912532508373,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0002,
"step": 1900
},
{
"epoch": 33.7787610619469,
"grad_norm": 0.025164591148495674,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0001,
"step": 1925
},
{
"epoch": 34.21238938053097,
"grad_norm": 0.00487129669636488,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0001,
"step": 1950
},
{
"epoch": 34.65486725663717,
"grad_norm": 0.014068867079913616,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.0002,
"step": 1975
},
{
"epoch": 35.08849557522124,
"grad_norm": 0.009848229587078094,
"learning_rate": 0.0,
"loss": 0.0001,
"step": 2000
},
{
"epoch": 35.08849557522124,
"eval_loss": 4.253291606903076,
"eval_runtime": 97.0898,
"eval_samples_per_second": 1.02,
"eval_steps_per_second": 0.072,
"eval_wer": 1.2203023758099352,
"step": 2000
},
{
"epoch": 35.08849557522124,
"step": 2000,
"total_flos": 1.83078577963008e+19,
"train_loss": 0.0418594888363732,
"train_runtime": 32879.5132,
"train_samples_per_second": 1.947,
"train_steps_per_second": 0.061
}
],
"logging_steps": 25,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 36,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.83078577963008e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}