legis-tucano-2b-valid / trainer_state.json
felipeoes's picture
Model save
79949ca verified
{
"best_metric": 0.7003,
"best_model_checkpoint": "runs/legis-tucano-2b-valid/checkpoint-710",
"epoch": 0.9994065281899109,
"eval_steps": 10,
"global_step": 842,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011869436201780416,
"grad_norm": 1.3277966052420052,
"learning_rate": 2.3529411764705884e-05,
"loss": 1.3563,
"step": 10
},
{
"epoch": 0.011869436201780416,
"eval_loss": 1.2333489656448364,
"eval_runtime": 7.1731,
"eval_samples_per_second": 7.249,
"eval_steps_per_second": 0.976,
"step": 10
},
{
"epoch": 0.02373887240356083,
"grad_norm": 0.2988625315142482,
"learning_rate": 4.705882352941177e-05,
"loss": 1.2538,
"step": 20
},
{
"epoch": 0.02373887240356083,
"eval_loss": 1.117323398590088,
"eval_runtime": 7.1487,
"eval_samples_per_second": 7.274,
"eval_steps_per_second": 0.979,
"step": 20
},
{
"epoch": 0.03560830860534125,
"grad_norm": 0.15105150205016427,
"learning_rate": 7.058823529411765e-05,
"loss": 1.1462,
"step": 30
},
{
"epoch": 0.03560830860534125,
"eval_loss": 1.0231214761734009,
"eval_runtime": 7.1358,
"eval_samples_per_second": 7.287,
"eval_steps_per_second": 0.981,
"step": 30
},
{
"epoch": 0.04747774480712166,
"grad_norm": 0.07095151302909826,
"learning_rate": 9.411764705882353e-05,
"loss": 1.0635,
"step": 40
},
{
"epoch": 0.04747774480712166,
"eval_loss": 0.9480358958244324,
"eval_runtime": 7.1596,
"eval_samples_per_second": 7.263,
"eval_steps_per_second": 0.978,
"step": 40
},
{
"epoch": 0.05934718100890208,
"grad_norm": 0.0510771961177552,
"learning_rate": 0.00011764705882352942,
"loss": 0.9848,
"step": 50
},
{
"epoch": 0.05934718100890208,
"eval_loss": 0.8925089240074158,
"eval_runtime": 7.1518,
"eval_samples_per_second": 7.271,
"eval_steps_per_second": 0.979,
"step": 50
},
{
"epoch": 0.0712166172106825,
"grad_norm": 0.040539214461615646,
"learning_rate": 0.0001411764705882353,
"loss": 0.9513,
"step": 60
},
{
"epoch": 0.0712166172106825,
"eval_loss": 0.8546838164329529,
"eval_runtime": 7.1461,
"eval_samples_per_second": 7.277,
"eval_steps_per_second": 0.98,
"step": 60
},
{
"epoch": 0.0830860534124629,
"grad_norm": 0.038791558909017074,
"learning_rate": 0.0001647058823529412,
"loss": 0.9142,
"step": 70
},
{
"epoch": 0.0830860534124629,
"eval_loss": 0.8263636231422424,
"eval_runtime": 7.3085,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 0.958,
"step": 70
},
{
"epoch": 0.09495548961424333,
"grad_norm": 0.04169274461251293,
"learning_rate": 0.00018823529411764707,
"loss": 0.8854,
"step": 80
},
{
"epoch": 0.09495548961424333,
"eval_loss": 0.8020428419113159,
"eval_runtime": 7.247,
"eval_samples_per_second": 7.175,
"eval_steps_per_second": 0.966,
"step": 80
},
{
"epoch": 0.10682492581602374,
"grad_norm": 0.043015312738591396,
"learning_rate": 0.00019997847206287532,
"loss": 0.8632,
"step": 90
},
{
"epoch": 0.10682492581602374,
"eval_loss": 0.7831454873085022,
"eval_runtime": 7.2244,
"eval_samples_per_second": 7.198,
"eval_steps_per_second": 0.969,
"step": 90
},
{
"epoch": 0.11869436201780416,
"grad_norm": 0.05736189956437236,
"learning_rate": 0.00019980630417613612,
"loss": 0.8669,
"step": 100
},
{
"epoch": 0.11869436201780416,
"eval_loss": 0.7694521546363831,
"eval_runtime": 7.2277,
"eval_samples_per_second": 7.194,
"eval_steps_per_second": 0.968,
"step": 100
},
{
"epoch": 0.13056379821958458,
"grad_norm": 0.04913014792038086,
"learning_rate": 0.0001994622648842964,
"loss": 0.8314,
"step": 110
},
{
"epoch": 0.13056379821958458,
"eval_loss": 0.7584220170974731,
"eval_runtime": 7.2488,
"eval_samples_per_second": 7.174,
"eval_steps_per_second": 0.966,
"step": 110
},
{
"epoch": 0.142433234421365,
"grad_norm": 0.04446480334718679,
"learning_rate": 0.00019894694664007728,
"loss": 0.8223,
"step": 120
},
{
"epoch": 0.142433234421365,
"eval_loss": 0.749190092086792,
"eval_runtime": 7.2447,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.966,
"step": 120
},
{
"epoch": 0.1543026706231454,
"grad_norm": 0.05216966997945006,
"learning_rate": 0.00019826123684704952,
"loss": 0.8141,
"step": 130
},
{
"epoch": 0.1543026706231454,
"eval_loss": 0.7411435842514038,
"eval_runtime": 7.2499,
"eval_samples_per_second": 7.173,
"eval_steps_per_second": 0.966,
"step": 130
},
{
"epoch": 0.1661721068249258,
"grad_norm": 0.04233605582577254,
"learning_rate": 0.00019740631633148045,
"loss": 0.8023,
"step": 140
},
{
"epoch": 0.1661721068249258,
"eval_loss": 0.7342764139175415,
"eval_runtime": 7.2264,
"eval_samples_per_second": 7.196,
"eval_steps_per_second": 0.969,
"step": 140
},
{
"epoch": 0.17804154302670624,
"grad_norm": 0.04704056707854844,
"learning_rate": 0.00019638365730889265,
"loss": 0.7895,
"step": 150
},
{
"epoch": 0.17804154302670624,
"eval_loss": 0.7279341220855713,
"eval_runtime": 7.2397,
"eval_samples_per_second": 7.183,
"eval_steps_per_second": 0.967,
"step": 150
},
{
"epoch": 0.18991097922848665,
"grad_norm": 0.044026531833322384,
"learning_rate": 0.00019519502084883582,
"loss": 0.7916,
"step": 160
},
{
"epoch": 0.18991097922848665,
"eval_loss": 0.7229446768760681,
"eval_runtime": 7.2583,
"eval_samples_per_second": 7.164,
"eval_steps_per_second": 0.964,
"step": 160
},
{
"epoch": 0.20178041543026706,
"grad_norm": 0.05191891103370369,
"learning_rate": 0.00019384245384223767,
"loss": 0.791,
"step": 170
},
{
"epoch": 0.20178041543026706,
"eval_loss": 0.7180152535438538,
"eval_runtime": 7.2578,
"eval_samples_per_second": 7.165,
"eval_steps_per_second": 0.964,
"step": 170
},
{
"epoch": 0.21364985163204747,
"grad_norm": 0.05572444654938424,
"learning_rate": 0.00019232828547655615,
"loss": 0.7955,
"step": 180
},
{
"epoch": 0.21364985163204747,
"eval_loss": 0.7139907479286194,
"eval_runtime": 7.2429,
"eval_samples_per_second": 7.179,
"eval_steps_per_second": 0.966,
"step": 180
},
{
"epoch": 0.22551928783382788,
"grad_norm": 0.0516837327891167,
"learning_rate": 0.00019065512322480332,
"loss": 0.7776,
"step": 190
},
{
"epoch": 0.22551928783382788,
"eval_loss": 0.7107158899307251,
"eval_runtime": 7.2537,
"eval_samples_per_second": 7.169,
"eval_steps_per_second": 0.965,
"step": 190
},
{
"epoch": 0.23738872403560832,
"grad_norm": 0.050846230869752186,
"learning_rate": 0.00018882584835534737,
"loss": 0.785,
"step": 200
},
{
"epoch": 0.23738872403560832,
"eval_loss": 0.7068641185760498,
"eval_runtime": 7.2578,
"eval_samples_per_second": 7.165,
"eval_steps_per_second": 0.964,
"step": 200
},
{
"epoch": 0.24925816023738873,
"grad_norm": 0.05343887989356088,
"learning_rate": 0.00018684361097022568,
"loss": 0.7825,
"step": 210
},
{
"epoch": 0.24925816023738873,
"eval_loss": 0.703826904296875,
"eval_runtime": 7.2465,
"eval_samples_per_second": 7.176,
"eval_steps_per_second": 0.966,
"step": 210
},
{
"epoch": 0.26112759643916916,
"grad_norm": 0.05242956764694665,
"learning_rate": 0.00018471182458051287,
"loss": 0.7539,
"step": 220
},
{
"epoch": 0.26112759643916916,
"eval_loss": 0.7010646462440491,
"eval_runtime": 7.2455,
"eval_samples_per_second": 7.177,
"eval_steps_per_second": 0.966,
"step": 220
},
{
"epoch": 0.27299703264094954,
"grad_norm": 0.051064873049660696,
"learning_rate": 0.00018243416022808547,
"loss": 0.7785,
"step": 230
},
{
"epoch": 0.27299703264094954,
"eval_loss": 0.6979869604110718,
"eval_runtime": 7.2598,
"eval_samples_per_second": 7.163,
"eval_steps_per_second": 0.964,
"step": 230
},
{
"epoch": 0.28486646884273,
"grad_norm": 0.046329966712940596,
"learning_rate": 0.00018001454016390586,
"loss": 0.7792,
"step": 240
},
{
"epoch": 0.28486646884273,
"eval_loss": 0.6954675912857056,
"eval_runtime": 7.2628,
"eval_samples_per_second": 7.16,
"eval_steps_per_second": 0.964,
"step": 240
},
{
"epoch": 0.29673590504451036,
"grad_norm": 0.05390500282652213,
"learning_rate": 0.00017745713109371139,
"loss": 0.753,
"step": 250
},
{
"epoch": 0.29673590504451036,
"eval_loss": 0.6930329203605652,
"eval_runtime": 7.2472,
"eval_samples_per_second": 7.175,
"eval_steps_per_second": 0.966,
"step": 250
},
{
"epoch": 0.3086053412462908,
"grad_norm": 0.043546536548247354,
"learning_rate": 0.00017476633700274068,
"loss": 0.765,
"step": 260
},
{
"epoch": 0.3086053412462908,
"eval_loss": 0.6907065510749817,
"eval_runtime": 7.2525,
"eval_samples_per_second": 7.17,
"eval_steps_per_second": 0.965,
"step": 260
},
{
"epoch": 0.32047477744807124,
"grad_norm": 0.06771031218563213,
"learning_rate": 0.00017194679157185255,
"loss": 0.7464,
"step": 270
},
{
"epoch": 0.32047477744807124,
"eval_loss": 0.6883173584938049,
"eval_runtime": 7.1658,
"eval_samples_per_second": 7.257,
"eval_steps_per_second": 0.977,
"step": 270
},
{
"epoch": 0.3323442136498516,
"grad_norm": 0.04731592266489432,
"learning_rate": 0.00016900335019809782,
"loss": 0.7649,
"step": 280
},
{
"epoch": 0.3323442136498516,
"eval_loss": 0.6865535378456116,
"eval_runtime": 7.1591,
"eval_samples_per_second": 7.264,
"eval_steps_per_second": 0.978,
"step": 280
},
{
"epoch": 0.34421364985163205,
"grad_norm": 0.05422567227576833,
"learning_rate": 0.00016594108163348493,
"loss": 0.764,
"step": 290
},
{
"epoch": 0.34421364985163205,
"eval_loss": 0.6839883327484131,
"eval_runtime": 7.1803,
"eval_samples_per_second": 7.242,
"eval_steps_per_second": 0.975,
"step": 290
},
{
"epoch": 0.3560830860534125,
"grad_norm": 0.04716111383242718,
"learning_rate": 0.0001627652592563373,
"loss": 0.7457,
"step": 300
},
{
"epoch": 0.3560830860534125,
"eval_loss": 0.6814995408058167,
"eval_runtime": 7.1545,
"eval_samples_per_second": 7.268,
"eval_steps_per_second": 0.978,
"step": 300
},
{
"epoch": 0.36795252225519287,
"grad_norm": 0.04833237070306901,
"learning_rate": 0.00015948135199027474,
"loss": 0.7427,
"step": 310
},
{
"epoch": 0.36795252225519287,
"eval_loss": 0.6795143485069275,
"eval_runtime": 7.164,
"eval_samples_per_second": 7.258,
"eval_steps_per_second": 0.977,
"step": 310
},
{
"epoch": 0.3798219584569733,
"grad_norm": 0.04790062682337991,
"learning_rate": 0.00015609501488645554,
"loss": 0.7564,
"step": 320
},
{
"epoch": 0.3798219584569733,
"eval_loss": 0.6774563193321228,
"eval_runtime": 7.1529,
"eval_samples_per_second": 7.27,
"eval_steps_per_second": 0.979,
"step": 320
},
{
"epoch": 0.3916913946587537,
"grad_norm": 0.04849257718202548,
"learning_rate": 0.00015261207938529808,
"loss": 0.7352,
"step": 330
},
{
"epoch": 0.3916913946587537,
"eval_loss": 0.676394522190094,
"eval_runtime": 7.1522,
"eval_samples_per_second": 7.27,
"eval_steps_per_second": 0.979,
"step": 330
},
{
"epoch": 0.4035608308605341,
"grad_norm": 0.0449523115474861,
"learning_rate": 0.00014903854327445116,
"loss": 0.7471,
"step": 340
},
{
"epoch": 0.4035608308605341,
"eval_loss": 0.6733577251434326,
"eval_runtime": 7.1516,
"eval_samples_per_second": 7.271,
"eval_steps_per_second": 0.979,
"step": 340
},
{
"epoch": 0.41543026706231456,
"grad_norm": 0.044399706693004984,
"learning_rate": 0.00014538056036030622,
"loss": 0.7408,
"step": 350
},
{
"epoch": 0.41543026706231456,
"eval_loss": 0.6722955107688904,
"eval_runtime": 7.1523,
"eval_samples_per_second": 7.27,
"eval_steps_per_second": 0.979,
"step": 350
},
{
"epoch": 0.42729970326409494,
"grad_norm": 0.04749195230072245,
"learning_rate": 0.00014164442987083762,
"loss": 0.7493,
"step": 360
},
{
"epoch": 0.42729970326409494,
"eval_loss": 0.6708287000656128,
"eval_runtime": 7.158,
"eval_samples_per_second": 7.265,
"eval_steps_per_second": 0.978,
"step": 360
},
{
"epoch": 0.4391691394658754,
"grad_norm": 0.04622607097065686,
"learning_rate": 0.0001378365856080198,
"loss": 0.7375,
"step": 370
},
{
"epoch": 0.4391691394658754,
"eval_loss": 0.6693353056907654,
"eval_runtime": 7.1529,
"eval_samples_per_second": 7.27,
"eval_steps_per_second": 0.979,
"step": 370
},
{
"epoch": 0.45103857566765576,
"grad_norm": 0.04779103392401475,
"learning_rate": 0.00013396358486850103,
"loss": 0.7469,
"step": 380
},
{
"epoch": 0.45103857566765576,
"eval_loss": 0.6675475835800171,
"eval_runtime": 7.1609,
"eval_samples_per_second": 7.262,
"eval_steps_per_second": 0.978,
"step": 380
},
{
"epoch": 0.4629080118694362,
"grad_norm": 0.04367335948535579,
"learning_rate": 0.0001300320971516136,
"loss": 0.7388,
"step": 390
},
{
"epoch": 0.4629080118694362,
"eval_loss": 0.6652542948722839,
"eval_runtime": 7.1535,
"eval_samples_per_second": 7.269,
"eval_steps_per_second": 0.979,
"step": 390
},
{
"epoch": 0.47477744807121663,
"grad_norm": 0.04906513653224005,
"learning_rate": 0.0001260488926741651,
"loss": 0.7434,
"step": 400
},
{
"epoch": 0.47477744807121663,
"eval_loss": 0.6640903949737549,
"eval_runtime": 7.1588,
"eval_samples_per_second": 7.264,
"eval_steps_per_second": 0.978,
"step": 400
},
{
"epoch": 0.486646884272997,
"grad_norm": 0.04507001139303972,
"learning_rate": 0.00012202083071178938,
"loss": 0.7271,
"step": 410
},
{
"epoch": 0.486646884272997,
"eval_loss": 0.6632689237594604,
"eval_runtime": 7.1544,
"eval_samples_per_second": 7.268,
"eval_steps_per_second": 0.978,
"step": 410
},
{
"epoch": 0.49851632047477745,
"grad_norm": 0.054746961897722525,
"learning_rate": 0.00011795484778693382,
"loss": 0.731,
"step": 420
},
{
"epoch": 0.49851632047477745,
"eval_loss": 0.6615473031997681,
"eval_runtime": 7.1789,
"eval_samples_per_second": 7.243,
"eval_steps_per_second": 0.975,
"step": 420
},
{
"epoch": 0.5103857566765578,
"grad_norm": 0.04685925852904185,
"learning_rate": 0.00011385794572382357,
"loss": 0.7571,
"step": 430
},
{
"epoch": 0.5103857566765578,
"eval_loss": 0.660754919052124,
"eval_runtime": 7.1679,
"eval_samples_per_second": 7.255,
"eval_steps_per_second": 0.977,
"step": 430
},
{
"epoch": 0.5222551928783383,
"grad_norm": 0.05188129116000765,
"learning_rate": 0.00010973717959097327,
"loss": 0.7355,
"step": 440
},
{
"epoch": 0.5222551928783383,
"eval_loss": 0.6598291993141174,
"eval_runtime": 7.1682,
"eval_samples_per_second": 7.254,
"eval_steps_per_second": 0.977,
"step": 440
},
{
"epoch": 0.5341246290801187,
"grad_norm": 0.04549793036088958,
"learning_rate": 0.00010559964555200889,
"loss": 0.7379,
"step": 450
},
{
"epoch": 0.5341246290801187,
"eval_loss": 0.658793568611145,
"eval_runtime": 7.1553,
"eval_samples_per_second": 7.267,
"eval_steps_per_second": 0.978,
"step": 450
},
{
"epoch": 0.5459940652818991,
"grad_norm": 0.05162604134236413,
"learning_rate": 0.0001014524686457223,
"loss": 0.7363,
"step": 460
},
{
"epoch": 0.5459940652818991,
"eval_loss": 0.6571336984634399,
"eval_runtime": 7.156,
"eval_samples_per_second": 7.267,
"eval_steps_per_second": 0.978,
"step": 460
},
{
"epoch": 0.5578635014836796,
"grad_norm": 0.04783457254038487,
"learning_rate": 9.730279051640112e-05,
"loss": 0.7177,
"step": 470
},
{
"epoch": 0.5578635014836796,
"eval_loss": 0.655822217464447,
"eval_runtime": 7.1579,
"eval_samples_per_second": 7.265,
"eval_steps_per_second": 0.978,
"step": 470
},
{
"epoch": 0.56973293768546,
"grad_norm": 0.04436373760940458,
"learning_rate": 9.315775711556298e-05,
"loss": 0.7334,
"step": 480
},
{
"epoch": 0.56973293768546,
"eval_loss": 0.6540331244468689,
"eval_runtime": 7.1522,
"eval_samples_per_second": 7.271,
"eval_steps_per_second": 0.979,
"step": 480
},
{
"epoch": 0.5816023738872403,
"grad_norm": 0.04519231140145815,
"learning_rate": 8.902450639627288e-05,
"loss": 0.737,
"step": 490
},
{
"epoch": 0.5816023738872403,
"eval_loss": 0.6534115672111511,
"eval_runtime": 7.156,
"eval_samples_per_second": 7.267,
"eval_steps_per_second": 0.978,
"step": 490
},
{
"epoch": 0.5934718100890207,
"grad_norm": 0.04247565082967707,
"learning_rate": 8.491015602123368e-05,
"loss": 0.7413,
"step": 500
},
{
"epoch": 0.5934718100890207,
"eval_loss": 0.6529149413108826,
"eval_runtime": 7.1578,
"eval_samples_per_second": 7.265,
"eval_steps_per_second": 0.978,
"step": 500
},
{
"epoch": 0.6053412462908012,
"grad_norm": 0.048070529053236284,
"learning_rate": 8.082179110581838e-05,
"loss": 0.7141,
"step": 510
},
{
"epoch": 0.6053412462908012,
"eval_loss": 0.651964545249939,
"eval_runtime": 7.1573,
"eval_samples_per_second": 7.265,
"eval_steps_per_second": 0.978,
"step": 510
},
{
"epoch": 0.6172106824925816,
"grad_norm": 0.04071752578010737,
"learning_rate": 7.676645201714983e-05,
"loss": 0.7248,
"step": 520
},
{
"epoch": 0.6172106824925816,
"eval_loss": 0.6511014103889465,
"eval_runtime": 7.1566,
"eval_samples_per_second": 7.266,
"eval_steps_per_second": 0.978,
"step": 520
},
{
"epoch": 0.629080118694362,
"grad_norm": 0.04168337358828772,
"learning_rate": 7.27511222502395e-05,
"loss": 0.725,
"step": 530
},
{
"epoch": 0.629080118694362,
"eval_loss": 0.6503429412841797,
"eval_runtime": 7.1538,
"eval_samples_per_second": 7.269,
"eval_steps_per_second": 0.979,
"step": 530
},
{
"epoch": 0.6409495548961425,
"grad_norm": 0.05260277652628936,
"learning_rate": 6.878271640206281e-05,
"loss": 0.7299,
"step": 540
},
{
"epoch": 0.6409495548961425,
"eval_loss": 0.6499541401863098,
"eval_runtime": 7.1571,
"eval_samples_per_second": 7.265,
"eval_steps_per_second": 0.978,
"step": 540
},
{
"epoch": 0.6528189910979229,
"grad_norm": 0.04558705692362712,
"learning_rate": 6.486806826428016e-05,
"loss": 0.7381,
"step": 550
},
{
"epoch": 0.6528189910979229,
"eval_loss": 0.6490299701690674,
"eval_runtime": 7.1586,
"eval_samples_per_second": 7.264,
"eval_steps_per_second": 0.978,
"step": 550
},
{
"epoch": 0.6646884272997032,
"grad_norm": 0.046779386203552986,
"learning_rate": 6.101391905510889e-05,
"loss": 0.7052,
"step": 560
},
{
"epoch": 0.6646884272997032,
"eval_loss": 0.6482182741165161,
"eval_runtime": 7.1587,
"eval_samples_per_second": 7.264,
"eval_steps_per_second": 0.978,
"step": 560
},
{
"epoch": 0.6765578635014837,
"grad_norm": 0.04433666961158006,
"learning_rate": 5.72269058106111e-05,
"loss": 0.7267,
"step": 570
},
{
"epoch": 0.6765578635014837,
"eval_loss": 0.6474419832229614,
"eval_runtime": 7.1585,
"eval_samples_per_second": 7.264,
"eval_steps_per_second": 0.978,
"step": 570
},
{
"epoch": 0.6884272997032641,
"grad_norm": 0.040969171268037584,
"learning_rate": 5.351354995538859e-05,
"loss": 0.7217,
"step": 580
},
{
"epoch": 0.6884272997032641,
"eval_loss": 0.6467857956886292,
"eval_runtime": 7.1605,
"eval_samples_per_second": 7.262,
"eval_steps_per_second": 0.978,
"step": 580
},
{
"epoch": 0.7002967359050445,
"grad_norm": 0.04326822029385217,
"learning_rate": 4.988024607236619e-05,
"loss": 0.7225,
"step": 590
},
{
"epoch": 0.7002967359050445,
"eval_loss": 0.6460662484169006,
"eval_runtime": 7.1561,
"eval_samples_per_second": 7.267,
"eval_steps_per_second": 0.978,
"step": 590
},
{
"epoch": 0.712166172106825,
"grad_norm": 0.04151648905301352,
"learning_rate": 4.633325089100289e-05,
"loss": 0.7305,
"step": 600
},
{
"epoch": 0.712166172106825,
"eval_loss": 0.6456657648086548,
"eval_runtime": 7.1693,
"eval_samples_per_second": 7.253,
"eval_steps_per_second": 0.976,
"step": 600
},
{
"epoch": 0.7240356083086054,
"grad_norm": 0.043211271741518106,
"learning_rate": 4.287867251289348e-05,
"loss": 0.7202,
"step": 610
},
{
"epoch": 0.7240356083086054,
"eval_loss": 0.6450461745262146,
"eval_runtime": 7.1607,
"eval_samples_per_second": 7.262,
"eval_steps_per_second": 0.978,
"step": 610
},
{
"epoch": 0.7359050445103857,
"grad_norm": 0.043394130216544674,
"learning_rate": 3.952245989331466e-05,
"loss": 0.7182,
"step": 620
},
{
"epoch": 0.7359050445103857,
"eval_loss": 0.6443552374839783,
"eval_runtime": 7.1601,
"eval_samples_per_second": 7.262,
"eval_steps_per_second": 0.978,
"step": 620
},
{
"epoch": 0.7477744807121661,
"grad_norm": 0.0440813623198057,
"learning_rate": 3.627039259682899e-05,
"loss": 0.7269,
"step": 630
},
{
"epoch": 0.7477744807121661,
"eval_loss": 0.6440042853355408,
"eval_runtime": 7.1528,
"eval_samples_per_second": 7.27,
"eval_steps_per_second": 0.979,
"step": 630
},
{
"epoch": 0.7596439169139466,
"grad_norm": 0.04196493736245454,
"learning_rate": 3.312807084458831e-05,
"loss": 0.7321,
"step": 640
},
{
"epoch": 0.7596439169139466,
"eval_loss": 0.6436744332313538,
"eval_runtime": 7.157,
"eval_samples_per_second": 7.266,
"eval_steps_per_second": 0.978,
"step": 640
},
{
"epoch": 0.771513353115727,
"grad_norm": 0.04409502511550107,
"learning_rate": 3.0100905870475006e-05,
"loss": 0.7127,
"step": 650
},
{
"epoch": 0.771513353115727,
"eval_loss": 0.6433111429214478,
"eval_runtime": 7.1488,
"eval_samples_per_second": 7.274,
"eval_steps_per_second": 0.979,
"step": 650
},
{
"epoch": 0.7833827893175074,
"grad_norm": 0.04613176858045456,
"learning_rate": 2.7194110602689026e-05,
"loss": 0.7084,
"step": 660
},
{
"epoch": 0.7833827893175074,
"eval_loss": 0.6427409648895264,
"eval_runtime": 7.1651,
"eval_samples_per_second": 7.257,
"eval_steps_per_second": 0.977,
"step": 660
},
{
"epoch": 0.7952522255192879,
"grad_norm": 0.04107118214802092,
"learning_rate": 2.4412690686827e-05,
"loss": 0.7318,
"step": 670
},
{
"epoch": 0.7952522255192879,
"eval_loss": 0.6424255967140198,
"eval_runtime": 7.151,
"eval_samples_per_second": 7.272,
"eval_steps_per_second": 0.979,
"step": 670
},
{
"epoch": 0.8071216617210683,
"grad_norm": 0.04076823498258544,
"learning_rate": 2.1761435865912296e-05,
"loss": 0.7111,
"step": 680
},
{
"epoch": 0.8071216617210683,
"eval_loss": 0.6420266032218933,
"eval_runtime": 7.1562,
"eval_samples_per_second": 7.266,
"eval_steps_per_second": 0.978,
"step": 680
},
{
"epoch": 0.8189910979228486,
"grad_norm": 0.0402931102905212,
"learning_rate": 1.9244911732219918e-05,
"loss": 0.7173,
"step": 690
},
{
"epoch": 0.8189910979228486,
"eval_loss": 0.6417113542556763,
"eval_runtime": 7.1559,
"eval_samples_per_second": 7.267,
"eval_steps_per_second": 0.978,
"step": 690
},
{
"epoch": 0.8308605341246291,
"grad_norm": 0.040268557762468446,
"learning_rate": 1.6867451865100414e-05,
"loss": 0.7187,
"step": 700
},
{
"epoch": 0.8308605341246291,
"eval_loss": 0.6414252519607544,
"eval_runtime": 7.1614,
"eval_samples_per_second": 7.261,
"eval_steps_per_second": 0.977,
"step": 700
},
{
"epoch": 0.8427299703264095,
"grad_norm": 0.03910974935669944,
"learning_rate": 1.4633150368341153e-05,
"loss": 0.7003,
"step": 710
},
{
"epoch": 0.8427299703264095,
"eval_loss": 0.6411222815513611,
"eval_runtime": 7.158,
"eval_samples_per_second": 7.265,
"eval_steps_per_second": 0.978,
"step": 710
},
{
"epoch": 0.8545994065281899,
"grad_norm": 0.041327420976484716,
"learning_rate": 1.2545854819916646e-05,
"loss": 0.7146,
"step": 720
},
{
"epoch": 0.8545994065281899,
"eval_loss": 0.6409789323806763,
"eval_runtime": 7.1632,
"eval_samples_per_second": 7.259,
"eval_steps_per_second": 0.977,
"step": 720
},
{
"epoch": 0.8664688427299704,
"grad_norm": 0.0415664844879509,
"learning_rate": 1.0609159646268506e-05,
"loss": 0.7326,
"step": 730
},
{
"epoch": 0.8664688427299704,
"eval_loss": 0.640900194644928,
"eval_runtime": 7.1625,
"eval_samples_per_second": 7.26,
"eval_steps_per_second": 0.977,
"step": 730
},
{
"epoch": 0.8783382789317508,
"grad_norm": 0.040948908512356046,
"learning_rate": 8.82639993252482e-06,
"loss": 0.735,
"step": 740
},
{
"epoch": 0.8783382789317508,
"eval_loss": 0.6406245827674866,
"eval_runtime": 7.1641,
"eval_samples_per_second": 7.258,
"eval_steps_per_second": 0.977,
"step": 740
},
{
"epoch": 0.8902077151335311,
"grad_norm": 0.039353242405877606,
"learning_rate": 7.20064567931813e-06,
"loss": 0.7072,
"step": 750
},
{
"epoch": 0.8902077151335311,
"eval_loss": 0.6405530571937561,
"eval_runtime": 7.1654,
"eval_samples_per_second": 7.257,
"eval_steps_per_second": 0.977,
"step": 750
},
{
"epoch": 0.9020771513353115,
"grad_norm": 0.040045103519338765,
"learning_rate": 5.734696516092253e-06,
"loss": 0.7242,
"step": 760
},
{
"epoch": 0.9020771513353115,
"eval_loss": 0.6403719186782837,
"eval_runtime": 7.1641,
"eval_samples_per_second": 7.258,
"eval_steps_per_second": 0.977,
"step": 760
},
{
"epoch": 0.913946587537092,
"grad_norm": 0.03936062921235335,
"learning_rate": 4.431076880001439e-06,
"loss": 0.725,
"step": 770
},
{
"epoch": 0.913946587537092,
"eval_loss": 0.6402689218521118,
"eval_runtime": 7.1586,
"eval_samples_per_second": 7.264,
"eval_steps_per_second": 0.978,
"step": 770
},
{
"epoch": 0.9258160237388724,
"grad_norm": 0.038795606374443216,
"learning_rate": 3.292031668704398e-06,
"loss": 0.7051,
"step": 780
},
{
"epoch": 0.9258160237388724,
"eval_loss": 0.6402180790901184,
"eval_runtime": 7.1753,
"eval_samples_per_second": 7.247,
"eval_steps_per_second": 0.976,
"step": 780
},
{
"epoch": 0.9376854599406528,
"grad_norm": 0.04044906534894689,
"learning_rate": 2.3195223745392737e-06,
"loss": 0.7187,
"step": 790
},
{
"epoch": 0.9376854599406528,
"eval_loss": 0.6401559114456177,
"eval_runtime": 7.178,
"eval_samples_per_second": 7.244,
"eval_steps_per_second": 0.975,
"step": 790
},
{
"epoch": 0.9495548961424333,
"grad_norm": 0.03933803718026143,
"learning_rate": 1.5152237067365239e-06,
"loss": 0.7185,
"step": 800
},
{
"epoch": 0.9495548961424333,
"eval_loss": 0.6400689482688904,
"eval_runtime": 7.1614,
"eval_samples_per_second": 7.261,
"eval_steps_per_second": 0.977,
"step": 800
},
{
"epoch": 0.9614243323442137,
"grad_norm": 0.03894023495076019,
"learning_rate": 8.805207074865873e-07,
"loss": 0.7213,
"step": 810
},
{
"epoch": 0.9614243323442137,
"eval_loss": 0.6400617957115173,
"eval_runtime": 7.1732,
"eval_samples_per_second": 7.249,
"eval_steps_per_second": 0.976,
"step": 810
},
{
"epoch": 0.973293768545994,
"grad_norm": 0.03930788498385229,
"learning_rate": 4.165063668285396e-07,
"loss": 0.7192,
"step": 820
},
{
"epoch": 0.973293768545994,
"eval_loss": 0.6400607228279114,
"eval_runtime": 7.1626,
"eval_samples_per_second": 7.26,
"eval_steps_per_second": 0.977,
"step": 820
},
{
"epoch": 0.9851632047477745,
"grad_norm": 0.03939532170183159,
"learning_rate": 1.2397974046707283e-07,
"loss": 0.7214,
"step": 830
},
{
"epoch": 0.9851632047477745,
"eval_loss": 0.6400620341300964,
"eval_runtime": 7.1644,
"eval_samples_per_second": 7.258,
"eval_steps_per_second": 0.977,
"step": 830
},
{
"epoch": 0.9970326409495549,
"grad_norm": 0.04057041649987638,
"learning_rate": 3.444573758937253e-09,
"loss": 0.7317,
"step": 840
},
{
"epoch": 0.9970326409495549,
"eval_loss": 0.640075147151947,
"eval_runtime": 7.1623,
"eval_samples_per_second": 7.26,
"eval_steps_per_second": 0.977,
"step": 840
},
{
"epoch": 0.9994065281899109,
"step": 842,
"total_flos": 6.615025481416704e+16,
"train_loss": 0.6960575034103031,
"train_runtime": 22091.905,
"train_samples_per_second": 2.44,
"train_steps_per_second": 0.038
}
],
"logging_steps": 10,
"max_steps": 842,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.615025481416704e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}