{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1019,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009813542688910697,
"grad_norm": 511850.5,
"learning_rate": 3.921568627450981e-07,
"loss": 3.358,
"step": 10
},
{
"epoch": 0.019627085377821395,
"grad_norm": 852402.5625,
"learning_rate": 7.843137254901962e-07,
"loss": 4.0004,
"step": 20
},
{
"epoch": 0.029440628066732092,
"grad_norm": 41.74958801269531,
"learning_rate": 1.1764705882352942e-06,
"loss": 3.1047,
"step": 30
},
{
"epoch": 0.03925417075564279,
"grad_norm": 29.402185440063477,
"learning_rate": 1.5686274509803923e-06,
"loss": 2.4535,
"step": 40
},
{
"epoch": 0.04906771344455348,
"grad_norm": 42.18159866333008,
"learning_rate": 1.96078431372549e-06,
"loss": 3.8615,
"step": 50
},
{
"epoch": 0.058881256133464184,
"grad_norm": 15.556477546691895,
"learning_rate": 2.3529411764705885e-06,
"loss": 2.6701,
"step": 60
},
{
"epoch": 0.06869479882237488,
"grad_norm": 1009053.9375,
"learning_rate": 2.7450980392156867e-06,
"loss": 2.5866,
"step": 70
},
{
"epoch": 0.07850834151128558,
"grad_norm": 3.3355813026428223,
"learning_rate": 3.1372549019607846e-06,
"loss": 2.1546,
"step": 80
},
{
"epoch": 0.08832188420019627,
"grad_norm": 6364043.5,
"learning_rate": 3.529411764705883e-06,
"loss": 1.8961,
"step": 90
},
{
"epoch": 0.09813542688910697,
"grad_norm": 10.409449577331543,
"learning_rate": 3.92156862745098e-06,
"loss": 3.8516,
"step": 100
},
{
"epoch": 0.10794896957801767,
"grad_norm": 3035116.25,
"learning_rate": 4.313725490196079e-06,
"loss": 2.656,
"step": 110
},
{
"epoch": 0.11776251226692837,
"grad_norm": 1568043.375,
"learning_rate": 4.705882352941177e-06,
"loss": 2.5056,
"step": 120
},
{
"epoch": 0.12757605495583907,
"grad_norm": 2.928816795349121,
"learning_rate": 5.098039215686274e-06,
"loss": 1.4711,
"step": 130
},
{
"epoch": 0.13738959764474976,
"grad_norm": 2664501.25,
"learning_rate": 5.4901960784313735e-06,
"loss": 2.4145,
"step": 140
},
{
"epoch": 0.14720314033366044,
"grad_norm": 5433292.0,
"learning_rate": 5.882352941176471e-06,
"loss": 2.5401,
"step": 150
},
{
"epoch": 0.15701668302257116,
"grad_norm": 2.9648523330688477,
"learning_rate": 6.274509803921569e-06,
"loss": 1.1487,
"step": 160
},
{
"epoch": 0.16683022571148184,
"grad_norm": 3384641.25,
"learning_rate": 6.666666666666667e-06,
"loss": 1.5521,
"step": 170
},
{
"epoch": 0.17664376840039253,
"grad_norm": 2175554.5,
"learning_rate": 7.058823529411766e-06,
"loss": 2.2009,
"step": 180
},
{
"epoch": 0.18645731108930325,
"grad_norm": 4.447290897369385,
"learning_rate": 7.450980392156863e-06,
"loss": 1.6716,
"step": 190
},
{
"epoch": 0.19627085377821393,
"grad_norm": 2.5263493061065674,
"learning_rate": 7.84313725490196e-06,
"loss": 3.2454,
"step": 200
},
{
"epoch": 0.20608439646712462,
"grad_norm": 772904.75,
"learning_rate": 8.23529411764706e-06,
"loss": 1.622,
"step": 210
},
{
"epoch": 0.21589793915603533,
"grad_norm": 3.8818535804748535,
"learning_rate": 8.627450980392157e-06,
"loss": 2.1061,
"step": 220
},
{
"epoch": 0.22571148184494602,
"grad_norm": 4.158895015716553,
"learning_rate": 9.019607843137256e-06,
"loss": 1.7532,
"step": 230
},
{
"epoch": 0.23552502453385674,
"grad_norm": 4.066924571990967,
"learning_rate": 9.411764705882354e-06,
"loss": 1.6545,
"step": 240
},
{
"epoch": 0.24533856722276742,
"grad_norm": 1732917.625,
"learning_rate": 9.803921568627451e-06,
"loss": 2.3048,
"step": 250
},
{
"epoch": 0.25515210991167814,
"grad_norm": 221879.953125,
"learning_rate": 1e-05,
"loss": 1.3068,
"step": 260
},
{
"epoch": 0.2649656526005888,
"grad_norm": 859696.125,
"learning_rate": 1e-05,
"loss": 1.7565,
"step": 270
},
{
"epoch": 0.2747791952894995,
"grad_norm": 3.6436195373535156,
"learning_rate": 1e-05,
"loss": 2.1635,
"step": 280
},
{
"epoch": 0.2845927379784102,
"grad_norm": 5.748630046844482,
"learning_rate": 1e-05,
"loss": 1.678,
"step": 290
},
{
"epoch": 0.2944062806673209,
"grad_norm": 493996.46875,
"learning_rate": 1e-05,
"loss": 2.1511,
"step": 300
},
{
"epoch": 0.3042198233562316,
"grad_norm": 9.792704582214355,
"learning_rate": 1e-05,
"loss": 2.6151,
"step": 310
},
{
"epoch": 0.3140333660451423,
"grad_norm": 1645629.125,
"learning_rate": 1e-05,
"loss": 1.4831,
"step": 320
},
{
"epoch": 0.323846908734053,
"grad_norm": 4.4405131340026855,
"learning_rate": 1e-05,
"loss": 2.7356,
"step": 330
},
{
"epoch": 0.3336604514229637,
"grad_norm": 8.055213928222656,
"learning_rate": 1e-05,
"loss": 1.9915,
"step": 340
},
{
"epoch": 0.3434739941118744,
"grad_norm": 604711.25,
"learning_rate": 1e-05,
"loss": 1.7425,
"step": 350
},
{
"epoch": 0.35328753680078506,
"grad_norm": 8.440975189208984,
"learning_rate": 1e-05,
"loss": 1.5921,
"step": 360
},
{
"epoch": 0.3631010794896958,
"grad_norm": 1266034.375,
"learning_rate": 1e-05,
"loss": 1.0832,
"step": 370
},
{
"epoch": 0.3729146221786065,
"grad_norm": 3.2711005210876465,
"learning_rate": 1e-05,
"loss": 1.0002,
"step": 380
},
{
"epoch": 0.38272816486751715,
"grad_norm": 1738446.375,
"learning_rate": 1e-05,
"loss": 1.7899,
"step": 390
},
{
"epoch": 0.39254170755642787,
"grad_norm": 10.065378189086914,
"learning_rate": 1e-05,
"loss": 1.942,
"step": 400
},
{
"epoch": 0.4023552502453386,
"grad_norm": 839358.625,
"learning_rate": 1e-05,
"loss": 2.2337,
"step": 410
},
{
"epoch": 0.41216879293424924,
"grad_norm": 820795.5,
"learning_rate": 1e-05,
"loss": 2.4434,
"step": 420
},
{
"epoch": 0.42198233562315995,
"grad_norm": 760894.875,
"learning_rate": 1e-05,
"loss": 1.332,
"step": 430
},
{
"epoch": 0.43179587831207067,
"grad_norm": 465132.90625,
"learning_rate": 1e-05,
"loss": 2.1238,
"step": 440
},
{
"epoch": 0.44160942100098133,
"grad_norm": 151798.609375,
"learning_rate": 1e-05,
"loss": 2.0838,
"step": 450
},
{
"epoch": 0.45142296368989204,
"grad_norm": 3.6023194789886475,
"learning_rate": 1e-05,
"loss": 1.5318,
"step": 460
},
{
"epoch": 0.46123650637880276,
"grad_norm": 3.711779832839966,
"learning_rate": 1e-05,
"loss": 1.8415,
"step": 470
},
{
"epoch": 0.47105004906771347,
"grad_norm": 3.6837337017059326,
"learning_rate": 1e-05,
"loss": 2.2333,
"step": 480
},
{
"epoch": 0.48086359175662413,
"grad_norm": 2.7638938426971436,
"learning_rate": 1e-05,
"loss": 1.612,
"step": 490
},
{
"epoch": 0.49067713444553485,
"grad_norm": 2.2806527614593506,
"learning_rate": 1e-05,
"loss": 2.4336,
"step": 500
},
{
"epoch": 0.5004906771344455,
"grad_norm": 2.6325523853302,
"learning_rate": 1e-05,
"loss": 2.3051,
"step": 510
},
{
"epoch": 0.5103042198233563,
"grad_norm": 4.162623882293701,
"learning_rate": 1e-05,
"loss": 2.5193,
"step": 520
},
{
"epoch": 0.5201177625122669,
"grad_norm": 3.865851879119873,
"learning_rate": 1e-05,
"loss": 2.1113,
"step": 530
},
{
"epoch": 0.5299313052011776,
"grad_norm": 3.6652672290802,
"learning_rate": 1e-05,
"loss": 2.605,
"step": 540
},
{
"epoch": 0.5397448478900884,
"grad_norm": 1123418.0,
"learning_rate": 1e-05,
"loss": 2.367,
"step": 550
},
{
"epoch": 0.549558390578999,
"grad_norm": 3.206057071685791,
"learning_rate": 1e-05,
"loss": 0.9706,
"step": 560
},
{
"epoch": 0.5593719332679097,
"grad_norm": 3.8300833702087402,
"learning_rate": 1e-05,
"loss": 1.6688,
"step": 570
},
{
"epoch": 0.5691854759568205,
"grad_norm": 3.4160726070404053,
"learning_rate": 1e-05,
"loss": 1.8959,
"step": 580
},
{
"epoch": 0.5789990186457311,
"grad_norm": 6.991641044616699,
"learning_rate": 1e-05,
"loss": 2.8449,
"step": 590
},
{
"epoch": 0.5888125613346418,
"grad_norm": 3.89111065864563,
"learning_rate": 1e-05,
"loss": 2.8364,
"step": 600
},
{
"epoch": 0.5986261040235525,
"grad_norm": 12.52274227142334,
"learning_rate": 1e-05,
"loss": 2.3841,
"step": 610
},
{
"epoch": 0.6084396467124632,
"grad_norm": 1124655.25,
"learning_rate": 1e-05,
"loss": 2.8931,
"step": 620
},
{
"epoch": 0.6182531894013739,
"grad_norm": 2132181.75,
"learning_rate": 1e-05,
"loss": 1.8265,
"step": 630
},
{
"epoch": 0.6280667320902846,
"grad_norm": 3.21681547164917,
"learning_rate": 1e-05,
"loss": 0.8137,
"step": 640
},
{
"epoch": 0.6378802747791953,
"grad_norm": 1385230.375,
"learning_rate": 1e-05,
"loss": 1.2742,
"step": 650
},
{
"epoch": 0.647693817468106,
"grad_norm": 10.80539321899414,
"learning_rate": 1e-05,
"loss": 3.0502,
"step": 660
},
{
"epoch": 0.6575073601570167,
"grad_norm": 1592570.0,
"learning_rate": 1e-05,
"loss": 1.9121,
"step": 670
},
{
"epoch": 0.6673209028459274,
"grad_norm": 985591.5625,
"learning_rate": 1e-05,
"loss": 1.8159,
"step": 680
},
{
"epoch": 0.677134445534838,
"grad_norm": 1119573.375,
"learning_rate": 1e-05,
"loss": 1.9695,
"step": 690
},
{
"epoch": 0.6869479882237488,
"grad_norm": 3.928929090499878,
"learning_rate": 1e-05,
"loss": 2.1545,
"step": 700
},
{
"epoch": 0.6967615309126595,
"grad_norm": 998297.4375,
"learning_rate": 1e-05,
"loss": 1.2963,
"step": 710
},
{
"epoch": 0.7065750736015701,
"grad_norm": 3.8201591968536377,
"learning_rate": 1e-05,
"loss": 0.9735,
"step": 720
},
{
"epoch": 0.7163886162904809,
"grad_norm": 3.7799386978149414,
"learning_rate": 1e-05,
"loss": 1.5274,
"step": 730
},
{
"epoch": 0.7262021589793916,
"grad_norm": 3.718870162963867,
"learning_rate": 1e-05,
"loss": 2.9676,
"step": 740
},
{
"epoch": 0.7360157016683022,
"grad_norm": 4.023947715759277,
"learning_rate": 1e-05,
"loss": 1.3345,
"step": 750
},
{
"epoch": 0.745829244357213,
"grad_norm": 14.283628463745117,
"learning_rate": 1e-05,
"loss": 2.7141,
"step": 760
},
{
"epoch": 0.7556427870461236,
"grad_norm": 3178157.25,
"learning_rate": 1e-05,
"loss": 0.9265,
"step": 770
},
{
"epoch": 0.7654563297350343,
"grad_norm": 3.6791253089904785,
"learning_rate": 1e-05,
"loss": 1.8104,
"step": 780
},
{
"epoch": 0.7752698724239451,
"grad_norm": 4302724.5,
"learning_rate": 1e-05,
"loss": 1.4787,
"step": 790
},
{
"epoch": 0.7850834151128557,
"grad_norm": 1720963.75,
"learning_rate": 1e-05,
"loss": 2.1176,
"step": 800
},
{
"epoch": 0.7948969578017664,
"grad_norm": 1612358.875,
"learning_rate": 1e-05,
"loss": 1.2736,
"step": 810
},
{
"epoch": 0.8047105004906772,
"grad_norm": 1152146.25,
"learning_rate": 1e-05,
"loss": 1.5657,
"step": 820
},
{
"epoch": 0.8145240431795878,
"grad_norm": 3.5905027389526367,
"learning_rate": 1e-05,
"loss": 2.6198,
"step": 830
},
{
"epoch": 0.8243375858684985,
"grad_norm": 736680.8125,
"learning_rate": 1e-05,
"loss": 0.9112,
"step": 840
},
{
"epoch": 0.8341511285574092,
"grad_norm": 2.9653732776641846,
"learning_rate": 1e-05,
"loss": 2.3842,
"step": 850
},
{
"epoch": 0.8439646712463199,
"grad_norm": 12.001425743103027,
"learning_rate": 1e-05,
"loss": 2.3966,
"step": 860
},
{
"epoch": 0.8537782139352306,
"grad_norm": 2124122.25,
"learning_rate": 1e-05,
"loss": 1.3734,
"step": 870
},
{
"epoch": 0.8635917566241413,
"grad_norm": 6534144.0,
"learning_rate": 1e-05,
"loss": 1.3486,
"step": 880
},
{
"epoch": 0.873405299313052,
"grad_norm": 3.6779091358184814,
"learning_rate": 1e-05,
"loss": 0.949,
"step": 890
},
{
"epoch": 0.8832188420019627,
"grad_norm": 1221940.0,
"learning_rate": 1e-05,
"loss": 2.6138,
"step": 900
},
{
"epoch": 0.8930323846908734,
"grad_norm": 1095478.5,
"learning_rate": 1e-05,
"loss": 1.4675,
"step": 910
},
{
"epoch": 0.9028459273797841,
"grad_norm": 548933.875,
"learning_rate": 1e-05,
"loss": 2.8343,
"step": 920
},
{
"epoch": 0.9126594700686947,
"grad_norm": 13.783559799194336,
"learning_rate": 1e-05,
"loss": 2.1122,
"step": 930
},
{
"epoch": 0.9224730127576055,
"grad_norm": 13.174997329711914,
"learning_rate": 1e-05,
"loss": 2.4962,
"step": 940
},
{
"epoch": 0.9322865554465162,
"grad_norm": 10.191123962402344,
"learning_rate": 1e-05,
"loss": 2.2086,
"step": 950
},
{
"epoch": 0.9421000981354269,
"grad_norm": 3.606752872467041,
"learning_rate": 1e-05,
"loss": 1.323,
"step": 960
},
{
"epoch": 0.9519136408243376,
"grad_norm": 2473294.0,
"learning_rate": 1e-05,
"loss": 1.0528,
"step": 970
},
{
"epoch": 0.9617271835132483,
"grad_norm": 2.848081588745117,
"learning_rate": 1e-05,
"loss": 1.5576,
"step": 980
},
{
"epoch": 0.971540726202159,
"grad_norm": 3.5542256832122803,
"learning_rate": 1e-05,
"loss": 1.8997,
"step": 990
},
{
"epoch": 0.9813542688910697,
"grad_norm": 1991637.375,
"learning_rate": 1e-05,
"loss": 2.5923,
"step": 1000
},
{
"epoch": 0.9911678115799804,
"grad_norm": 21.8354434967041,
"learning_rate": 1e-05,
"loss": 2.0656,
"step": 1010
}
],
"logging_steps": 10,
"max_steps": 1019,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 255,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}