{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 50.0,
"eval_steps": 500,
"global_step": 1700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.14705882352941177,
"grad_norm": 0.14238034188747406,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.8446,
"step": 5
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.14489340782165527,
"learning_rate": 5.882352941176471e-06,
"loss": 0.8425,
"step": 10
},
{
"epoch": 0.4411764705882353,
"grad_norm": 0.1551039218902588,
"learning_rate": 8.823529411764707e-06,
"loss": 0.8918,
"step": 15
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.15667259693145752,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.8473,
"step": 20
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.14174659550189972,
"learning_rate": 1.4705882352941177e-05,
"loss": 0.8519,
"step": 25
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.15220147371292114,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.8201,
"step": 30
},
{
"epoch": 1.0294117647058822,
"grad_norm": 0.14088748395442963,
"learning_rate": 2.058823529411765e-05,
"loss": 0.8345,
"step": 35
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.1412580907344818,
"learning_rate": 2.3529411764705884e-05,
"loss": 0.8549,
"step": 40
},
{
"epoch": 1.3235294117647058,
"grad_norm": 0.12632407248020172,
"learning_rate": 2.647058823529412e-05,
"loss": 0.8431,
"step": 45
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.11696045845746994,
"learning_rate": 2.9411764705882354e-05,
"loss": 0.8157,
"step": 50
},
{
"epoch": 1.6176470588235294,
"grad_norm": 0.1272636502981186,
"learning_rate": 3.235294117647059e-05,
"loss": 0.8236,
"step": 55
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.10813312977552414,
"learning_rate": 3.529411764705883e-05,
"loss": 0.8019,
"step": 60
},
{
"epoch": 1.9117647058823528,
"grad_norm": 0.11896699666976929,
"learning_rate": 3.8235294117647055e-05,
"loss": 0.8185,
"step": 65
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.10265181213617325,
"learning_rate": 4.11764705882353e-05,
"loss": 0.8044,
"step": 70
},
{
"epoch": 2.2058823529411766,
"grad_norm": 0.10106126219034195,
"learning_rate": 4.411764705882353e-05,
"loss": 0.7858,
"step": 75
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.10131911188364029,
"learning_rate": 4.705882352941177e-05,
"loss": 0.8084,
"step": 80
},
{
"epoch": 2.5,
"grad_norm": 0.09996718913316727,
"learning_rate": 5e-05,
"loss": 0.814,
"step": 85
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.0959576964378357,
"learning_rate": 4.999893574965545e-05,
"loss": 0.8105,
"step": 90
},
{
"epoch": 2.7941176470588234,
"grad_norm": 0.11985825002193451,
"learning_rate": 4.9995743099299886e-05,
"loss": 0.7813,
"step": 95
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.09690634161233902,
"learning_rate": 4.9990422350958156e-05,
"loss": 0.7917,
"step": 100
},
{
"epoch": 3.088235294117647,
"grad_norm": 0.11959923803806305,
"learning_rate": 4.99829740079732e-05,
"loss": 0.7531,
"step": 105
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.09686623513698578,
"learning_rate": 4.99733987749585e-05,
"loss": 0.783,
"step": 110
},
{
"epoch": 3.3823529411764706,
"grad_norm": 0.0896935760974884,
"learning_rate": 4.996169755773138e-05,
"loss": 0.7506,
"step": 115
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.09192899614572525,
"learning_rate": 4.9947871463227374e-05,
"loss": 0.7597,
"step": 120
},
{
"epoch": 3.6764705882352944,
"grad_norm": 0.09252317994832993,
"learning_rate": 4.993192179939542e-05,
"loss": 0.767,
"step": 125
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.09957632422447205,
"learning_rate": 4.991385007507422e-05,
"loss": 0.8006,
"step": 130
},
{
"epoch": 3.9705882352941178,
"grad_norm": 0.12217137217521667,
"learning_rate": 4.989365799984943e-05,
"loss": 0.7513,
"step": 135
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.09010059386491776,
"learning_rate": 4.9871347483892006e-05,
"loss": 0.7531,
"step": 140
},
{
"epoch": 4.264705882352941,
"grad_norm": 0.09149183332920074,
"learning_rate": 4.984692063777743e-05,
"loss": 0.7518,
"step": 145
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.09295113384723663,
"learning_rate": 4.9820379772286095e-05,
"loss": 0.7665,
"step": 150
},
{
"epoch": 4.5588235294117645,
"grad_norm": 0.09854214638471603,
"learning_rate": 4.979172739818469e-05,
"loss": 0.7739,
"step": 155
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.09157629311084747,
"learning_rate": 4.9760966225988675e-05,
"loss": 0.7522,
"step": 160
},
{
"epoch": 4.852941176470588,
"grad_norm": 0.10833761096000671,
"learning_rate": 4.9728099165705895e-05,
"loss": 0.7605,
"step": 165
},
{
"epoch": 5.0,
"grad_norm": 0.09924355149269104,
"learning_rate": 4.9693129326561254e-05,
"loss": 0.7153,
"step": 170
},
{
"epoch": 5.147058823529412,
"grad_norm": 0.09313185513019562,
"learning_rate": 4.9656060016702606e-05,
"loss": 0.7494,
"step": 175
},
{
"epoch": 5.294117647058823,
"grad_norm": 0.11171400547027588,
"learning_rate": 4.961689474288779e-05,
"loss": 0.733,
"step": 180
},
{
"epoch": 5.4411764705882355,
"grad_norm": 0.09828388690948486,
"learning_rate": 4.957563721015293e-05,
"loss": 0.7663,
"step": 185
},
{
"epoch": 5.588235294117647,
"grad_norm": 0.09972433745861053,
"learning_rate": 4.953229132146186e-05,
"loss": 0.7576,
"step": 190
},
{
"epoch": 5.735294117647059,
"grad_norm": 0.11432339251041412,
"learning_rate": 4.948686117733699e-05,
"loss": 0.7379,
"step": 195
},
{
"epoch": 5.882352941176471,
"grad_norm": 0.10343588888645172,
"learning_rate": 4.9439351075471346e-05,
"loss": 0.7066,
"step": 200
},
{
"epoch": 6.029411764705882,
"grad_norm": 0.0964265912771225,
"learning_rate": 4.9389765510322026e-05,
"loss": 0.7322,
"step": 205
},
{
"epoch": 6.176470588235294,
"grad_norm": 0.11457476019859314,
"learning_rate": 4.9338109172685006e-05,
"loss": 0.742,
"step": 210
},
{
"epoch": 6.323529411764706,
"grad_norm": 0.10812544822692871,
"learning_rate": 4.92843869492514e-05,
"loss": 0.7572,
"step": 215
},
{
"epoch": 6.470588235294118,
"grad_norm": 0.1057206243276596,
"learning_rate": 4.9228603922145206e-05,
"loss": 0.7342,
"step": 220
},
{
"epoch": 6.617647058823529,
"grad_norm": 0.11412467062473297,
"learning_rate": 4.917076536844248e-05,
"loss": 0.7331,
"step": 225
},
{
"epoch": 6.764705882352941,
"grad_norm": 0.11059483885765076,
"learning_rate": 4.9110876759672184e-05,
"loss": 0.718,
"step": 230
},
{
"epoch": 6.911764705882353,
"grad_norm": 0.10819140076637268,
"learning_rate": 4.9048943761298544e-05,
"loss": 0.7153,
"step": 235
},
{
"epoch": 7.0588235294117645,
"grad_norm": 0.11002287268638611,
"learning_rate": 4.89849722321851e-05,
"loss": 0.7201,
"step": 240
},
{
"epoch": 7.205882352941177,
"grad_norm": 0.11289830505847931,
"learning_rate": 4.891896822404046e-05,
"loss": 0.7261,
"step": 245
},
{
"epoch": 7.352941176470588,
"grad_norm": 0.12590822577476501,
"learning_rate": 4.885093798084583e-05,
"loss": 0.7329,
"step": 250
},
{
"epoch": 7.5,
"grad_norm": 0.10964758694171906,
"learning_rate": 4.878088793826428e-05,
"loss": 0.7413,
"step": 255
},
{
"epoch": 7.647058823529412,
"grad_norm": 0.10680090636014938,
"learning_rate": 4.8708824723031995e-05,
"loss": 0.7174,
"step": 260
},
{
"epoch": 7.794117647058823,
"grad_norm": 0.10832036286592484,
"learning_rate": 4.8634755152331355e-05,
"loss": 0.7345,
"step": 265
},
{
"epoch": 7.9411764705882355,
"grad_norm": 0.10789214819669724,
"learning_rate": 4.8558686233145996e-05,
"loss": 0.7213,
"step": 270
},
{
"epoch": 8.088235294117647,
"grad_norm": 0.11145245283842087,
"learning_rate": 4.8480625161598e-05,
"loss": 0.7184,
"step": 275
},
{
"epoch": 8.235294117647058,
"grad_norm": 0.12131233513355255,
"learning_rate": 4.840057932226715e-05,
"loss": 0.737,
"step": 280
},
{
"epoch": 8.382352941176471,
"grad_norm": 0.11897268146276474,
"learning_rate": 4.831855628749228e-05,
"loss": 0.7254,
"step": 285
},
{
"epoch": 8.529411764705882,
"grad_norm": 0.1130763441324234,
"learning_rate": 4.823456381665501e-05,
"loss": 0.7213,
"step": 290
},
{
"epoch": 8.676470588235293,
"grad_norm": 0.11835259944200516,
"learning_rate": 4.8148609855445624e-05,
"loss": 0.7102,
"step": 295
},
{
"epoch": 8.823529411764707,
"grad_norm": 0.12309901416301727,
"learning_rate": 4.806070253511151e-05,
"loss": 0.7227,
"step": 300
},
{
"epoch": 8.970588235294118,
"grad_norm": 0.11361519992351532,
"learning_rate": 4.797085017168787e-05,
"loss": 0.7125,
"step": 305
},
{
"epoch": 9.117647058823529,
"grad_norm": 0.13154913485050201,
"learning_rate": 4.7879061265211e-05,
"loss": 0.7293,
"step": 310
},
{
"epoch": 9.264705882352942,
"grad_norm": 0.12245271354913712,
"learning_rate": 4.778534449891428e-05,
"loss": 0.7216,
"step": 315
},
{
"epoch": 9.411764705882353,
"grad_norm": 0.10899204015731812,
"learning_rate": 4.768970873840669e-05,
"loss": 0.706,
"step": 320
},
{
"epoch": 9.558823529411764,
"grad_norm": 0.1145118996500969,
"learning_rate": 4.75921630308341e-05,
"loss": 0.7039,
"step": 325
},
{
"epoch": 9.705882352941176,
"grad_norm": 0.12111522257328033,
"learning_rate": 4.749271660402341e-05,
"loss": 0.7359,
"step": 330
},
{
"epoch": 9.852941176470589,
"grad_norm": 0.11270228773355484,
"learning_rate": 4.739137886560966e-05,
"loss": 0.7006,
"step": 335
},
{
"epoch": 10.0,
"grad_norm": 0.10989291220903397,
"learning_rate": 4.7288159402146e-05,
"loss": 0.7123,
"step": 340
},
{
"epoch": 10.147058823529411,
"grad_norm": 0.11979430168867111,
"learning_rate": 4.7183067978196855e-05,
"loss": 0.7213,
"step": 345
},
{
"epoch": 10.294117647058824,
"grad_norm": 0.11735141277313232,
"learning_rate": 4.707611453541412e-05,
"loss": 0.7061,
"step": 350
},
{
"epoch": 10.441176470588236,
"grad_norm": 0.12181384861469269,
"learning_rate": 4.696730919159677e-05,
"loss": 0.6962,
"step": 355
},
{
"epoch": 10.588235294117647,
"grad_norm": 0.11275137960910797,
"learning_rate": 4.6856662239733666e-05,
"loss": 0.7467,
"step": 360
},
{
"epoch": 10.735294117647058,
"grad_norm": 0.13028523325920105,
"learning_rate": 4.674418414702985e-05,
"loss": 0.7047,
"step": 365
},
{
"epoch": 10.882352941176471,
"grad_norm": 0.12034178525209427,
"learning_rate": 4.662988555391632e-05,
"loss": 0.7061,
"step": 370
},
{
"epoch": 11.029411764705882,
"grad_norm": 0.11595606803894043,
"learning_rate": 4.6513777273043495e-05,
"loss": 0.7023,
"step": 375
},
{
"epoch": 11.176470588235293,
"grad_norm": 0.11920719593763351,
"learning_rate": 4.63958702882583e-05,
"loss": 0.6886,
"step": 380
},
{
"epoch": 11.323529411764707,
"grad_norm": 0.12535597383975983,
"learning_rate": 4.6276175753565105e-05,
"loss": 0.7209,
"step": 385
},
{
"epoch": 11.470588235294118,
"grad_norm": 0.12857039272785187,
"learning_rate": 4.615470499207056e-05,
"loss": 0.7018,
"step": 390
},
{
"epoch": 11.617647058823529,
"grad_norm": 0.13531994819641113,
"learning_rate": 4.6031469494912416e-05,
"loss": 0.7145,
"step": 395
},
{
"epoch": 11.764705882352942,
"grad_norm": 0.10658453404903412,
"learning_rate": 4.59064809201725e-05,
"loss": 0.723,
"step": 400
},
{
"epoch": 11.911764705882353,
"grad_norm": 0.12011521309614182,
"learning_rate": 4.5779751091773774e-05,
"loss": 0.7011,
"step": 405
},
{
"epoch": 12.058823529411764,
"grad_norm": 0.11478458344936371,
"learning_rate": 4.5651291998361926e-05,
"loss": 0.7117,
"step": 410
},
{
"epoch": 12.205882352941176,
"grad_norm": 0.1333089917898178,
"learning_rate": 4.55211157921711e-05,
"loss": 0.7148,
"step": 415
},
{
"epoch": 12.352941176470589,
"grad_norm": 0.11957768350839615,
"learning_rate": 4.538923478787439e-05,
"loss": 0.7049,
"step": 420
},
{
"epoch": 12.5,
"grad_norm": 0.11590797454118729,
"learning_rate": 4.5255661461418854e-05,
"loss": 0.6797,
"step": 425
},
{
"epoch": 12.647058823529411,
"grad_norm": 0.12927260994911194,
"learning_rate": 4.5120408448845264e-05,
"loss": 0.7126,
"step": 430
},
{
"epoch": 12.794117647058824,
"grad_norm": 0.13119827210903168,
"learning_rate": 4.4983488545092753e-05,
"loss": 0.7082,
"step": 435
},
{
"epoch": 12.941176470588236,
"grad_norm": 0.1294083297252655,
"learning_rate": 4.4844914702788386e-05,
"loss": 0.699,
"step": 440
},
{
"epoch": 13.088235294117647,
"grad_norm": 0.11839265376329422,
"learning_rate": 4.470470003102192e-05,
"loss": 0.71,
"step": 445
},
{
"epoch": 13.235294117647058,
"grad_norm": 0.12290767580270767,
"learning_rate": 4.456285779410558e-05,
"loss": 0.7058,
"step": 450
},
{
"epoch": 13.382352941176471,
"grad_norm": 0.12060663849115372,
"learning_rate": 4.4419401410319334e-05,
"loss": 0.6744,
"step": 455
},
{
"epoch": 13.529411764705882,
"grad_norm": 0.12197393923997879,
"learning_rate": 4.427434445064148e-05,
"loss": 0.6919,
"step": 460
},
{
"epoch": 13.676470588235293,
"grad_norm": 0.12659871578216553,
"learning_rate": 4.4127700637464834e-05,
"loss": 0.7102,
"step": 465
},
{
"epoch": 13.823529411764707,
"grad_norm": 0.12616273760795593,
"learning_rate": 4.3979483843298624e-05,
"loss": 0.6924,
"step": 470
},
{
"epoch": 13.970588235294118,
"grad_norm": 0.1336318999528885,
"learning_rate": 4.382970808945612e-05,
"loss": 0.7248,
"step": 475
},
{
"epoch": 14.117647058823529,
"grad_norm": 0.12975798547267914,
"learning_rate": 4.367838754472821e-05,
"loss": 0.7266,
"step": 480
},
{
"epoch": 14.264705882352942,
"grad_norm": 0.1267329454421997,
"learning_rate": 4.3525536524043076e-05,
"loss": 0.7028,
"step": 485
},
{
"epoch": 14.411764705882353,
"grad_norm": 0.12307338416576385,
"learning_rate": 4.337116948711195e-05,
"loss": 0.7052,
"step": 490
},
{
"epoch": 14.558823529411764,
"grad_norm": 0.14381247758865356,
"learning_rate": 4.3215301037061244e-05,
"loss": 0.6947,
"step": 495
},
{
"epoch": 14.705882352941176,
"grad_norm": 0.11929916590452194,
"learning_rate": 4.305794591905113e-05,
"loss": 0.691,
"step": 500
},
{
"epoch": 14.852941176470589,
"grad_norm": 0.12451142817735672,
"learning_rate": 4.289911901888056e-05,
"loss": 0.6859,
"step": 505
},
{
"epoch": 15.0,
"grad_norm": 0.12542015314102173,
"learning_rate": 4.2738835361579175e-05,
"loss": 0.7139,
"step": 510
},
{
"epoch": 15.147058823529411,
"grad_norm": 0.11630310118198395,
"learning_rate": 4.257711010998586e-05,
"loss": 0.705,
"step": 515
},
{
"epoch": 15.294117647058824,
"grad_norm": 0.13719907402992249,
"learning_rate": 4.241395856331437e-05,
"loss": 0.7001,
"step": 520
},
{
"epoch": 15.441176470588236,
"grad_norm": 0.13168473541736603,
"learning_rate": 4.224939615570602e-05,
"loss": 0.7047,
"step": 525
},
{
"epoch": 15.588235294117647,
"grad_norm": 0.11908990889787674,
"learning_rate": 4.2083438454769606e-05,
"loss": 0.7086,
"step": 530
},
{
"epoch": 15.735294117647058,
"grad_norm": 0.12613283097743988,
"learning_rate": 4.1916101160108715e-05,
"loss": 0.6911,
"step": 535
},
{
"epoch": 15.882352941176471,
"grad_norm": 0.12481500208377838,
"learning_rate": 4.174740010183656e-05,
"loss": 0.6845,
"step": 540
},
{
"epoch": 16.029411764705884,
"grad_norm": 0.1311793029308319,
"learning_rate": 4.15773512390784e-05,
"loss": 0.6976,
"step": 545
},
{
"epoch": 16.176470588235293,
"grad_norm": 0.11820737272500992,
"learning_rate": 4.140597065846188e-05,
"loss": 0.7101,
"step": 550
},
{
"epoch": 16.323529411764707,
"grad_norm": 0.12259554117918015,
"learning_rate": 4.123327457259517e-05,
"loss": 0.6973,
"step": 555
},
{
"epoch": 16.470588235294116,
"grad_norm": 0.12584823369979858,
"learning_rate": 4.105927931853327e-05,
"loss": 0.6903,
"step": 560
},
{
"epoch": 16.61764705882353,
"grad_norm": 0.13863661885261536,
"learning_rate": 4.088400135623256e-05,
"loss": 0.6726,
"step": 565
},
{
"epoch": 16.764705882352942,
"grad_norm": 0.13399486243724823,
"learning_rate": 4.070745726699363e-05,
"loss": 0.6977,
"step": 570
},
{
"epoch": 16.91176470588235,
"grad_norm": 0.1234726756811142,
"learning_rate": 4.0529663751892734e-05,
"loss": 0.6907,
"step": 575
},
{
"epoch": 17.058823529411764,
"grad_norm": 0.11935710906982422,
"learning_rate": 4.035063763020185e-05,
"loss": 0.7128,
"step": 580
},
{
"epoch": 17.205882352941178,
"grad_norm": 0.12929606437683105,
"learning_rate": 4.017039583779756e-05,
"loss": 0.7106,
"step": 585
},
{
"epoch": 17.352941176470587,
"grad_norm": 0.1248982772231102,
"learning_rate": 3.9988955425558965e-05,
"loss": 0.6897,
"step": 590
},
{
"epoch": 17.5,
"grad_norm": 0.1281740367412567,
"learning_rate": 3.980633355775461e-05,
"loss": 0.6871,
"step": 595
},
{
"epoch": 17.647058823529413,
"grad_norm": 0.12564094364643097,
"learning_rate": 3.962254751041877e-05,
"loss": 0.7008,
"step": 600
},
{
"epoch": 17.794117647058822,
"grad_norm": 0.1336313784122467,
"learning_rate": 3.943761466971717e-05,
"loss": 0.6851,
"step": 605
},
{
"epoch": 17.941176470588236,
"grad_norm": 0.1354963481426239,
"learning_rate": 3.9251552530302206e-05,
"loss": 0.6951,
"step": 610
},
{
"epoch": 18.08823529411765,
"grad_norm": 0.15230301022529602,
"learning_rate": 3.906437869365795e-05,
"loss": 0.693,
"step": 615
},
{
"epoch": 18.235294117647058,
"grad_norm": 0.1296202540397644,
"learning_rate": 3.887611086643508e-05,
"loss": 0.6874,
"step": 620
},
{
"epoch": 18.38235294117647,
"grad_norm": 0.13370412588119507,
"learning_rate": 3.8686766858775843e-05,
"loss": 0.7085,
"step": 625
},
{
"epoch": 18.529411764705884,
"grad_norm": 0.13132105767726898,
"learning_rate": 3.849636458262913e-05,
"loss": 0.7037,
"step": 630
},
{
"epoch": 18.676470588235293,
"grad_norm": 0.13370752334594727,
"learning_rate": 3.830492205005612e-05,
"loss": 0.679,
"step": 635
},
{
"epoch": 18.823529411764707,
"grad_norm": 0.12619805335998535,
"learning_rate": 3.811245737152624e-05,
"loss": 0.6846,
"step": 640
},
{
"epoch": 18.970588235294116,
"grad_norm": 0.13043031096458435,
"learning_rate": 3.7918988754203985e-05,
"loss": 0.6729,
"step": 645
},
{
"epoch": 19.11764705882353,
"grad_norm": 0.132464200258255,
"learning_rate": 3.772453450022649e-05,
"loss": 0.7112,
"step": 650
},
{
"epoch": 19.264705882352942,
"grad_norm": 0.1268807351589203,
"learning_rate": 3.752911300497212e-05,
"loss": 0.6804,
"step": 655
},
{
"epoch": 19.41176470588235,
"grad_norm": 0.14288154244422913,
"learning_rate": 3.73327427553203e-05,
"loss": 0.6867,
"step": 660
},
{
"epoch": 19.558823529411764,
"grad_norm": 0.14849698543548584,
"learning_rate": 3.7135442327902695e-05,
"loss": 0.694,
"step": 665
},
{
"epoch": 19.705882352941178,
"grad_norm": 0.12607896327972412,
"learning_rate": 3.6937230387345746e-05,
"loss": 0.6873,
"step": 670
},
{
"epoch": 19.852941176470587,
"grad_norm": 0.12860074639320374,
"learning_rate": 3.673812568450513e-05,
"loss": 0.6942,
"step": 675
},
{
"epoch": 20.0,
"grad_norm": 0.12468240410089493,
"learning_rate": 3.6538147054691817e-05,
"loss": 0.6844,
"step": 680
},
{
"epoch": 20.147058823529413,
"grad_norm": 0.1367003470659256,
"learning_rate": 3.6337313415890315e-05,
"loss": 0.7005,
"step": 685
},
{
"epoch": 20.294117647058822,
"grad_norm": 0.13072577118873596,
"learning_rate": 3.6135643766969e-05,
"loss": 0.671,
"step": 690
},
{
"epoch": 20.441176470588236,
"grad_norm": 0.1326008439064026,
"learning_rate": 3.593315718588286e-05,
"loss": 0.6727,
"step": 695
},
{
"epoch": 20.58823529411765,
"grad_norm": 0.1257023960351944,
"learning_rate": 3.572987282786864e-05,
"loss": 0.7073,
"step": 700
},
{
"epoch": 20.735294117647058,
"grad_norm": 0.14335250854492188,
"learning_rate": 3.552580992363285e-05,
"loss": 0.6821,
"step": 705
},
{
"epoch": 20.88235294117647,
"grad_norm": 0.13271793723106384,
"learning_rate": 3.5320987777532465e-05,
"loss": 0.6959,
"step": 710
},
{
"epoch": 21.029411764705884,
"grad_norm": 0.12265238165855408,
"learning_rate": 3.5115425765748793e-05,
"loss": 0.6767,
"step": 715
},
{
"epoch": 21.176470588235293,
"grad_norm": 0.13558083772659302,
"learning_rate": 3.4909143334454454e-05,
"loss": 0.6859,
"step": 720
},
{
"epoch": 21.323529411764707,
"grad_norm": 0.1432723104953766,
"learning_rate": 3.4702159997973747e-05,
"loss": 0.6921,
"step": 725
},
{
"epoch": 21.470588235294116,
"grad_norm": 0.13662855327129364,
"learning_rate": 3.449449533693664e-05,
"loss": 0.7063,
"step": 730
},
{
"epoch": 21.61764705882353,
"grad_norm": 0.1422967165708542,
"learning_rate": 3.428616899642645e-05,
"loss": 0.6987,
"step": 735
},
{
"epoch": 21.764705882352942,
"grad_norm": 0.1233050599694252,
"learning_rate": 3.4077200684121345e-05,
"loss": 0.6831,
"step": 740
},
{
"epoch": 21.91176470588235,
"grad_norm": 0.13728494942188263,
"learning_rate": 3.3867610168430084e-05,
"loss": 0.6873,
"step": 745
},
{
"epoch": 22.058823529411764,
"grad_norm": 0.1322290301322937,
"learning_rate": 3.365741727662187e-05,
"loss": 0.651,
"step": 750
},
{
"epoch": 22.205882352941178,
"grad_norm": 0.13796144723892212,
"learning_rate": 3.3446641892950696e-05,
"loss": 0.671,
"step": 755
},
{
"epoch": 22.352941176470587,
"grad_norm": 0.13293515145778656,
"learning_rate": 3.3235303956774324e-05,
"loss": 0.7056,
"step": 760
},
{
"epoch": 22.5,
"grad_norm": 0.13630028069019318,
"learning_rate": 3.3023423460667985e-05,
"loss": 0.6866,
"step": 765
},
{
"epoch": 22.647058823529413,
"grad_norm": 0.1360238939523697,
"learning_rate": 3.281102044853309e-05,
"loss": 0.6991,
"step": 770
},
{
"epoch": 22.794117647058822,
"grad_norm": 0.13794392347335815,
"learning_rate": 3.2598115013701114e-05,
"loss": 0.6959,
"step": 775
},
{
"epoch": 22.941176470588236,
"grad_norm": 0.1369139850139618,
"learning_rate": 3.2384727297032705e-05,
"loss": 0.6657,
"step": 780
},
{
"epoch": 23.08823529411765,
"grad_norm": 0.12718403339385986,
"learning_rate": 3.217087748501237e-05,
"loss": 0.6733,
"step": 785
},
{
"epoch": 23.235294117647058,
"grad_norm": 0.13353672623634338,
"learning_rate": 3.1956585807838914e-05,
"loss": 0.6774,
"step": 790
},
{
"epoch": 23.38235294117647,
"grad_norm": 0.13364404439926147,
"learning_rate": 3.1741872537511535e-05,
"loss": 0.6752,
"step": 795
},
{
"epoch": 23.529411764705884,
"grad_norm": 0.14464542269706726,
"learning_rate": 3.152675798591219e-05,
"loss": 0.6667,
"step": 800
},
{
"epoch": 23.676470588235293,
"grad_norm": 0.13043712079524994,
"learning_rate": 3.131126250288405e-05,
"loss": 0.6924,
"step": 805
},
{
"epoch": 23.823529411764707,
"grad_norm": 0.12341820448637009,
"learning_rate": 3.109540647430641e-05,
"loss": 0.6969,
"step": 810
},
{
"epoch": 23.970588235294116,
"grad_norm": 0.14009004831314087,
"learning_rate": 3.087921032016619e-05,
"loss": 0.6947,
"step": 815
},
{
"epoch": 24.11764705882353,
"grad_norm": 0.13528534770011902,
"learning_rate": 3.066269449262618e-05,
"loss": 0.6833,
"step": 820
},
{
"epoch": 24.264705882352942,
"grad_norm": 0.1405653953552246,
"learning_rate": 3.04458794740903e-05,
"loss": 0.6919,
"step": 825
},
{
"epoch": 24.41176470588235,
"grad_norm": 0.14450417459011078,
"learning_rate": 3.0228785775265943e-05,
"loss": 0.7085,
"step": 830
},
{
"epoch": 24.558823529411764,
"grad_norm": 0.1257210224866867,
"learning_rate": 3.001143393322368e-05,
"loss": 0.7022,
"step": 835
},
{
"epoch": 24.705882352941178,
"grad_norm": 0.14650067687034607,
"learning_rate": 2.9793844509454417e-05,
"loss": 0.6559,
"step": 840
},
{
"epoch": 24.852941176470587,
"grad_norm": 0.1673348844051361,
"learning_rate": 2.9576038087924297e-05,
"loss": 0.6628,
"step": 845
},
{
"epoch": 25.0,
"grad_norm": 0.13868844509124756,
"learning_rate": 2.9358035273127483e-05,
"loss": 0.6761,
"step": 850
},
{
"epoch": 25.147058823529413,
"grad_norm": 0.12067105621099472,
"learning_rate": 2.9139856688136917e-05,
"loss": 0.6735,
"step": 855
},
{
"epoch": 25.294117647058822,
"grad_norm": 0.1306021362543106,
"learning_rate": 2.8921522972653437e-05,
"loss": 0.6711,
"step": 860
},
{
"epoch": 25.441176470588236,
"grad_norm": 0.13525615632534027,
"learning_rate": 2.8703054781053194e-05,
"loss": 0.6723,
"step": 865
},
{
"epoch": 25.58823529411765,
"grad_norm": 0.1306258738040924,
"learning_rate": 2.8484472780433828e-05,
"loss": 0.6922,
"step": 870
},
{
"epoch": 25.735294117647058,
"grad_norm": 0.14182746410369873,
"learning_rate": 2.8265797648659283e-05,
"loss": 0.6911,
"step": 875
},
{
"epoch": 25.88235294117647,
"grad_norm": 0.13599254190921783,
"learning_rate": 2.8047050072403713e-05,
"loss": 0.6891,
"step": 880
},
{
"epoch": 26.029411764705884,
"grad_norm": 0.1291087120771408,
"learning_rate": 2.7828250745194544e-05,
"loss": 0.6971,
"step": 885
},
{
"epoch": 26.176470588235293,
"grad_norm": 0.11979696899652481,
"learning_rate": 2.7609420365454823e-05,
"loss": 0.6921,
"step": 890
},
{
"epoch": 26.323529411764707,
"grad_norm": 0.1369645744562149,
"learning_rate": 2.7390579634545182e-05,
"loss": 0.667,
"step": 895
},
{
"epoch": 26.470588235294116,
"grad_norm": 0.1354684978723526,
"learning_rate": 2.7171749254805458e-05,
"loss": 0.6918,
"step": 900
},
{
"epoch": 26.61764705882353,
"grad_norm": 0.1434841752052307,
"learning_rate": 2.6952949927596295e-05,
"loss": 0.6961,
"step": 905
},
{
"epoch": 26.764705882352942,
"grad_norm": 0.13030685484409332,
"learning_rate": 2.6734202351340726e-05,
"loss": 0.6742,
"step": 910
},
{
"epoch": 26.91176470588235,
"grad_norm": 0.1375734657049179,
"learning_rate": 2.651552721956617e-05,
"loss": 0.66,
"step": 915
},
{
"epoch": 27.058823529411764,
"grad_norm": 0.1508912891149521,
"learning_rate": 2.6296945218946804e-05,
"loss": 0.6928,
"step": 920
},
{
"epoch": 27.205882352941178,
"grad_norm": 0.13976359367370605,
"learning_rate": 2.6078477027346572e-05,
"loss": 0.6916,
"step": 925
},
{
"epoch": 27.352941176470587,
"grad_norm": 0.13399522006511688,
"learning_rate": 2.586014331186309e-05,
"loss": 0.6617,
"step": 930
},
{
"epoch": 27.5,
"grad_norm": 0.1378486156463623,
"learning_rate": 2.5641964726872526e-05,
"loss": 0.6779,
"step": 935
},
{
"epoch": 27.647058823529413,
"grad_norm": 0.1410367488861084,
"learning_rate": 2.5423961912075712e-05,
"loss": 0.6951,
"step": 940
},
{
"epoch": 27.794117647058822,
"grad_norm": 0.1448415368795395,
"learning_rate": 2.5206155490545585e-05,
"loss": 0.6958,
"step": 945
},
{
"epoch": 27.941176470588236,
"grad_norm": 0.1381085067987442,
"learning_rate": 2.4988566066776327e-05,
"loss": 0.6629,
"step": 950
},
{
"epoch": 28.08823529411765,
"grad_norm": 0.14611601829528809,
"learning_rate": 2.4771214224734056e-05,
"loss": 0.6642,
"step": 955
},
{
"epoch": 28.235294117647058,
"grad_norm": 0.13046316802501678,
"learning_rate": 2.4554120525909703e-05,
"loss": 0.6554,
"step": 960
},
{
"epoch": 28.38235294117647,
"grad_norm": 0.1373993307352066,
"learning_rate": 2.4337305507373832e-05,
"loss": 0.6791,
"step": 965
},
{
"epoch": 28.529411764705884,
"grad_norm": 0.140591099858284,
"learning_rate": 2.4120789679833815e-05,
"loss": 0.6729,
"step": 970
},
{
"epoch": 28.676470588235293,
"grad_norm": 0.1307932734489441,
"learning_rate": 2.3904593525693593e-05,
"loss": 0.6887,
"step": 975
},
{
"epoch": 28.823529411764707,
"grad_norm": 0.13051795959472656,
"learning_rate": 2.3688737497115953e-05,
"loss": 0.6823,
"step": 980
},
{
"epoch": 28.970588235294116,
"grad_norm": 0.12720821797847748,
"learning_rate": 2.3473242014087814e-05,
"loss": 0.7063,
"step": 985
},
{
"epoch": 29.11764705882353,
"grad_norm": 0.137127086520195,
"learning_rate": 2.3258127462488467e-05,
"loss": 0.6744,
"step": 990
},
{
"epoch": 29.264705882352942,
"grad_norm": 0.13432453572750092,
"learning_rate": 2.30434141921611e-05,
"loss": 0.68,
"step": 995
},
{
"epoch": 29.41176470588235,
"grad_norm": 0.14380089938640594,
"learning_rate": 2.2829122514987634e-05,
"loss": 0.6808,
"step": 1000
},
{
"epoch": 29.558823529411764,
"grad_norm": 0.12999729812145233,
"learning_rate": 2.2615272702967304e-05,
"loss": 0.6963,
"step": 1005
},
{
"epoch": 29.705882352941178,
"grad_norm": 0.13407659530639648,
"learning_rate": 2.2401884986298892e-05,
"loss": 0.6729,
"step": 1010
},
{
"epoch": 29.852941176470587,
"grad_norm": 0.13908743858337402,
"learning_rate": 2.2188979551466916e-05,
"loss": 0.6766,
"step": 1015
},
{
"epoch": 30.0,
"grad_norm": 0.13133063912391663,
"learning_rate": 2.1976576539332024e-05,
"loss": 0.664,
"step": 1020
},
{
"epoch": 30.147058823529413,
"grad_norm": 0.15663307905197144,
"learning_rate": 2.1764696043225685e-05,
"loss": 0.7082,
"step": 1025
},
{
"epoch": 30.294117647058822,
"grad_norm": 0.13505025207996368,
"learning_rate": 2.155335810704931e-05,
"loss": 0.6463,
"step": 1030
},
{
"epoch": 30.441176470588236,
"grad_norm": 0.1344403475522995,
"learning_rate": 2.134258272337814e-05,
"loss": 0.6753,
"step": 1035
},
{
"epoch": 30.58823529411765,
"grad_norm": 0.14067409932613373,
"learning_rate": 2.1132389831569915e-05,
"loss": 0.6715,
"step": 1040
},
{
"epoch": 30.735294117647058,
"grad_norm": 0.13444367051124573,
"learning_rate": 2.092279931587866e-05,
"loss": 0.6838,
"step": 1045
},
{
"epoch": 30.88235294117647,
"grad_norm": 0.13275469839572906,
"learning_rate": 2.0713831003573564e-05,
"loss": 0.6842,
"step": 1050
},
{
"epoch": 31.029411764705884,
"grad_norm": 0.12724100053310394,
"learning_rate": 2.0505504663063364e-05,
"loss": 0.6745,
"step": 1055
},
{
"epoch": 31.176470588235293,
"grad_norm": 0.12783651053905487,
"learning_rate": 2.029784000202627e-05,
"loss": 0.6839,
"step": 1060
},
{
"epoch": 31.323529411764707,
"grad_norm": 0.13505741953849792,
"learning_rate": 2.0090856665545554e-05,
"loss": 0.6577,
"step": 1065
},
{
"epoch": 31.470588235294116,
"grad_norm": 0.14324721693992615,
"learning_rate": 1.98845742342512e-05,
"loss": 0.6786,
"step": 1070
},
{
"epoch": 31.61764705882353,
"grad_norm": 0.14350536465644836,
"learning_rate": 1.967901222246754e-05,
"loss": 0.6715,
"step": 1075
},
{
"epoch": 31.764705882352942,
"grad_norm": 0.12864898145198822,
"learning_rate": 1.947419007636716e-05,
"loss": 0.6901,
"step": 1080
},
{
"epoch": 31.91176470588235,
"grad_norm": 0.13163405656814575,
"learning_rate": 1.9270127172131363e-05,
"loss": 0.6767,
"step": 1085
},
{
"epoch": 32.05882352941177,
"grad_norm": 0.13823044300079346,
"learning_rate": 1.906684281411715e-05,
"loss": 0.6888,
"step": 1090
},
{
"epoch": 32.205882352941174,
"grad_norm": 0.13260214030742645,
"learning_rate": 1.8864356233031e-05,
"loss": 0.6899,
"step": 1095
},
{
"epoch": 32.35294117647059,
"grad_norm": 0.13542212545871735,
"learning_rate": 1.866268658410969e-05,
"loss": 0.6604,
"step": 1100
},
{
"epoch": 32.5,
"grad_norm": 0.14194779098033905,
"learning_rate": 1.8461852945308196e-05,
"loss": 0.6538,
"step": 1105
},
{
"epoch": 32.64705882352941,
"grad_norm": 0.13551092147827148,
"learning_rate": 1.8261874315494874e-05,
"loss": 0.6851,
"step": 1110
},
{
"epoch": 32.794117647058826,
"grad_norm": 0.13539521396160126,
"learning_rate": 1.806276961265425e-05,
"loss": 0.6731,
"step": 1115
},
{
"epoch": 32.94117647058823,
"grad_norm": 0.14235951006412506,
"learning_rate": 1.786455767209732e-05,
"loss": 0.6798,
"step": 1120
},
{
"epoch": 33.088235294117645,
"grad_norm": 0.12894190847873688,
"learning_rate": 1.7667257244679702e-05,
"loss": 0.6815,
"step": 1125
},
{
"epoch": 33.23529411764706,
"grad_norm": 0.13332705199718475,
"learning_rate": 1.747088699502789e-05,
"loss": 0.6709,
"step": 1130
},
{
"epoch": 33.38235294117647,
"grad_norm": 0.13527055084705353,
"learning_rate": 1.727546549977352e-05,
"loss": 0.689,
"step": 1135
},
{
"epoch": 33.529411764705884,
"grad_norm": 0.13612490892410278,
"learning_rate": 1.7081011245796013e-05,
"loss": 0.6744,
"step": 1140
},
{
"epoch": 33.6764705882353,
"grad_norm": 0.13099683821201324,
"learning_rate": 1.6887542628473763e-05,
"loss": 0.6871,
"step": 1145
},
{
"epoch": 33.8235294117647,
"grad_norm": 0.13698424398899078,
"learning_rate": 1.6695077949943892e-05,
"loss": 0.6852,
"step": 1150
},
{
"epoch": 33.970588235294116,
"grad_norm": 0.13121846318244934,
"learning_rate": 1.6503635417370882e-05,
"loss": 0.6529,
"step": 1155
},
{
"epoch": 34.11764705882353,
"grad_norm": 0.1369757056236267,
"learning_rate": 1.6313233141224165e-05,
"loss": 0.6855,
"step": 1160
},
{
"epoch": 34.26470588235294,
"grad_norm": 0.13654442131519318,
"learning_rate": 1.612388913356493e-05,
"loss": 0.6596,
"step": 1165
},
{
"epoch": 34.411764705882355,
"grad_norm": 0.136439248919487,
"learning_rate": 1.5935621306342057e-05,
"loss": 0.6843,
"step": 1170
},
{
"epoch": 34.55882352941177,
"grad_norm": 0.1410278081893921,
"learning_rate": 1.5748447469697803e-05,
"loss": 0.6786,
"step": 1175
},
{
"epoch": 34.705882352941174,
"grad_norm": 0.16095899045467377,
"learning_rate": 1.556238533028283e-05,
"loss": 0.6563,
"step": 1180
},
{
"epoch": 34.85294117647059,
"grad_norm": 0.13262508809566498,
"learning_rate": 1.5377452489581234e-05,
"loss": 0.6888,
"step": 1185
},
{
"epoch": 35.0,
"grad_norm": 0.13472063839435577,
"learning_rate": 1.5193666442245402e-05,
"loss": 0.681,
"step": 1190
},
{
"epoch": 35.14705882352941,
"grad_norm": 0.13549183309078217,
"learning_rate": 1.5011044574441036e-05,
"loss": 0.6755,
"step": 1195
},
{
"epoch": 35.294117647058826,
"grad_norm": 0.1411600261926651,
"learning_rate": 1.4829604162202442e-05,
"loss": 0.7007,
"step": 1200
},
{
"epoch": 35.44117647058823,
"grad_norm": 0.14127956330776215,
"learning_rate": 1.4649362369798152e-05,
"loss": 0.6551,
"step": 1205
},
{
"epoch": 35.588235294117645,
"grad_norm": 0.13209925591945648,
"learning_rate": 1.4470336248107266e-05,
"loss": 0.6762,
"step": 1210
},
{
"epoch": 35.73529411764706,
"grad_norm": 0.12888824939727783,
"learning_rate": 1.4292542733006372e-05,
"loss": 0.6775,
"step": 1215
},
{
"epoch": 35.88235294117647,
"grad_norm": 0.14431186020374298,
"learning_rate": 1.4115998643767447e-05,
"loss": 0.6654,
"step": 1220
},
{
"epoch": 36.029411764705884,
"grad_norm": 0.12955108284950256,
"learning_rate": 1.3940720681466734e-05,
"loss": 0.6807,
"step": 1225
},
{
"epoch": 36.1764705882353,
"grad_norm": 0.13727155327796936,
"learning_rate": 1.3766725427404843e-05,
"loss": 0.6925,
"step": 1230
},
{
"epoch": 36.3235294117647,
"grad_norm": 0.13375459611415863,
"learning_rate": 1.3594029341538128e-05,
"loss": 0.6884,
"step": 1235
},
{
"epoch": 36.470588235294116,
"grad_norm": 0.13129761815071106,
"learning_rate": 1.34226487609216e-05,
"loss": 0.6868,
"step": 1240
},
{
"epoch": 36.61764705882353,
"grad_norm": 0.1358431726694107,
"learning_rate": 1.3252599898163454e-05,
"loss": 0.6538,
"step": 1245
},
{
"epoch": 36.76470588235294,
"grad_norm": 0.14378570020198822,
"learning_rate": 1.3083898839891284e-05,
"loss": 0.6457,
"step": 1250
},
{
"epoch": 36.911764705882355,
"grad_norm": 0.14497900009155273,
"learning_rate": 1.29165615452304e-05,
"loss": 0.6746,
"step": 1255
},
{
"epoch": 37.05882352941177,
"grad_norm": 0.13402092456817627,
"learning_rate": 1.275060384429398e-05,
"loss": 0.6721,
"step": 1260
},
{
"epoch": 37.205882352941174,
"grad_norm": 0.13633766770362854,
"learning_rate": 1.258604143668563e-05,
"loss": 0.6724,
"step": 1265
},
{
"epoch": 37.35294117647059,
"grad_norm": 0.13490943610668182,
"learning_rate": 1.2422889890014143e-05,
"loss": 0.6578,
"step": 1270
},
{
"epoch": 37.5,
"grad_norm": 0.1326485574245453,
"learning_rate": 1.2261164638420832e-05,
"loss": 0.6664,
"step": 1275
},
{
"epoch": 37.64705882352941,
"grad_norm": 0.14504876732826233,
"learning_rate": 1.2100880981119447e-05,
"loss": 0.6856,
"step": 1280
},
{
"epoch": 37.794117647058826,
"grad_norm": 0.13638907670974731,
"learning_rate": 1.1942054080948878e-05,
"loss": 0.6842,
"step": 1285
},
{
"epoch": 37.94117647058823,
"grad_norm": 0.16528142988681793,
"learning_rate": 1.1784698962938763e-05,
"loss": 0.6759,
"step": 1290
},
{
"epoch": 38.088235294117645,
"grad_norm": 0.15061551332473755,
"learning_rate": 1.1628830512888057e-05,
"loss": 0.6899,
"step": 1295
},
{
"epoch": 38.23529411764706,
"grad_norm": 0.13696105778217316,
"learning_rate": 1.1474463475956926e-05,
"loss": 0.6624,
"step": 1300
},
{
"epoch": 38.38235294117647,
"grad_norm": 0.12491544336080551,
"learning_rate": 1.1321612455271793e-05,
"loss": 0.6725,
"step": 1305
},
{
"epoch": 38.529411764705884,
"grad_norm": 0.13985736668109894,
"learning_rate": 1.117029191054389e-05,
"loss": 0.6942,
"step": 1310
},
{
"epoch": 38.6764705882353,
"grad_norm": 0.14015409350395203,
"learning_rate": 1.1020516156701383e-05,
"loss": 0.6759,
"step": 1315
},
{
"epoch": 38.8235294117647,
"grad_norm": 0.14540641009807587,
"learning_rate": 1.0872299362535173e-05,
"loss": 0.6645,
"step": 1320
},
{
"epoch": 38.970588235294116,
"grad_norm": 0.1425599455833435,
"learning_rate": 1.0725655549358532e-05,
"loss": 0.6711,
"step": 1325
},
{
"epoch": 39.11764705882353,
"grad_norm": 0.13927870988845825,
"learning_rate": 1.0580598589680664e-05,
"loss": 0.6956,
"step": 1330
},
{
"epoch": 39.26470588235294,
"grad_norm": 0.13224616646766663,
"learning_rate": 1.0437142205894418e-05,
"loss": 0.6868,
"step": 1335
},
{
"epoch": 39.411764705882355,
"grad_norm": 0.13682135939598083,
"learning_rate": 1.029529996897808e-05,
"loss": 0.6735,
"step": 1340
},
{
"epoch": 39.55882352941177,
"grad_norm": 0.1319390833377838,
"learning_rate": 1.0155085297211618e-05,
"loss": 0.6513,
"step": 1345
},
{
"epoch": 39.705882352941174,
"grad_norm": 0.1452108919620514,
"learning_rate": 1.001651145490726e-05,
"loss": 0.6772,
"step": 1350
},
{
"epoch": 39.85294117647059,
"grad_norm": 0.14989398419857025,
"learning_rate": 9.87959155115474e-06,
"loss": 0.6633,
"step": 1355
},
{
"epoch": 40.0,
"grad_norm": 0.14220058917999268,
"learning_rate": 9.744338538581147e-06,
"loss": 0.6778,
"step": 1360
},
{
"epoch": 40.14705882352941,
"grad_norm": 0.15334346890449524,
"learning_rate": 9.610765212125607e-06,
"loss": 0.6775,
"step": 1365
},
{
"epoch": 40.294117647058826,
"grad_norm": 0.1356540322303772,
"learning_rate": 9.478884207828912e-06,
"loss": 0.6513,
"step": 1370
},
{
"epoch": 40.44117647058823,
"grad_norm": 0.14519663155078888,
"learning_rate": 9.34870800163808e-06,
"loss": 0.6847,
"step": 1375
},
{
"epoch": 40.588235294117645,
"grad_norm": 0.13579830527305603,
"learning_rate": 9.220248908226224e-06,
"loss": 0.6661,
"step": 1380
},
{
"epoch": 40.73529411764706,
"grad_norm": 0.13304731249809265,
"learning_rate": 9.09351907982751e-06,
"loss": 0.6569,
"step": 1385
},
{
"epoch": 40.88235294117647,
"grad_norm": 0.14108242094516754,
"learning_rate": 8.968530505087582e-06,
"loss": 0.6894,
"step": 1390
},
{
"epoch": 41.029411764705884,
"grad_norm": 0.13457804918289185,
"learning_rate": 8.845295007929446e-06,
"loss": 0.6814,
"step": 1395
},
{
"epoch": 41.1764705882353,
"grad_norm": 0.13951298594474792,
"learning_rate": 8.7238242464349e-06,
"loss": 0.6721,
"step": 1400
},
{
"epoch": 41.3235294117647,
"grad_norm": 0.14416338503360748,
"learning_rate": 8.604129711741706e-06,
"loss": 0.6881,
"step": 1405
},
{
"epoch": 41.470588235294116,
"grad_norm": 0.13295041024684906,
"learning_rate": 8.486222726956508e-06,
"loss": 0.6624,
"step": 1410
},
{
"epoch": 41.61764705882353,
"grad_norm": 0.1342659443616867,
"learning_rate": 8.370114446083686e-06,
"loss": 0.6956,
"step": 1415
},
{
"epoch": 41.76470588235294,
"grad_norm": 0.13162069022655487,
"learning_rate": 8.255815852970153e-06,
"loss": 0.6646,
"step": 1420
},
{
"epoch": 41.911764705882355,
"grad_norm": 0.12931868433952332,
"learning_rate": 8.143337760266331e-06,
"loss": 0.6618,
"step": 1425
},
{
"epoch": 42.05882352941177,
"grad_norm": 0.13857227563858032,
"learning_rate": 8.032690808403232e-06,
"loss": 0.6672,
"step": 1430
},
{
"epoch": 42.205882352941174,
"grad_norm": 0.13812746107578278,
"learning_rate": 7.923885464585884e-06,
"loss": 0.6866,
"step": 1435
},
{
"epoch": 42.35294117647059,
"grad_norm": 0.1503993570804596,
"learning_rate": 7.816932021803154e-06,
"loss": 0.6885,
"step": 1440
},
{
"epoch": 42.5,
"grad_norm": 0.13175919651985168,
"learning_rate": 7.711840597853998e-06,
"loss": 0.6686,
"step": 1445
},
{
"epoch": 42.64705882352941,
"grad_norm": 0.13319700956344604,
"learning_rate": 7.608621134390344e-06,
"loss": 0.6561,
"step": 1450
},
{
"epoch": 42.794117647058826,
"grad_norm": 0.1399184763431549,
"learning_rate": 7.507283395976592e-06,
"loss": 0.6537,
"step": 1455
},
{
"epoch": 42.94117647058823,
"grad_norm": 0.13498006761074066,
"learning_rate": 7.407836969165911e-06,
"loss": 0.6886,
"step": 1460
},
{
"epoch": 43.088235294117645,
"grad_norm": 0.1388946920633316,
"learning_rate": 7.310291261593308e-06,
"loss": 0.6797,
"step": 1465
},
{
"epoch": 43.23529411764706,
"grad_norm": 0.13527587056159973,
"learning_rate": 7.2146555010857155e-06,
"loss": 0.6813,
"step": 1470
},
{
"epoch": 43.38235294117647,
"grad_norm": 0.13819634914398193,
"learning_rate": 7.120938734789012e-06,
"loss": 0.6752,
"step": 1475
},
{
"epoch": 43.529411764705884,
"grad_norm": 0.12921284139156342,
"learning_rate": 7.029149828312145e-06,
"loss": 0.6761,
"step": 1480
},
{
"epoch": 43.6764705882353,
"grad_norm": 0.13316689431667328,
"learning_rate": 6.93929746488849e-06,
"loss": 0.6632,
"step": 1485
},
{
"epoch": 43.8235294117647,
"grad_norm": 0.13092860579490662,
"learning_rate": 6.851390144554372e-06,
"loss": 0.6705,
"step": 1490
},
{
"epoch": 43.970588235294116,
"grad_norm": 0.13236363232135773,
"learning_rate": 6.765436183344996e-06,
"loss": 0.6602,
"step": 1495
},
{
"epoch": 44.11764705882353,
"grad_norm": 0.14111852645874023,
"learning_rate": 6.6814437125077135e-06,
"loss": 0.6554,
"step": 1500
},
{
"epoch": 44.26470588235294,
"grad_norm": 0.13777071237564087,
"learning_rate": 6.599420677732848e-06,
"loss": 0.6783,
"step": 1505
},
{
"epoch": 44.411764705882355,
"grad_norm": 0.15148292481899261,
"learning_rate": 6.519374838401997e-06,
"loss": 0.6818,
"step": 1510
},
{
"epoch": 44.55882352941177,
"grad_norm": 0.14915376901626587,
"learning_rate": 6.44131376685401e-06,
"loss": 0.6758,
"step": 1515
},
{
"epoch": 44.705882352941174,
"grad_norm": 0.13784313201904297,
"learning_rate": 6.36524484766865e-06,
"loss": 0.6652,
"step": 1520
},
{
"epoch": 44.85294117647059,
"grad_norm": 0.13148203492164612,
"learning_rate": 6.291175276968002e-06,
"loss": 0.6758,
"step": 1525
},
{
"epoch": 45.0,
"grad_norm": 0.14285942912101746,
"learning_rate": 6.219112061735721e-06,
"loss": 0.6716,
"step": 1530
},
{
"epoch": 45.14705882352941,
"grad_norm": 0.14051543176174164,
"learning_rate": 6.149062019154174e-06,
"loss": 0.6833,
"step": 1535
},
{
"epoch": 45.294117647058826,
"grad_norm": 0.12927637994289398,
"learning_rate": 6.081031775959542e-06,
"loss": 0.648,
"step": 1540
},
{
"epoch": 45.44117647058823,
"grad_norm": 0.1433332860469818,
"learning_rate": 6.0150277678149055e-06,
"loss": 0.6377,
"step": 1545
},
{
"epoch": 45.588235294117645,
"grad_norm": 0.1298450231552124,
"learning_rate": 5.951056238701456e-06,
"loss": 0.6866,
"step": 1550
},
{
"epoch": 45.73529411764706,
"grad_norm": 0.13296933472156525,
"learning_rate": 5.889123240327819e-06,
"loss": 0.6747,
"step": 1555
},
{
"epoch": 45.88235294117647,
"grad_norm": 0.1386398822069168,
"learning_rate": 5.829234631557524e-06,
"loss": 0.6827,
"step": 1560
},
{
"epoch": 46.029411764705884,
"grad_norm": 0.12945467233657837,
"learning_rate": 5.771396077854802e-06,
"loss": 0.6823,
"step": 1565
},
{
"epoch": 46.1764705882353,
"grad_norm": 0.1447058618068695,
"learning_rate": 5.715613050748604e-06,
"loss": 0.6542,
"step": 1570
},
{
"epoch": 46.3235294117647,
"grad_norm": 0.13116587698459625,
"learning_rate": 5.661890827315004e-06,
"loss": 0.664,
"step": 1575
},
{
"epoch": 46.470588235294116,
"grad_norm": 0.13510237634181976,
"learning_rate": 5.61023448967798e-06,
"loss": 0.6698,
"step": 1580
},
{
"epoch": 46.61764705882353,
"grad_norm": 0.13816939294338226,
"learning_rate": 5.560648924528657e-06,
"loss": 0.7097,
"step": 1585
},
{
"epoch": 46.76470588235294,
"grad_norm": 0.1419830620288849,
"learning_rate": 5.513138822663016e-06,
"loss": 0.6905,
"step": 1590
},
{
"epoch": 46.911764705882355,
"grad_norm": 0.13188646733760834,
"learning_rate": 5.467708678538148e-06,
"loss": 0.6457,
"step": 1595
},
{
"epoch": 47.05882352941177,
"grad_norm": 0.1413869559764862,
"learning_rate": 5.424362789847082e-06,
"loss": 0.6766,
"step": 1600
},
{
"epoch": 47.205882352941174,
"grad_norm": 0.13414201140403748,
"learning_rate": 5.38310525711221e-06,
"loss": 0.6795,
"step": 1605
},
{
"epoch": 47.35294117647059,
"grad_norm": 0.1363234668970108,
"learning_rate": 5.343939983297398e-06,
"loss": 0.6713,
"step": 1610
},
{
"epoch": 47.5,
"grad_norm": 0.1344790905714035,
"learning_rate": 5.3068706734387484e-06,
"loss": 0.6584,
"step": 1615
},
{
"epoch": 47.64705882352941,
"grad_norm": 0.1452033668756485,
"learning_rate": 5.271900834294105e-06,
"loss": 0.667,
"step": 1620
},
{
"epoch": 47.794117647058826,
"grad_norm": 0.13405530154705048,
"learning_rate": 5.239033774011322e-06,
"loss": 0.669,
"step": 1625
},
{
"epoch": 47.94117647058823,
"grad_norm": 0.13740584254264832,
"learning_rate": 5.208272601815313e-06,
"loss": 0.6836,
"step": 1630
},
{
"epoch": 48.088235294117645,
"grad_norm": 0.12757079303264618,
"learning_rate": 5.1796202277139075e-06,
"loss": 0.6909,
"step": 1635
},
{
"epoch": 48.23529411764706,
"grad_norm": 0.12381689995527267,
"learning_rate": 5.1530793622225725e-06,
"loss": 0.6605,
"step": 1640
},
{
"epoch": 48.38235294117647,
"grad_norm": 0.13950037956237793,
"learning_rate": 5.128652516107996e-06,
"loss": 0.6814,
"step": 1645
},
{
"epoch": 48.529411764705884,
"grad_norm": 0.13073165714740753,
"learning_rate": 5.10634200015057e-06,
"loss": 0.6866,
"step": 1650
},
{
"epoch": 48.6764705882353,
"grad_norm": 0.1424126774072647,
"learning_rate": 5.086149924925788e-06,
"loss": 0.6697,
"step": 1655
},
{
"epoch": 48.8235294117647,
"grad_norm": 0.15078318119049072,
"learning_rate": 5.068078200604584e-06,
"loss": 0.6615,
"step": 1660
},
{
"epoch": 48.970588235294116,
"grad_norm": 0.1373935043811798,
"learning_rate": 5.052128536772629e-06,
"loss": 0.6665,
"step": 1665
},
{
"epoch": 49.11764705882353,
"grad_norm": 0.1401982456445694,
"learning_rate": 5.038302442268617e-06,
"loss": 0.6597,
"step": 1670
},
{
"epoch": 49.26470588235294,
"grad_norm": 0.1401190608739853,
"learning_rate": 5.026601225041503e-06,
"loss": 0.6929,
"step": 1675
},
{
"epoch": 49.411764705882355,
"grad_norm": 0.1368054449558258,
"learning_rate": 5.0170259920268025e-06,
"loss": 0.6923,
"step": 1680
},
{
"epoch": 49.55882352941177,
"grad_norm": 0.13621118664741516,
"learning_rate": 5.009577649041847e-06,
"loss": 0.6574,
"step": 1685
},
{
"epoch": 49.705882352941174,
"grad_norm": 0.13294735550880432,
"learning_rate": 5.004256900700115e-06,
"loss": 0.6646,
"step": 1690
},
{
"epoch": 49.85294117647059,
"grad_norm": 0.14144855737686157,
"learning_rate": 5.001064250344557e-06,
"loss": 0.666,
"step": 1695
},
{
"epoch": 50.0,
"grad_norm": 0.13993091881275177,
"learning_rate": 5e-06,
"loss": 0.6593,
"step": 1700
},
{
"epoch": 50.0,
"step": 1700,
"total_flos": 2.629944131882844e+18,
"train_loss": 0.6986723100437837,
"train_runtime": 41334.8641,
"train_samples_per_second": 0.481,
"train_steps_per_second": 0.041
}
],
"logging_steps": 5,
"max_steps": 1700,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.629944131882844e+18,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}