ideaassistant_qwen2_7b / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 200,
"global_step": 2094,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004775549188156638,
"grad_norm": 1.5947464157895972,
"learning_rate": 4.7619047619047623e-07,
"loss": 1.1507,
"step": 10
},
{
"epoch": 0.009551098376313277,
"grad_norm": 1.0799275908070096,
"learning_rate": 9.523809523809525e-07,
"loss": 1.1084,
"step": 20
},
{
"epoch": 0.014326647564469915,
"grad_norm": 0.8982107389870104,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.0874,
"step": 30
},
{
"epoch": 0.019102196752626553,
"grad_norm": 0.7755204372056955,
"learning_rate": 1.904761904761905e-06,
"loss": 1.1068,
"step": 40
},
{
"epoch": 0.02387774594078319,
"grad_norm": 0.7764375006527714,
"learning_rate": 2.380952380952381e-06,
"loss": 1.1152,
"step": 50
},
{
"epoch": 0.02865329512893983,
"grad_norm": 0.8635581450994269,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.0964,
"step": 60
},
{
"epoch": 0.033428844317096466,
"grad_norm": 0.7474505965610809,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.1005,
"step": 70
},
{
"epoch": 0.038204393505253106,
"grad_norm": 0.7625379001449059,
"learning_rate": 3.80952380952381e-06,
"loss": 1.0933,
"step": 80
},
{
"epoch": 0.04297994269340974,
"grad_norm": 0.7457628875786683,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.0397,
"step": 90
},
{
"epoch": 0.04775549188156638,
"grad_norm": 0.7668973425259701,
"learning_rate": 4.761904761904762e-06,
"loss": 1.0723,
"step": 100
},
{
"epoch": 0.05253104106972302,
"grad_norm": 0.7418999154350163,
"learning_rate": 5.2380952380952384e-06,
"loss": 1.0753,
"step": 110
},
{
"epoch": 0.05730659025787966,
"grad_norm": 0.7536046873082832,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.0604,
"step": 120
},
{
"epoch": 0.06208213944603629,
"grad_norm": 0.7126742899879043,
"learning_rate": 6.1904761904761914e-06,
"loss": 1.0844,
"step": 130
},
{
"epoch": 0.06685768863419293,
"grad_norm": 0.7388477971520834,
"learning_rate": 6.666666666666667e-06,
"loss": 1.0738,
"step": 140
},
{
"epoch": 0.07163323782234957,
"grad_norm": 0.7063263241327802,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.0638,
"step": 150
},
{
"epoch": 0.07640878701050621,
"grad_norm": 0.7435364320190926,
"learning_rate": 7.61904761904762e-06,
"loss": 1.089,
"step": 160
},
{
"epoch": 0.08118433619866285,
"grad_norm": 0.7196795649504337,
"learning_rate": 8.095238095238097e-06,
"loss": 1.0215,
"step": 170
},
{
"epoch": 0.08595988538681948,
"grad_norm": 0.792285178872838,
"learning_rate": 8.571428571428571e-06,
"loss": 1.0489,
"step": 180
},
{
"epoch": 0.09073543457497613,
"grad_norm": 0.8765129377706619,
"learning_rate": 9.047619047619049e-06,
"loss": 1.047,
"step": 190
},
{
"epoch": 0.09551098376313276,
"grad_norm": 0.6926043469847205,
"learning_rate": 9.523809523809525e-06,
"loss": 1.0484,
"step": 200
},
{
"epoch": 0.09551098376313276,
"eval_loss": 1.0037423372268677,
"eval_runtime": 534.236,
"eval_samples_per_second": 27.872,
"eval_steps_per_second": 3.485,
"step": 200
},
{
"epoch": 0.10028653295128939,
"grad_norm": 0.7189100769671237,
"learning_rate": 1e-05,
"loss": 1.0786,
"step": 210
},
{
"epoch": 0.10506208213944604,
"grad_norm": 0.6530269733207105,
"learning_rate": 9.99930486701988e-06,
"loss": 1.0568,
"step": 220
},
{
"epoch": 0.10983763132760267,
"grad_norm": 0.7951921984643434,
"learning_rate": 9.99721966136347e-06,
"loss": 1.0264,
"step": 230
},
{
"epoch": 0.11461318051575932,
"grad_norm": 0.7425556549789157,
"learning_rate": 9.99374496282885e-06,
"loss": 1.0093,
"step": 240
},
{
"epoch": 0.11938872970391595,
"grad_norm": 0.7965678052101759,
"learning_rate": 9.988881737567046e-06,
"loss": 1.0172,
"step": 250
},
{
"epoch": 0.12416427889207259,
"grad_norm": 0.6836841577113884,
"learning_rate": 9.982631337813363e-06,
"loss": 1.0369,
"step": 260
},
{
"epoch": 0.12893982808022922,
"grad_norm": 0.7403849280275662,
"learning_rate": 9.974995501511404e-06,
"loss": 0.998,
"step": 270
},
{
"epoch": 0.13371537726838587,
"grad_norm": 0.8090595134087124,
"learning_rate": 9.965976351829827e-06,
"loss": 1.0232,
"step": 280
},
{
"epoch": 0.1384909264565425,
"grad_norm": 0.7595821597209347,
"learning_rate": 9.95557639657199e-06,
"loss": 1.0093,
"step": 290
},
{
"epoch": 0.14326647564469913,
"grad_norm": 0.6967946094456987,
"learning_rate": 9.943798527478652e-06,
"loss": 1.0048,
"step": 300
},
{
"epoch": 0.14804202483285578,
"grad_norm": 0.8063573400514253,
"learning_rate": 9.930646019423909e-06,
"loss": 0.9838,
"step": 310
},
{
"epoch": 0.15281757402101243,
"grad_norm": 0.8442835891928957,
"learning_rate": 9.916122529504605e-06,
"loss": 0.9935,
"step": 320
},
{
"epoch": 0.15759312320916904,
"grad_norm": 0.781371439165008,
"learning_rate": 9.900232096023478e-06,
"loss": 0.9834,
"step": 330
},
{
"epoch": 0.1623686723973257,
"grad_norm": 0.8183448490469395,
"learning_rate": 9.882979137366275e-06,
"loss": 0.9638,
"step": 340
},
{
"epoch": 0.16714422158548234,
"grad_norm": 0.7604900553397272,
"learning_rate": 9.864368450773227e-06,
"loss": 0.983,
"step": 350
},
{
"epoch": 0.17191977077363896,
"grad_norm": 0.8396013068852565,
"learning_rate": 9.844405211005145e-06,
"loss": 0.9759,
"step": 360
},
{
"epoch": 0.1766953199617956,
"grad_norm": 0.8365439774930438,
"learning_rate": 9.823094968904572e-06,
"loss": 0.9927,
"step": 370
},
{
"epoch": 0.18147086914995225,
"grad_norm": 0.7930078729758488,
"learning_rate": 9.800443649852347e-06,
"loss": 0.985,
"step": 380
},
{
"epoch": 0.18624641833810887,
"grad_norm": 0.6951067649253269,
"learning_rate": 9.776457552120034e-06,
"loss": 0.9578,
"step": 390
},
{
"epoch": 0.19102196752626552,
"grad_norm": 0.7758966278638689,
"learning_rate": 9.751143345118675e-06,
"loss": 0.9828,
"step": 400
},
{
"epoch": 0.19102196752626552,
"eval_loss": 0.911033034324646,
"eval_runtime": 534.1402,
"eval_samples_per_second": 27.877,
"eval_steps_per_second": 3.486,
"step": 400
},
{
"epoch": 0.19579751671442217,
"grad_norm": 0.8178388178466306,
"learning_rate": 9.724508067544328e-06,
"loss": 0.9593,
"step": 410
},
{
"epoch": 0.20057306590257878,
"grad_norm": 0.7948626800066163,
"learning_rate": 9.696559125420949e-06,
"loss": 0.9342,
"step": 420
},
{
"epoch": 0.20534861509073543,
"grad_norm": 0.7690821022431013,
"learning_rate": 9.667304290041102e-06,
"loss": 0.9182,
"step": 430
},
{
"epoch": 0.21012416427889208,
"grad_norm": 0.8011672592566321,
"learning_rate": 9.636751695805154e-06,
"loss": 0.9399,
"step": 440
},
{
"epoch": 0.2148997134670487,
"grad_norm": 0.7896064611214817,
"learning_rate": 9.604909837959456e-06,
"loss": 0.9546,
"step": 450
},
{
"epoch": 0.21967526265520534,
"grad_norm": 0.8683971543565645,
"learning_rate": 9.57178757023422e-06,
"loss": 0.9493,
"step": 460
},
{
"epoch": 0.224450811843362,
"grad_norm": 0.7840423345877752,
"learning_rate": 9.537394102381719e-06,
"loss": 0.951,
"step": 470
},
{
"epoch": 0.22922636103151864,
"grad_norm": 0.8816713401350009,
"learning_rate": 9.501738997615471e-06,
"loss": 0.902,
"step": 480
},
{
"epoch": 0.23400191021967526,
"grad_norm": 0.7272663417426294,
"learning_rate": 9.464832169951171e-06,
"loss": 0.9121,
"step": 490
},
{
"epoch": 0.2387774594078319,
"grad_norm": 0.8166914666826738,
"learning_rate": 9.426683881450058e-06,
"loss": 0.9149,
"step": 500
},
{
"epoch": 0.24355300859598855,
"grad_norm": 0.806333386634663,
"learning_rate": 9.387304739365524e-06,
"loss": 0.9141,
"step": 510
},
{
"epoch": 0.24832855778414517,
"grad_norm": 0.8500670140915615,
"learning_rate": 9.346705693193722e-06,
"loss": 0.9046,
"step": 520
},
{
"epoch": 0.2531041069723018,
"grad_norm": 0.8536991575306397,
"learning_rate": 9.304898031629038e-06,
"loss": 0.907,
"step": 530
},
{
"epoch": 0.25787965616045844,
"grad_norm": 0.7805889795178423,
"learning_rate": 9.261893379425218e-06,
"loss": 0.9095,
"step": 540
},
{
"epoch": 0.2626552053486151,
"grad_norm": 0.983470886698574,
"learning_rate": 9.217703694163083e-06,
"loss": 0.8811,
"step": 550
},
{
"epoch": 0.26743075453677173,
"grad_norm": 0.8247344852185301,
"learning_rate": 9.172341262925675e-06,
"loss": 0.8743,
"step": 560
},
{
"epoch": 0.2722063037249284,
"grad_norm": 0.7947362543336306,
"learning_rate": 9.125818698881798e-06,
"loss": 0.8659,
"step": 570
},
{
"epoch": 0.276981852913085,
"grad_norm": 0.7251555329092241,
"learning_rate": 9.078148937778889e-06,
"loss": 0.906,
"step": 580
},
{
"epoch": 0.2817574021012416,
"grad_norm": 0.789421032185164,
"learning_rate": 9.029345234346183e-06,
"loss": 0.8859,
"step": 590
},
{
"epoch": 0.28653295128939826,
"grad_norm": 0.8701223156155673,
"learning_rate": 8.979421158609206e-06,
"loss": 0.8785,
"step": 600
},
{
"epoch": 0.28653295128939826,
"eval_loss": 0.8167237639427185,
"eval_runtime": 534.2082,
"eval_samples_per_second": 27.873,
"eval_steps_per_second": 3.486,
"step": 600
},
{
"epoch": 0.2913085004775549,
"grad_norm": 0.8629003851548017,
"learning_rate": 8.928390592116576e-06,
"loss": 0.8539,
"step": 610
},
{
"epoch": 0.29608404966571156,
"grad_norm": 0.8486389448069024,
"learning_rate": 8.876267724080197e-06,
"loss": 0.8527,
"step": 620
},
{
"epoch": 0.3008595988538682,
"grad_norm": 0.762838712746454,
"learning_rate": 8.823067047429908e-06,
"loss": 0.8683,
"step": 630
},
{
"epoch": 0.30563514804202485,
"grad_norm": 0.8625392396534273,
"learning_rate": 8.768803354783668e-06,
"loss": 0.8649,
"step": 640
},
{
"epoch": 0.3104106972301815,
"grad_norm": 0.8333540473972623,
"learning_rate": 8.71349173433443e-06,
"loss": 0.8622,
"step": 650
},
{
"epoch": 0.3151862464183381,
"grad_norm": 0.7664644528325345,
"learning_rate": 8.65714756565482e-06,
"loss": 0.8359,
"step": 660
},
{
"epoch": 0.31996179560649474,
"grad_norm": 0.8563484088758611,
"learning_rate": 8.599786515420789e-06,
"loss": 0.8569,
"step": 670
},
{
"epoch": 0.3247373447946514,
"grad_norm": 0.8053932610453219,
"learning_rate": 8.541424533055455e-06,
"loss": 0.8458,
"step": 680
},
{
"epoch": 0.32951289398280803,
"grad_norm": 0.8666024535787029,
"learning_rate": 8.48207784629431e-06,
"loss": 0.8447,
"step": 690
},
{
"epoch": 0.3342884431709647,
"grad_norm": 0.8868169973573014,
"learning_rate": 8.421762956673043e-06,
"loss": 0.8365,
"step": 700
},
{
"epoch": 0.3390639923591213,
"grad_norm": 0.8954608340327667,
"learning_rate": 8.360496634939243e-06,
"loss": 0.8335,
"step": 710
},
{
"epoch": 0.3438395415472779,
"grad_norm": 0.7784372366116787,
"learning_rate": 8.298295916389234e-06,
"loss": 0.8458,
"step": 720
},
{
"epoch": 0.34861509073543456,
"grad_norm": 0.8749905429657314,
"learning_rate": 8.235178096131355e-06,
"loss": 0.8185,
"step": 730
},
{
"epoch": 0.3533906399235912,
"grad_norm": 0.8021513738690056,
"learning_rate": 8.171160724277005e-06,
"loss": 0.8009,
"step": 740
},
{
"epoch": 0.35816618911174786,
"grad_norm": 0.8463663258199511,
"learning_rate": 8.106261601060773e-06,
"loss": 0.8277,
"step": 750
},
{
"epoch": 0.3629417382999045,
"grad_norm": 0.8443067155200391,
"learning_rate": 8.040498771891031e-06,
"loss": 0.8432,
"step": 760
},
{
"epoch": 0.36771728748806115,
"grad_norm": 0.8612360765810276,
"learning_rate": 7.973890522332348e-06,
"loss": 0.7933,
"step": 770
},
{
"epoch": 0.37249283667621774,
"grad_norm": 0.7566639910663036,
"learning_rate": 7.90645537302113e-06,
"loss": 0.8122,
"step": 780
},
{
"epoch": 0.3772683858643744,
"grad_norm": 0.8598522471784897,
"learning_rate": 7.838212074515899e-06,
"loss": 0.7713,
"step": 790
},
{
"epoch": 0.38204393505253104,
"grad_norm": 0.8264097384771215,
"learning_rate": 7.769179602083642e-06,
"loss": 0.7863,
"step": 800
},
{
"epoch": 0.38204393505253104,
"eval_loss": 0.7238086462020874,
"eval_runtime": 534.3413,
"eval_samples_per_second": 27.866,
"eval_steps_per_second": 3.485,
"step": 800
},
{
"epoch": 0.3868194842406877,
"grad_norm": 0.8302298544450823,
"learning_rate": 7.699377150423673e-06,
"loss": 0.7703,
"step": 810
},
{
"epoch": 0.39159503342884433,
"grad_norm": 0.9074490770748711,
"learning_rate": 7.628824128330485e-06,
"loss": 0.7651,
"step": 820
},
{
"epoch": 0.396370582617001,
"grad_norm": 0.9084786994105255,
"learning_rate": 7.557540153297086e-06,
"loss": 0.777,
"step": 830
},
{
"epoch": 0.40114613180515757,
"grad_norm": 0.7941270341446054,
"learning_rate": 7.485545046060272e-06,
"loss": 0.7659,
"step": 840
},
{
"epoch": 0.4059216809933142,
"grad_norm": 0.8418275807952966,
"learning_rate": 7.412858825089423e-06,
"loss": 0.7422,
"step": 850
},
{
"epoch": 0.41069723018147086,
"grad_norm": 0.8272295917894095,
"learning_rate": 7.3395017010202965e-06,
"loss": 0.7812,
"step": 860
},
{
"epoch": 0.4154727793696275,
"grad_norm": 0.8505418063106487,
"learning_rate": 7.265494071035401e-06,
"loss": 0.7461,
"step": 870
},
{
"epoch": 0.42024832855778416,
"grad_norm": 0.8254281894571944,
"learning_rate": 7.19085651319249e-06,
"loss": 0.7475,
"step": 880
},
{
"epoch": 0.4250238777459408,
"grad_norm": 0.7405023384966558,
"learning_rate": 7.115609780702767e-06,
"loss": 0.7485,
"step": 890
},
{
"epoch": 0.4297994269340974,
"grad_norm": 0.8069701422330396,
"learning_rate": 7.039774796160391e-06,
"loss": 0.7502,
"step": 900
},
{
"epoch": 0.43457497612225404,
"grad_norm": 0.7893053819873398,
"learning_rate": 6.9633726457248864e-06,
"loss": 0.7307,
"step": 910
},
{
"epoch": 0.4393505253104107,
"grad_norm": 0.8684426658728022,
"learning_rate": 6.886424573258057e-06,
"loss": 0.7407,
"step": 920
},
{
"epoch": 0.44412607449856734,
"grad_norm": 0.8806215484013901,
"learning_rate": 6.808951974417077e-06,
"loss": 0.7232,
"step": 930
},
{
"epoch": 0.448901623686724,
"grad_norm": 1.0078825768218607,
"learning_rate": 6.73097639070535e-06,
"loss": 0.7217,
"step": 940
},
{
"epoch": 0.45367717287488063,
"grad_norm": 0.820169166397048,
"learning_rate": 6.652519503482829e-06,
"loss": 0.7275,
"step": 950
},
{
"epoch": 0.4584527220630373,
"grad_norm": 0.9615211451349941,
"learning_rate": 6.573603127937443e-06,
"loss": 0.7244,
"step": 960
},
{
"epoch": 0.46322827125119387,
"grad_norm": 0.9105145919711279,
"learning_rate": 6.494249207019317e-06,
"loss": 0.7184,
"step": 970
},
{
"epoch": 0.4680038204393505,
"grad_norm": 0.8720579325571185,
"learning_rate": 6.414479805339465e-06,
"loss": 0.6887,
"step": 980
},
{
"epoch": 0.47277936962750716,
"grad_norm": 0.8869722170599964,
"learning_rate": 6.3343171030346525e-06,
"loss": 0.6858,
"step": 990
},
{
"epoch": 0.4775549188156638,
"grad_norm": 0.8820354385866871,
"learning_rate": 6.253783389600136e-06,
"loss": 0.7073,
"step": 1000
},
{
"epoch": 0.4775549188156638,
"eval_loss": 0.6407110095024109,
"eval_runtime": 534.1565,
"eval_samples_per_second": 27.876,
"eval_steps_per_second": 3.486,
"step": 1000
},
{
"epoch": 0.48233046800382046,
"grad_norm": 0.9876137273298787,
"learning_rate": 6.172901057692007e-06,
"loss": 0.7207,
"step": 1010
},
{
"epoch": 0.4871060171919771,
"grad_norm": 0.8646444637817178,
"learning_rate": 6.0916925969008275e-06,
"loss": 0.7363,
"step": 1020
},
{
"epoch": 0.4918815663801337,
"grad_norm": 0.8847812706441835,
"learning_rate": 6.010180587498347e-06,
"loss": 0.6729,
"step": 1030
},
{
"epoch": 0.49665711556829034,
"grad_norm": 0.7915082131571504,
"learning_rate": 5.928387694158968e-06,
"loss": 0.6956,
"step": 1040
},
{
"epoch": 0.501432664756447,
"grad_norm": 1.046517690383018,
"learning_rate": 5.8463366596577706e-06,
"loss": 0.6896,
"step": 1050
},
{
"epoch": 0.5062082139446036,
"grad_norm": 0.8847177405317377,
"learning_rate": 5.764050298546808e-06,
"loss": 0.6861,
"step": 1060
},
{
"epoch": 0.5109837631327603,
"grad_norm": 0.8672793826369924,
"learning_rate": 5.68155149081145e-06,
"loss": 0.6762,
"step": 1070
},
{
"epoch": 0.5157593123209169,
"grad_norm": 0.9340453072759527,
"learning_rate": 5.598863175508526e-06,
"loss": 0.6717,
"step": 1080
},
{
"epoch": 0.5205348615090736,
"grad_norm": 0.8259949274241196,
"learning_rate": 5.516008344388053e-06,
"loss": 0.6825,
"step": 1090
},
{
"epoch": 0.5253104106972302,
"grad_norm": 0.9029867272763249,
"learning_rate": 5.433010035500299e-06,
"loss": 0.6771,
"step": 1100
},
{
"epoch": 0.5300859598853869,
"grad_norm": 0.800953127657272,
"learning_rate": 5.3498913267899864e-06,
"loss": 0.674,
"step": 1110
},
{
"epoch": 0.5348615090735435,
"grad_norm": 0.9245770325043161,
"learning_rate": 5.2666753296793895e-06,
"loss": 0.6662,
"step": 1120
},
{
"epoch": 0.5396370582617,
"grad_norm": 0.8502150053204679,
"learning_rate": 5.183385182642136e-06,
"loss": 0.6765,
"step": 1130
},
{
"epoch": 0.5444126074498568,
"grad_norm": 1.5468980444160696,
"learning_rate": 5.100044044769472e-06,
"loss": 0.6682,
"step": 1140
},
{
"epoch": 0.5491881566380133,
"grad_norm": 1.0568604331225828,
"learning_rate": 5.016675089330817e-06,
"loss": 0.6583,
"step": 1150
},
{
"epoch": 0.55396370582617,
"grad_norm": 0.9277335589404493,
"learning_rate": 4.933301497330344e-06,
"loss": 0.6456,
"step": 1160
},
{
"epoch": 0.5587392550143266,
"grad_norm": 0.9015066761493195,
"learning_rate": 4.849946451061444e-06,
"loss": 0.673,
"step": 1170
},
{
"epoch": 0.5635148042024832,
"grad_norm": 0.916248819386327,
"learning_rate": 4.766633127660805e-06,
"loss": 0.6372,
"step": 1180
},
{
"epoch": 0.5682903533906399,
"grad_norm": 0.9944122764504796,
"learning_rate": 4.683384692663937e-06,
"loss": 0.6352,
"step": 1190
},
{
"epoch": 0.5730659025787965,
"grad_norm": 0.8995871939871956,
"learning_rate": 4.600224293563926e-06,
"loss": 0.6143,
"step": 1200
},
{
"epoch": 0.5730659025787965,
"eval_loss": 0.5672308206558228,
"eval_runtime": 534.5729,
"eval_samples_per_second": 27.854,
"eval_steps_per_second": 3.483,
"step": 1200
},
{
"epoch": 0.5778414517669532,
"grad_norm": 0.9029828803944917,
"learning_rate": 4.517175053375191e-06,
"loss": 0.6482,
"step": 1210
},
{
"epoch": 0.5826170009551098,
"grad_norm": 0.9997552015425194,
"learning_rate": 4.434260064204067e-06,
"loss": 0.6244,
"step": 1220
},
{
"epoch": 0.5873925501432665,
"grad_norm": 0.8505066146958208,
"learning_rate": 4.351502380827959e-06,
"loss": 0.6231,
"step": 1230
},
{
"epoch": 0.5921680993314231,
"grad_norm": 0.9601951382006098,
"learning_rate": 4.268925014284898e-06,
"loss": 0.6515,
"step": 1240
},
{
"epoch": 0.5969436485195797,
"grad_norm": 0.858109652878112,
"learning_rate": 4.18655092547524e-06,
"loss": 0.6027,
"step": 1250
},
{
"epoch": 0.6017191977077364,
"grad_norm": 1.0279381548988882,
"learning_rate": 4.104403018777323e-06,
"loss": 0.636,
"step": 1260
},
{
"epoch": 0.606494746895893,
"grad_norm": 0.8684044204496176,
"learning_rate": 4.022504135678822e-06,
"loss": 0.6356,
"step": 1270
},
{
"epoch": 0.6112702960840497,
"grad_norm": 1.2002839065266542,
"learning_rate": 3.94087704842561e-06,
"loss": 0.6303,
"step": 1280
},
{
"epoch": 0.6160458452722063,
"grad_norm": 1.0212819754078601,
"learning_rate": 3.859544453689853e-06,
"loss": 0.6181,
"step": 1290
},
{
"epoch": 0.620821394460363,
"grad_norm": 1.1643909557826269,
"learning_rate": 3.778528966259137e-06,
"loss": 0.6075,
"step": 1300
},
{
"epoch": 0.6255969436485196,
"grad_norm": 0.8318901215082086,
"learning_rate": 3.697853112748345e-06,
"loss": 0.6106,
"step": 1310
},
{
"epoch": 0.6303724928366762,
"grad_norm": 0.9063102495466279,
"learning_rate": 3.6175393253360704e-06,
"loss": 0.599,
"step": 1320
},
{
"epoch": 0.6351480420248329,
"grad_norm": 0.9567097209608001,
"learning_rate": 3.537609935527264e-06,
"loss": 0.5996,
"step": 1330
},
{
"epoch": 0.6399235912129895,
"grad_norm": 0.939453389364599,
"learning_rate": 3.458087167943905e-06,
"loss": 0.5867,
"step": 1340
},
{
"epoch": 0.6446991404011462,
"grad_norm": 0.9944415765925527,
"learning_rate": 3.3789931341453564e-06,
"loss": 0.614,
"step": 1350
},
{
"epoch": 0.6494746895893028,
"grad_norm": 0.8911567397377756,
"learning_rate": 3.3003498264801915e-06,
"loss": 0.5858,
"step": 1360
},
{
"epoch": 0.6542502387774594,
"grad_norm": 0.9190740572643366,
"learning_rate": 3.2221791119711372e-06,
"loss": 0.6073,
"step": 1370
},
{
"epoch": 0.6590257879656161,
"grad_norm": 0.8722067899669511,
"learning_rate": 3.144502726234889e-06,
"loss": 0.598,
"step": 1380
},
{
"epoch": 0.6638013371537727,
"grad_norm": 0.8704883954915125,
"learning_rate": 3.067342267438446e-06,
"loss": 0.5864,
"step": 1390
},
{
"epoch": 0.6685768863419294,
"grad_norm": 0.9586746506286237,
"learning_rate": 2.9907191902936773e-06,
"loss": 0.5726,
"step": 1400
},
{
"epoch": 0.6685768863419294,
"eval_loss": 0.5096372961997986,
"eval_runtime": 534.0593,
"eval_samples_per_second": 27.881,
"eval_steps_per_second": 3.487,
"step": 1400
},
{
"epoch": 0.673352435530086,
"grad_norm": 0.9771151805675299,
"learning_rate": 2.914654800091768e-06,
"loss": 0.5678,
"step": 1410
},
{
"epoch": 0.6781279847182426,
"grad_norm": 0.9844163415808749,
"learning_rate": 2.8391702467792137e-06,
"loss": 0.5875,
"step": 1420
},
{
"epoch": 0.6829035339063992,
"grad_norm": 0.936929121667794,
"learning_rate": 2.764286519077014e-06,
"loss": 0.5745,
"step": 1430
},
{
"epoch": 0.6876790830945558,
"grad_norm": 0.9581940513551886,
"learning_rate": 2.6900244386446903e-06,
"loss": 0.5748,
"step": 1440
},
{
"epoch": 0.6924546322827125,
"grad_norm": 0.9382097505155865,
"learning_rate": 2.616404654290752e-06,
"loss": 0.582,
"step": 1450
},
{
"epoch": 0.6972301814708691,
"grad_norm": 0.9458807920061071,
"learning_rate": 2.5434476362312375e-06,
"loss": 0.5859,
"step": 1460
},
{
"epoch": 0.7020057306590258,
"grad_norm": 0.8536247554325601,
"learning_rate": 2.4711736703979015e-06,
"loss": 0.5778,
"step": 1470
},
{
"epoch": 0.7067812798471824,
"grad_norm": 0.8896142850001317,
"learning_rate": 2.399602852797647e-06,
"loss": 0.5833,
"step": 1480
},
{
"epoch": 0.711556829035339,
"grad_norm": 0.9369088545555486,
"learning_rate": 2.3287550839247625e-06,
"loss": 0.5677,
"step": 1490
},
{
"epoch": 0.7163323782234957,
"grad_norm": 0.9352682466004876,
"learning_rate": 2.2586500632275333e-06,
"loss": 0.5501,
"step": 1500
},
{
"epoch": 0.7211079274116523,
"grad_norm": 0.9291292330708577,
"learning_rate": 2.1893072836307433e-06,
"loss": 0.5432,
"step": 1510
},
{
"epoch": 0.725883476599809,
"grad_norm": 1.1278542414631672,
"learning_rate": 2.1207460261156066e-06,
"loss": 0.6017,
"step": 1520
},
{
"epoch": 0.7306590257879656,
"grad_norm": 0.8496342199267922,
"learning_rate": 2.052985354358622e-06,
"loss": 0.5361,
"step": 1530
},
{
"epoch": 0.7354345749761223,
"grad_norm": 0.8448590719351696,
"learning_rate": 1.986044109430869e-06,
"loss": 0.544,
"step": 1540
},
{
"epoch": 0.7402101241642789,
"grad_norm": 1.0014560087074114,
"learning_rate": 1.91994090455918e-06,
"loss": 0.5544,
"step": 1550
},
{
"epoch": 0.7449856733524355,
"grad_norm": 0.9943362148840331,
"learning_rate": 1.8546941199506752e-06,
"loss": 0.5743,
"step": 1560
},
{
"epoch": 0.7497612225405922,
"grad_norm": 0.9488632116893986,
"learning_rate": 1.790321897682083e-06,
"loss": 0.5516,
"step": 1570
},
{
"epoch": 0.7545367717287488,
"grad_norm": 0.9282545781122443,
"learning_rate": 1.7268421366552851e-06,
"loss": 0.5598,
"step": 1580
},
{
"epoch": 0.7593123209169055,
"grad_norm": 0.893009147729329,
"learning_rate": 1.6642724876204658e-06,
"loss": 0.5457,
"step": 1590
},
{
"epoch": 0.7640878701050621,
"grad_norm": 0.8952401614954113,
"learning_rate": 1.602630348268267e-06,
"loss": 0.5623,
"step": 1600
},
{
"epoch": 0.7640878701050621,
"eval_loss": 0.46827441453933716,
"eval_runtime": 534.0896,
"eval_samples_per_second": 27.879,
"eval_steps_per_second": 3.486,
"step": 1600
},
{
"epoch": 0.7688634192932188,
"grad_norm": 0.9378147502660392,
"learning_rate": 1.541932858392296e-06,
"loss": 0.5522,
"step": 1610
},
{
"epoch": 0.7736389684813754,
"grad_norm": 0.8793552837932004,
"learning_rate": 1.482196895123364e-06,
"loss": 0.5321,
"step": 1620
},
{
"epoch": 0.778414517669532,
"grad_norm": 0.8650313644836122,
"learning_rate": 1.423439068236736e-06,
"loss": 0.5789,
"step": 1630
},
{
"epoch": 0.7831900668576887,
"grad_norm": 0.9837452596609937,
"learning_rate": 1.3656757155337413e-06,
"loss": 0.5628,
"step": 1640
},
{
"epoch": 0.7879656160458453,
"grad_norm": 1.0526446379669654,
"learning_rate": 1.3089228982989771e-06,
"loss": 0.5139,
"step": 1650
},
{
"epoch": 0.792741165234002,
"grad_norm": 0.9071660119485095,
"learning_rate": 1.2531963968344346e-06,
"loss": 0.5229,
"step": 1660
},
{
"epoch": 0.7975167144221585,
"grad_norm": 0.8509970739173898,
"learning_rate": 1.1985117060717278e-06,
"loss": 0.5184,
"step": 1670
},
{
"epoch": 0.8022922636103151,
"grad_norm": 0.8689645530199235,
"learning_rate": 1.1448840312636812e-06,
"loss": 0.5248,
"step": 1680
},
{
"epoch": 0.8070678127984718,
"grad_norm": 0.8888334350846111,
"learning_rate": 1.0923282837564537e-06,
"loss": 0.5451,
"step": 1690
},
{
"epoch": 0.8118433619866284,
"grad_norm": 1.0944895277253541,
"learning_rate": 1.0408590768434018e-06,
"loss": 0.522,
"step": 1700
},
{
"epoch": 0.8166189111747851,
"grad_norm": 1.0504131852344616,
"learning_rate": 9.904907217018e-07,
"loss": 0.5143,
"step": 1710
},
{
"epoch": 0.8213944603629417,
"grad_norm": 0.9813714332571194,
"learning_rate": 9.412372234135753e-07,
"loss": 0.5339,
"step": 1720
},
{
"epoch": 0.8261700095510984,
"grad_norm": 0.8392110216415885,
"learning_rate": 8.931122770711425e-07,
"loss": 0.5326,
"step": 1730
},
{
"epoch": 0.830945558739255,
"grad_norm": 0.883891691537776,
"learning_rate": 8.461292639694519e-07,
"loss": 0.5308,
"step": 1740
},
{
"epoch": 0.8357211079274116,
"grad_norm": 0.9330631339104432,
"learning_rate": 8.003012478852679e-07,
"loss": 0.4943,
"step": 1750
},
{
"epoch": 0.8404966571155683,
"grad_norm": 0.9077272187489582,
"learning_rate": 7.556409714447488e-07,
"loss": 0.5474,
"step": 1760
},
{
"epoch": 0.8452722063037249,
"grad_norm": 0.8412019707689536,
"learning_rate": 7.121608525803142e-07,
"loss": 0.5301,
"step": 1770
},
{
"epoch": 0.8500477554918816,
"grad_norm": 1.0343479774517594,
"learning_rate": 6.698729810778065e-07,
"loss": 0.5302,
"step": 1780
},
{
"epoch": 0.8548233046800382,
"grad_norm": 0.9838478459223126,
"learning_rate": 6.287891152148823e-07,
"loss": 0.5075,
"step": 1790
},
{
"epoch": 0.8595988538681948,
"grad_norm": 1.009194175526722,
"learning_rate": 5.889206784915863e-07,
"loss": 0.5206,
"step": 1800
},
{
"epoch": 0.8595988538681948,
"eval_loss": 0.44646286964416504,
"eval_runtime": 534.2652,
"eval_samples_per_second": 27.87,
"eval_steps_per_second": 3.485,
"step": 1800
},
{
"epoch": 0.8643744030563515,
"grad_norm": 0.8387985123574973,
"learning_rate": 5.502787564540102e-07,
"loss": 0.5305,
"step": 1810
},
{
"epoch": 0.8691499522445081,
"grad_norm": 0.9238833740564283,
"learning_rate": 5.128740936119242e-07,
"loss": 0.5115,
"step": 1820
},
{
"epoch": 0.8739255014326648,
"grad_norm": 0.8643626924619122,
"learning_rate": 4.7671709045122914e-07,
"loss": 0.501,
"step": 1830
},
{
"epoch": 0.8787010506208214,
"grad_norm": 0.8843512593283425,
"learning_rate": 4.4181780054206925e-07,
"loss": 0.5316,
"step": 1840
},
{
"epoch": 0.8834765998089781,
"grad_norm": 1.0280116743208123,
"learning_rate": 4.081859277434025e-07,
"loss": 0.5084,
"step": 1850
},
{
"epoch": 0.8882521489971347,
"grad_norm": 0.9217334180362886,
"learning_rate": 3.758308235048158e-07,
"loss": 0.4988,
"step": 1860
},
{
"epoch": 0.8930276981852913,
"grad_norm": 0.9278902432782374,
"learning_rate": 3.4476148426632215e-07,
"loss": 0.5248,
"step": 1870
},
{
"epoch": 0.897803247373448,
"grad_norm": 0.8498974666627348,
"learning_rate": 3.1498654895687095e-07,
"loss": 0.5263,
"step": 1880
},
{
"epoch": 0.9025787965616046,
"grad_norm": 0.914856710615246,
"learning_rate": 2.8651429659226906e-07,
"loss": 0.5129,
"step": 1890
},
{
"epoch": 0.9073543457497613,
"grad_norm": 0.9485264410476115,
"learning_rate": 2.593526439731697e-07,
"loss": 0.5033,
"step": 1900
},
{
"epoch": 0.9121298949379179,
"grad_norm": 0.922810539225268,
"learning_rate": 2.3350914348378606e-07,
"loss": 0.5157,
"step": 1910
},
{
"epoch": 0.9169054441260746,
"grad_norm": 0.956199326320254,
"learning_rate": 2.0899098099192273e-07,
"loss": 0.5158,
"step": 1920
},
{
"epoch": 0.9216809933142311,
"grad_norm": 0.8741702574957524,
"learning_rate": 1.8580497385092376e-07,
"loss": 0.5145,
"step": 1930
},
{
"epoch": 0.9264565425023877,
"grad_norm": 0.9941012086649309,
"learning_rate": 1.6395756900408454e-07,
"loss": 0.5321,
"step": 1940
},
{
"epoch": 0.9312320916905444,
"grad_norm": 0.8582163500365767,
"learning_rate": 1.4345484119206222e-07,
"loss": 0.5065,
"step": 1950
},
{
"epoch": 0.936007640878701,
"grad_norm": 0.9879634600102223,
"learning_rate": 1.2430249126376913e-07,
"loss": 0.54,
"step": 1960
},
{
"epoch": 0.9407831900668577,
"grad_norm": 0.9616760638465843,
"learning_rate": 1.065058445912398e-07,
"loss": 0.5084,
"step": 1970
},
{
"epoch": 0.9455587392550143,
"grad_norm": 1.0543325130204897,
"learning_rate": 9.006984958888742e-08,
"loss": 0.527,
"step": 1980
},
{
"epoch": 0.9503342884431709,
"grad_norm": 0.8916780720938148,
"learning_rate": 7.499907633758797e-08,
"loss": 0.4929,
"step": 1990
},
{
"epoch": 0.9551098376313276,
"grad_norm": 0.8743144782771384,
"learning_rate": 6.129771531395045e-08,
"loss": 0.5054,
"step": 2000
},
{
"epoch": 0.9551098376313276,
"eval_loss": 0.43961772322654724,
"eval_runtime": 534.3343,
"eval_samples_per_second": 27.866,
"eval_steps_per_second": 3.485,
"step": 2000
},
{
"epoch": 0.9598853868194842,
"grad_norm": 0.9030091469256598,
"learning_rate": 4.896957622514298e-08,
"loss": 0.4983,
"step": 2010
},
{
"epoch": 0.9646609360076409,
"grad_norm": 0.9300966338414236,
"learning_rate": 3.801808694959053e-08,
"loss": 0.5219,
"step": 2020
},
{
"epoch": 0.9694364851957975,
"grad_norm": 0.9956560344691349,
"learning_rate": 2.8446292583844126e-08,
"loss": 0.5397,
"step": 2030
},
{
"epoch": 0.9742120343839542,
"grad_norm": 0.944281751372382,
"learning_rate": 2.025685459588145e-08,
"loss": 0.525,
"step": 2040
},
{
"epoch": 0.9789875835721108,
"grad_norm": 0.8939170547532904,
"learning_rate": 1.3452050085075441e-08,
"loss": 0.5086,
"step": 2050
},
{
"epoch": 0.9837631327602674,
"grad_norm": 0.8895914526936296,
"learning_rate": 8.033771149041913e-09,
"loss": 0.5122,
"step": 2060
},
{
"epoch": 0.9885386819484241,
"grad_norm": 0.8484510879585117,
"learning_rate": 4.003524357534261e-09,
"loss": 0.5168,
"step": 2070
},
{
"epoch": 0.9933142311365807,
"grad_norm": 0.9676882963530836,
"learning_rate": 1.3624303335380006e-09,
"loss": 0.5155,
"step": 2080
},
{
"epoch": 0.9980897803247374,
"grad_norm": 0.8272201128188289,
"learning_rate": 1.1122344167613374e-10,
"loss": 0.5051,
"step": 2090
},
{
"epoch": 1.0,
"step": 2094,
"total_flos": 985274787299328.0,
"train_loss": 0.7358792713970487,
"train_runtime": 28849.9475,
"train_samples_per_second": 4.645,
"train_steps_per_second": 0.073
}
],
"logging_steps": 10,
"max_steps": 2094,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 985274787299328.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}