{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 2220,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04504504504504504,
"grad_norm": 3.175370931625366,
"learning_rate": 8.108108108108109e-06,
"loss": 0.6725,
"step": 10
},
{
"epoch": 0.09009009009009009,
"grad_norm": 1.692520260810852,
"learning_rate": 1.7117117117117117e-05,
"loss": 0.314,
"step": 20
},
{
"epoch": 0.13513513513513514,
"grad_norm": 1.95720374584198,
"learning_rate": 2.6126126126126128e-05,
"loss": 0.2577,
"step": 30
},
{
"epoch": 0.18018018018018017,
"grad_norm": 1.2667555809020996,
"learning_rate": 3.513513513513514e-05,
"loss": 0.2377,
"step": 40
},
{
"epoch": 0.22522522522522523,
"grad_norm": 1.4453164339065552,
"learning_rate": 4.414414414414415e-05,
"loss": 0.2102,
"step": 50
},
{
"epoch": 0.2702702702702703,
"grad_norm": 1.5506036281585693,
"learning_rate": 5.3153153153153155e-05,
"loss": 0.1921,
"step": 60
},
{
"epoch": 0.3153153153153153,
"grad_norm": 0.9104294776916504,
"learning_rate": 6.216216216216216e-05,
"loss": 0.1875,
"step": 70
},
{
"epoch": 0.36036036036036034,
"grad_norm": 1.6911464929580688,
"learning_rate": 7.117117117117116e-05,
"loss": 0.1705,
"step": 80
},
{
"epoch": 0.40540540540540543,
"grad_norm": 1.2363160848617554,
"learning_rate": 8.018018018018019e-05,
"loss": 0.188,
"step": 90
},
{
"epoch": 0.45045045045045046,
"grad_norm": 1.005192518234253,
"learning_rate": 8.918918918918919e-05,
"loss": 0.1554,
"step": 100
},
{
"epoch": 0.4954954954954955,
"grad_norm": 1.2431941032409668,
"learning_rate": 9.81981981981982e-05,
"loss": 0.1369,
"step": 110
},
{
"epoch": 0.5405405405405406,
"grad_norm": 1.1538969278335571,
"learning_rate": 9.999644972962145e-05,
"loss": 0.1376,
"step": 120
},
{
"epoch": 0.5855855855855856,
"grad_norm": 1.21573805809021,
"learning_rate": 9.998202762029625e-05,
"loss": 0.135,
"step": 130
},
{
"epoch": 0.6306306306306306,
"grad_norm": 1.4540833234786987,
"learning_rate": 9.995651497779182e-05,
"loss": 0.1342,
"step": 140
},
{
"epoch": 0.6756756756756757,
"grad_norm": 1.2237752676010132,
"learning_rate": 9.991991746311917e-05,
"loss": 0.1136,
"step": 150
},
{
"epoch": 0.7207207207207207,
"grad_norm": 1.0290318727493286,
"learning_rate": 9.987224319691624e-05,
"loss": 0.1114,
"step": 160
},
{
"epoch": 0.7657657657657657,
"grad_norm": 0.8585171103477478,
"learning_rate": 9.981350275764608e-05,
"loss": 0.1005,
"step": 170
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.7964202761650085,
"learning_rate": 9.97437091792495e-05,
"loss": 0.1074,
"step": 180
},
{
"epoch": 0.8558558558558559,
"grad_norm": 0.6369871497154236,
"learning_rate": 9.966287794825305e-05,
"loss": 0.103,
"step": 190
},
{
"epoch": 0.9009009009009009,
"grad_norm": 0.5182180404663086,
"learning_rate": 9.957102700033265e-05,
"loss": 0.095,
"step": 200
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.5850012898445129,
"learning_rate": 9.946817671633384e-05,
"loss": 0.0854,
"step": 210
},
{
"epoch": 0.990990990990991,
"grad_norm": 0.9804884791374207,
"learning_rate": 9.935434991774952e-05,
"loss": 0.0898,
"step": 220
},
{
"epoch": 1.0360360360360361,
"grad_norm": 0.7499725222587585,
"learning_rate": 9.922957186165598e-05,
"loss": 0.0882,
"step": 230
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.9342101216316223,
"learning_rate": 9.909387023510871e-05,
"loss": 0.0856,
"step": 240
},
{
"epoch": 1.1261261261261262,
"grad_norm": 0.752739667892456,
"learning_rate": 9.894727514899881e-05,
"loss": 0.0863,
"step": 250
},
{
"epoch": 1.1711711711711712,
"grad_norm": 0.5502750873565674,
"learning_rate": 9.878981913137179e-05,
"loss": 0.0805,
"step": 260
},
{
"epoch": 1.2162162162162162,
"grad_norm": 1.2474815845489502,
"learning_rate": 9.862153712020972e-05,
"loss": 0.0874,
"step": 270
},
{
"epoch": 1.2612612612612613,
"grad_norm": 0.8233453631401062,
"learning_rate": 9.844246645567902e-05,
"loss": 0.077,
"step": 280
},
{
"epoch": 1.3063063063063063,
"grad_norm": 0.5110873579978943,
"learning_rate": 9.825264687184493e-05,
"loss": 0.0765,
"step": 290
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.5558863878250122,
"learning_rate": 9.805212048785493e-05,
"loss": 0.08,
"step": 300
},
{
"epoch": 1.3963963963963963,
"grad_norm": 0.8767929673194885,
"learning_rate": 9.78409317985929e-05,
"loss": 0.0781,
"step": 310
},
{
"epoch": 1.4414414414414414,
"grad_norm": 0.6763063073158264,
"learning_rate": 9.761912766480614e-05,
"loss": 0.0749,
"step": 320
},
{
"epoch": 1.4864864864864864,
"grad_norm": 0.3673441410064697,
"learning_rate": 9.738675730270737e-05,
"loss": 0.0772,
"step": 330
},
{
"epoch": 1.5315315315315314,
"grad_norm": 0.3171934187412262,
"learning_rate": 9.714387227305422e-05,
"loss": 0.0725,
"step": 340
},
{
"epoch": 1.5765765765765765,
"grad_norm": 0.6207244992256165,
"learning_rate": 9.689052646970829e-05,
"loss": 0.0723,
"step": 350
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.7387151122093201,
"learning_rate": 9.662677610767672e-05,
"loss": 0.0705,
"step": 360
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.5089544057846069,
"learning_rate": 9.635267971063848e-05,
"loss": 0.0667,
"step": 370
},
{
"epoch": 1.7117117117117115,
"grad_norm": 0.8173167109489441,
"learning_rate": 9.606829809795871e-05,
"loss": 0.0672,
"step": 380
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.5291303992271423,
"learning_rate": 9.57736943711933e-05,
"loss": 0.0731,
"step": 390
},
{
"epoch": 1.8018018018018018,
"grad_norm": 0.3580414950847626,
"learning_rate": 9.546893390008738e-05,
"loss": 0.0709,
"step": 400
},
{
"epoch": 1.8468468468468469,
"grad_norm": 0.643068253993988,
"learning_rate": 9.515408430807036e-05,
"loss": 0.0641,
"step": 410
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.6227248311042786,
"learning_rate": 9.482921545725097e-05,
"loss": 0.065,
"step": 420
},
{
"epoch": 1.936936936936937,
"grad_norm": 0.7317764163017273,
"learning_rate": 9.449439943291541e-05,
"loss": 0.0694,
"step": 430
},
{
"epoch": 1.981981981981982,
"grad_norm": 0.6944959163665771,
"learning_rate": 9.414971052753252e-05,
"loss": 0.0739,
"step": 440
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.5378988981246948,
"learning_rate": 9.379522522426879e-05,
"loss": 0.066,
"step": 450
},
{
"epoch": 2.0720720720720722,
"grad_norm": 0.6093091368675232,
"learning_rate": 9.343102218001762e-05,
"loss": 0.0725,
"step": 460
},
{
"epoch": 2.1171171171171173,
"grad_norm": 1.0975353717803955,
"learning_rate": 9.305718220794604e-05,
"loss": 0.0748,
"step": 470
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.7575361132621765,
"learning_rate": 9.267378825956301e-05,
"loss": 0.0634,
"step": 480
},
{
"epoch": 2.2072072072072073,
"grad_norm": 0.5477670431137085,
"learning_rate": 9.228092540631342e-05,
"loss": 0.0661,
"step": 490
},
{
"epoch": 2.2522522522522523,
"grad_norm": 0.5405918955802917,
"learning_rate": 9.187868082070132e-05,
"loss": 0.0607,
"step": 500
},
{
"epoch": 2.2972972972972974,
"grad_norm": 0.7018790245056152,
"learning_rate": 9.146714375694745e-05,
"loss": 0.0567,
"step": 510
},
{
"epoch": 2.3423423423423424,
"grad_norm": 0.630402684211731,
"learning_rate": 9.104640553118435e-05,
"loss": 0.0618,
"step": 520
},
{
"epoch": 2.3873873873873874,
"grad_norm": 0.4060376286506653,
"learning_rate": 9.06165595011943e-05,
"loss": 0.0651,
"step": 530
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.5323811173439026,
"learning_rate": 9.01777010456941e-05,
"loss": 0.0569,
"step": 540
},
{
"epoch": 2.4774774774774775,
"grad_norm": 0.4360101521015167,
"learning_rate": 8.972992754317144e-05,
"loss": 0.0542,
"step": 550
},
{
"epoch": 2.5225225225225225,
"grad_norm": 0.6220657825469971,
"learning_rate": 8.927333835027759e-05,
"loss": 0.0615,
"step": 560
},
{
"epoch": 2.5675675675675675,
"grad_norm": 0.5657696723937988,
"learning_rate": 8.880803477978102e-05,
"loss": 0.058,
"step": 570
},
{
"epoch": 2.6126126126126126,
"grad_norm": 0.5093798637390137,
"learning_rate": 8.833412007808713e-05,
"loss": 0.0594,
"step": 580
},
{
"epoch": 2.6576576576576576,
"grad_norm": 0.452443927526474,
"learning_rate": 8.78516994023289e-05,
"loss": 0.0564,
"step": 590
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.4802651107311249,
"learning_rate": 8.73608797970334e-05,
"loss": 0.0496,
"step": 600
},
{
"epoch": 2.7477477477477477,
"grad_norm": 0.33183491230010986,
"learning_rate": 8.686177017036979e-05,
"loss": 0.053,
"step": 610
},
{
"epoch": 2.7927927927927927,
"grad_norm": 0.40296322107315063,
"learning_rate": 8.635448126998352e-05,
"loss": 0.0568,
"step": 620
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.30987316370010376,
"learning_rate": 8.583912565842257e-05,
"loss": 0.0515,
"step": 630
},
{
"epoch": 2.8828828828828827,
"grad_norm": 0.6701423525810242,
"learning_rate": 8.531581768816084e-05,
"loss": 0.0526,
"step": 640
},
{
"epoch": 2.9279279279279278,
"grad_norm": 0.439345121383667,
"learning_rate": 8.478467347622443e-05,
"loss": 0.0577,
"step": 650
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.42080074548721313,
"learning_rate": 8.424581087842647e-05,
"loss": 0.048,
"step": 660
},
{
"epoch": 3.018018018018018,
"grad_norm": 0.43773335218429565,
"learning_rate": 8.369934946321595e-05,
"loss": 0.0523,
"step": 670
},
{
"epoch": 3.063063063063063,
"grad_norm": 0.45040780305862427,
"learning_rate": 8.314541048514664e-05,
"loss": 0.0509,
"step": 680
},
{
"epoch": 3.108108108108108,
"grad_norm": 0.36781787872314453,
"learning_rate": 8.25841168579719e-05,
"loss": 0.0495,
"step": 690
},
{
"epoch": 3.153153153153153,
"grad_norm": 0.477033406496048,
"learning_rate": 8.201559312737132e-05,
"loss": 0.0514,
"step": 700
},
{
"epoch": 3.1981981981981984,
"grad_norm": 0.4004456698894501,
"learning_rate": 8.143996544331511e-05,
"loss": 0.0479,
"step": 710
},
{
"epoch": 3.2432432432432434,
"grad_norm": 0.5489519834518433,
"learning_rate": 8.085736153207277e-05,
"loss": 0.0475,
"step": 720
},
{
"epoch": 3.2882882882882885,
"grad_norm": 0.5412061810493469,
"learning_rate": 8.026791066787176e-05,
"loss": 0.0548,
"step": 730
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.23101916909217834,
"learning_rate": 7.967174364421284e-05,
"loss": 0.0503,
"step": 740
},
{
"epoch": 3.3783783783783785,
"grad_norm": 0.27605608105659485,
"learning_rate": 7.90689927448482e-05,
"loss": 0.0514,
"step": 750
},
{
"epoch": 3.4234234234234235,
"grad_norm": 0.29271090030670166,
"learning_rate": 7.8459791714429e-05,
"loss": 0.0498,
"step": 760
},
{
"epoch": 3.4684684684684686,
"grad_norm": 0.4988279938697815,
"learning_rate": 7.784427572882871e-05,
"loss": 0.0488,
"step": 770
},
{
"epoch": 3.5135135135135136,
"grad_norm": 0.4085995554924011,
"learning_rate": 7.722258136514884e-05,
"loss": 0.0484,
"step": 780
},
{
"epoch": 3.5585585585585586,
"grad_norm": 0.5015743374824524,
"learning_rate": 7.659484657141382e-05,
"loss": 0.0547,
"step": 790
},
{
"epoch": 3.6036036036036037,
"grad_norm": 0.41595619916915894,
"learning_rate": 7.596121063596168e-05,
"loss": 0.0493,
"step": 800
},
{
"epoch": 3.6486486486486487,
"grad_norm": 0.5095012187957764,
"learning_rate": 7.532181415653725e-05,
"loss": 0.0503,
"step": 810
},
{
"epoch": 3.6936936936936937,
"grad_norm": 0.4954228103160858,
"learning_rate": 7.467679900909489e-05,
"loss": 0.0528,
"step": 820
},
{
"epoch": 3.7387387387387387,
"grad_norm": 0.5485273003578186,
"learning_rate": 7.40263083163176e-05,
"loss": 0.0516,
"step": 830
},
{
"epoch": 3.7837837837837838,
"grad_norm": 0.35983964800834656,
"learning_rate": 7.337048641585937e-05,
"loss": 0.05,
"step": 840
},
{
"epoch": 3.828828828828829,
"grad_norm": 0.40537890791893005,
"learning_rate": 7.270947882831822e-05,
"loss": 0.044,
"step": 850
},
{
"epoch": 3.873873873873874,
"grad_norm": 0.3156987428665161,
"learning_rate": 7.20434322249464e-05,
"loss": 0.045,
"step": 860
},
{
"epoch": 3.918918918918919,
"grad_norm": 0.3144851326942444,
"learning_rate": 7.137249439510548e-05,
"loss": 0.0458,
"step": 870
},
{
"epoch": 3.963963963963964,
"grad_norm": 0.2909262478351593,
"learning_rate": 7.069681421347339e-05,
"loss": 0.0428,
"step": 880
},
{
"epoch": 4.009009009009009,
"grad_norm": 0.34615159034729004,
"learning_rate": 7.001654160701046e-05,
"loss": 0.0468,
"step": 890
},
{
"epoch": 4.054054054054054,
"grad_norm": 0.4577626883983612,
"learning_rate": 6.933182752169213e-05,
"loss": 0.0399,
"step": 900
},
{
"epoch": 4.099099099099099,
"grad_norm": 0.3926784098148346,
"learning_rate": 6.864282388901544e-05,
"loss": 0.0394,
"step": 910
},
{
"epoch": 4.1441441441441444,
"grad_norm": 0.3566577732563019,
"learning_rate": 6.794968359228688e-05,
"loss": 0.0452,
"step": 920
},
{
"epoch": 4.1891891891891895,
"grad_norm": 0.3146449029445648,
"learning_rate": 6.725256043269912e-05,
"loss": 0.0474,
"step": 930
},
{
"epoch": 4.2342342342342345,
"grad_norm": 0.2554456293582916,
"learning_rate": 6.65516090952039e-05,
"loss": 0.0458,
"step": 940
},
{
"epoch": 4.2792792792792795,
"grad_norm": 0.3618452250957489,
"learning_rate": 6.584698511418901e-05,
"loss": 0.0466,
"step": 950
},
{
"epoch": 4.324324324324325,
"grad_norm": 0.38914114236831665,
"learning_rate": 6.513884483896666e-05,
"loss": 0.0424,
"step": 960
},
{
"epoch": 4.36936936936937,
"grad_norm": 0.6733397841453552,
"learning_rate": 6.4427345399081e-05,
"loss": 0.0488,
"step": 970
},
{
"epoch": 4.414414414414415,
"grad_norm": 0.37581348419189453,
"learning_rate": 6.37126446694427e-05,
"loss": 0.0481,
"step": 980
},
{
"epoch": 4.45945945945946,
"grad_norm": 0.3915393352508545,
"learning_rate": 6.299490123529797e-05,
"loss": 0.0413,
"step": 990
},
{
"epoch": 4.504504504504505,
"grad_norm": 0.4132647216320038,
"learning_rate": 6.227427435703997e-05,
"loss": 0.0475,
"step": 1000
},
{
"epoch": 4.54954954954955,
"grad_norm": 0.4121103584766388,
"learning_rate": 6.155092393487051e-05,
"loss": 0.0493,
"step": 1010
},
{
"epoch": 4.594594594594595,
"grad_norm": 0.33546769618988037,
"learning_rate": 6.08250104733197e-05,
"loss": 0.041,
"step": 1020
},
{
"epoch": 4.63963963963964,
"grad_norm": 0.34651216864585876,
"learning_rate": 6.009669504563153e-05,
"loss": 0.043,
"step": 1030
},
{
"epoch": 4.684684684684685,
"grad_norm": 0.37287256121635437,
"learning_rate": 5.9366139258023326e-05,
"loss": 0.0413,
"step": 1040
},
{
"epoch": 4.72972972972973,
"grad_norm": 0.3134576082229614,
"learning_rate": 5.863350521382671e-05,
"loss": 0.0405,
"step": 1050
},
{
"epoch": 4.774774774774775,
"grad_norm": 0.33046990633010864,
"learning_rate": 5.7898955477518666e-05,
"loss": 0.0379,
"step": 1060
},
{
"epoch": 4.81981981981982,
"grad_norm": 0.5138404369354248,
"learning_rate": 5.716265303864978e-05,
"loss": 0.0353,
"step": 1070
},
{
"epoch": 4.864864864864865,
"grad_norm": 0.31907182931900024,
"learning_rate": 5.642476127567866e-05,
"loss": 0.0421,
"step": 1080
},
{
"epoch": 4.90990990990991,
"grad_norm": 0.2643144428730011,
"learning_rate": 5.5685443919719634e-05,
"loss": 0.0431,
"step": 1090
},
{
"epoch": 4.954954954954955,
"grad_norm": 0.3213581442832947,
"learning_rate": 5.4944865018212497e-05,
"loss": 0.0436,
"step": 1100
},
{
"epoch": 5.0,
"grad_norm": 0.2239149659872055,
"learning_rate": 5.4203188898521895e-05,
"loss": 0.0406,
"step": 1110
},
{
"epoch": 5.045045045045045,
"grad_norm": 0.3077867031097412,
"learning_rate": 5.346058013147469e-05,
"loss": 0.0417,
"step": 1120
},
{
"epoch": 5.09009009009009,
"grad_norm": 0.4085425138473511,
"learning_rate": 5.271720349484326e-05,
"loss": 0.0397,
"step": 1130
},
{
"epoch": 5.135135135135135,
"grad_norm": 0.4555751383304596,
"learning_rate": 5.1973223936782887e-05,
"loss": 0.0382,
"step": 1140
},
{
"epoch": 5.18018018018018,
"grad_norm": 0.4915667474269867,
"learning_rate": 5.122880653923134e-05,
"loss": 0.0422,
"step": 1150
},
{
"epoch": 5.225225225225225,
"grad_norm": 0.3977510333061218,
"learning_rate": 5.04841164812788e-05,
"loss": 0.0377,
"step": 1160
},
{
"epoch": 5.27027027027027,
"grad_norm": 0.42886292934417725,
"learning_rate": 4.973931900251611e-05,
"loss": 0.0411,
"step": 1170
},
{
"epoch": 5.315315315315315,
"grad_norm": 0.4493580460548401,
"learning_rate": 4.899457936636988e-05,
"loss": 0.0392,
"step": 1180
},
{
"epoch": 5.36036036036036,
"grad_norm": 0.2820551097393036,
"learning_rate": 4.82500628234319e-05,
"loss": 0.0356,
"step": 1190
},
{
"epoch": 5.405405405405405,
"grad_norm": 0.2753814160823822,
"learning_rate": 4.750593457479171e-05,
"loss": 0.0365,
"step": 1200
},
{
"epoch": 5.45045045045045,
"grad_norm": 0.3056580424308777,
"learning_rate": 4.676235973538013e-05,
"loss": 0.0385,
"step": 1210
},
{
"epoch": 5.495495495495495,
"grad_norm": 0.23721066117286682,
"learning_rate": 4.6019503297331736e-05,
"loss": 0.0401,
"step": 1220
},
{
"epoch": 5.54054054054054,
"grad_norm": 0.2635497748851776,
"learning_rate": 4.5277530093374734e-05,
"loss": 0.0358,
"step": 1230
},
{
"epoch": 5.585585585585585,
"grad_norm": 0.26532334089279175,
"learning_rate": 4.4536604760256123e-05,
"loss": 0.0406,
"step": 1240
},
{
"epoch": 5.63063063063063,
"grad_norm": 0.31581610441207886,
"learning_rate": 4.379689170221043e-05,
"loss": 0.0391,
"step": 1250
},
{
"epoch": 5.675675675675675,
"grad_norm": 0.31361261010169983,
"learning_rate": 4.3058555054479924e-05,
"loss": 0.0424,
"step": 1260
},
{
"epoch": 5.7207207207207205,
"grad_norm": 0.25601786375045776,
"learning_rate": 4.232175864689464e-05,
"loss": 0.0392,
"step": 1270
},
{
"epoch": 5.7657657657657655,
"grad_norm": 0.2870907485485077,
"learning_rate": 4.158666596752004e-05,
"loss": 0.0347,
"step": 1280
},
{
"epoch": 5.8108108108108105,
"grad_norm": 0.27915236353874207,
"learning_rate": 4.085344012638067e-05,
"loss": 0.0351,
"step": 1290
},
{
"epoch": 5.8558558558558556,
"grad_norm": 0.29854437708854675,
"learning_rate": 4.01222438192675e-05,
"loss": 0.0434,
"step": 1300
},
{
"epoch": 5.900900900900901,
"grad_norm": 0.34385260939598083,
"learning_rate": 3.939323929163738e-05,
"loss": 0.0352,
"step": 1310
},
{
"epoch": 5.945945945945946,
"grad_norm": 0.3390043377876282,
"learning_rate": 3.866658830261224e-05,
"loss": 0.0343,
"step": 1320
},
{
"epoch": 5.990990990990991,
"grad_norm": 0.3802526593208313,
"learning_rate": 3.794245208908639e-05,
"loss": 0.0403,
"step": 1330
},
{
"epoch": 6.036036036036036,
"grad_norm": 0.26724833250045776,
"learning_rate": 3.722099132994949e-05,
"loss": 0.037,
"step": 1340
},
{
"epoch": 6.081081081081081,
"grad_norm": 0.2882586717605591,
"learning_rate": 3.650236611043355e-05,
"loss": 0.0307,
"step": 1350
},
{
"epoch": 6.126126126126126,
"grad_norm": 0.24341371655464172,
"learning_rate": 3.578673588659145e-05,
"loss": 0.0318,
"step": 1360
},
{
"epoch": 6.171171171171171,
"grad_norm": 0.28189948201179504,
"learning_rate": 3.5074259449915284e-05,
"loss": 0.0306,
"step": 1370
},
{
"epoch": 6.216216216216216,
"grad_norm": 0.2897877097129822,
"learning_rate": 3.436509489210189e-05,
"loss": 0.0311,
"step": 1380
},
{
"epoch": 6.261261261261261,
"grad_norm": 0.18506336212158203,
"learning_rate": 3.365939956997399e-05,
"loss": 0.0305,
"step": 1390
},
{
"epoch": 6.306306306306306,
"grad_norm": 0.21617922186851501,
"learning_rate": 3.2957330070564085e-05,
"loss": 0.0332,
"step": 1400
},
{
"epoch": 6.351351351351352,
"grad_norm": 0.26593586802482605,
"learning_rate": 3.225904217636939e-05,
"loss": 0.0329,
"step": 1410
},
{
"epoch": 6.396396396396397,
"grad_norm": 0.24474456906318665,
"learning_rate": 3.1564690830785106e-05,
"loss": 0.0293,
"step": 1420
},
{
"epoch": 6.441441441441442,
"grad_norm": 0.21675890684127808,
"learning_rate": 3.0874430103724015e-05,
"loss": 0.033,
"step": 1430
},
{
"epoch": 6.486486486486487,
"grad_norm": 0.31768059730529785,
"learning_rate": 3.0188413157429828e-05,
"loss": 0.0303,
"step": 1440
},
{
"epoch": 6.531531531531532,
"grad_norm": 0.2665558159351349,
"learning_rate": 2.9506792212491986e-05,
"loss": 0.0326,
"step": 1450
},
{
"epoch": 6.576576576576577,
"grad_norm": 0.3153150975704193,
"learning_rate": 2.8829718514069265e-05,
"loss": 0.0322,
"step": 1460
},
{
"epoch": 6.621621621621622,
"grad_norm": 0.32425829768180847,
"learning_rate": 2.815734229833007e-05,
"loss": 0.032,
"step": 1470
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.2635970413684845,
"learning_rate": 2.748981275911633e-05,
"loss": 0.0307,
"step": 1480
},
{
"epoch": 6.711711711711712,
"grad_norm": 0.41540664434432983,
"learning_rate": 2.6827278014838953e-05,
"loss": 0.0336,
"step": 1490
},
{
"epoch": 6.756756756756757,
"grad_norm": 0.2897012233734131,
"learning_rate": 2.616988507561161e-05,
"loss": 0.0301,
"step": 1500
},
{
"epoch": 6.801801801801802,
"grad_norm": 0.2893607020378113,
"learning_rate": 2.5517779810630728e-05,
"loss": 0.0331,
"step": 1510
},
{
"epoch": 6.846846846846847,
"grad_norm": 0.5410453081130981,
"learning_rate": 2.4871106915808434e-05,
"loss": 0.0329,
"step": 1520
},
{
"epoch": 6.891891891891892,
"grad_norm": 0.28260162472724915,
"learning_rate": 2.4230009881666022e-05,
"loss": 0.0353,
"step": 1530
},
{
"epoch": 6.936936936936937,
"grad_norm": 0.178074911236763,
"learning_rate": 2.359463096149461e-05,
"loss": 0.0308,
"step": 1540
},
{
"epoch": 6.981981981981982,
"grad_norm": 0.19077138602733612,
"learning_rate": 2.2965111139790697e-05,
"loss": 0.0296,
"step": 1550
},
{
"epoch": 7.027027027027027,
"grad_norm": 0.29912513494491577,
"learning_rate": 2.234159010097287e-05,
"loss": 0.0318,
"step": 1560
},
{
"epoch": 7.072072072072072,
"grad_norm": 0.3348890542984009,
"learning_rate": 2.1724206198387294e-05,
"loss": 0.0306,
"step": 1570
},
{
"epoch": 7.117117117117117,
"grad_norm": 0.1999453604221344,
"learning_rate": 2.1113096423608358e-05,
"loss": 0.0311,
"step": 1580
},
{
"epoch": 7.162162162162162,
"grad_norm": 0.27420762181282043,
"learning_rate": 2.050839637604165e-05,
"loss": 0.029,
"step": 1590
},
{
"epoch": 7.207207207207207,
"grad_norm": 0.18965670466423035,
"learning_rate": 1.991024023283562e-05,
"loss": 0.0309,
"step": 1600
},
{
"epoch": 7.252252252252252,
"grad_norm": 0.23550300300121307,
"learning_rate": 1.9318760719109054e-05,
"loss": 0.031,
"step": 1610
},
{
"epoch": 7.297297297297297,
"grad_norm": 0.2695910334587097,
"learning_rate": 1.8734089078500565e-05,
"loss": 0.026,
"step": 1620
},
{
"epoch": 7.342342342342342,
"grad_norm": 0.2515803277492523,
"learning_rate": 1.815635504404701e-05,
"loss": 0.0276,
"step": 1630
},
{
"epoch": 7.387387387387387,
"grad_norm": 0.19094723463058472,
"learning_rate": 1.7585686809396822e-05,
"loss": 0.0294,
"step": 1640
},
{
"epoch": 7.4324324324324325,
"grad_norm": 0.26856082677841187,
"learning_rate": 1.702221100036515e-05,
"loss": 0.0268,
"step": 1650
},
{
"epoch": 7.4774774774774775,
"grad_norm": 0.2571709454059601,
"learning_rate": 1.6466052646836832e-05,
"loss": 0.0289,
"step": 1660
},
{
"epoch": 7.5225225225225225,
"grad_norm": 0.2961537837982178,
"learning_rate": 1.5917335155023367e-05,
"loss": 0.0301,
"step": 1670
},
{
"epoch": 7.5675675675675675,
"grad_norm": 0.18612734973430634,
"learning_rate": 1.5376180280080333e-05,
"loss": 0.027,
"step": 1680
},
{
"epoch": 7.612612612612613,
"grad_norm": 0.2395772635936737,
"learning_rate": 1.4842708099091047e-05,
"loss": 0.0269,
"step": 1690
},
{
"epoch": 7.657657657657658,
"grad_norm": 0.2048547863960266,
"learning_rate": 1.4317036984422671e-05,
"loss": 0.0257,
"step": 1700
},
{
"epoch": 7.702702702702703,
"grad_norm": 0.1941969245672226,
"learning_rate": 1.3799283577460431e-05,
"loss": 0.0254,
"step": 1710
},
{
"epoch": 7.747747747747748,
"grad_norm": 0.3675873279571533,
"learning_rate": 1.328956276272606e-05,
"loss": 0.026,
"step": 1720
},
{
"epoch": 7.792792792792793,
"grad_norm": 0.3239746391773224,
"learning_rate": 1.2787987642386007e-05,
"loss": 0.0245,
"step": 1730
},
{
"epoch": 7.837837837837838,
"grad_norm": 0.2818147838115692,
"learning_rate": 1.2294669511155193e-05,
"loss": 0.0348,
"step": 1740
},
{
"epoch": 7.882882882882883,
"grad_norm": 0.23256780207157135,
"learning_rate": 1.1809717831601697e-05,
"loss": 0.0299,
"step": 1750
},
{
"epoch": 7.927927927927928,
"grad_norm": 0.46159571409225464,
"learning_rate": 1.1333240209858159e-05,
"loss": 0.0299,
"step": 1760
},
{
"epoch": 7.972972972972973,
"grad_norm": 0.19868823885917664,
"learning_rate": 1.0865342371744924e-05,
"loss": 0.0269,
"step": 1770
},
{
"epoch": 8.018018018018019,
"grad_norm": 0.29607120156288147,
"learning_rate": 1.0406128139310533e-05,
"loss": 0.0266,
"step": 1780
},
{
"epoch": 8.063063063063064,
"grad_norm": 0.1842232197523117,
"learning_rate": 9.955699407794594e-06,
"loss": 0.0252,
"step": 1790
},
{
"epoch": 8.108108108108109,
"grad_norm": 0.1922302395105362,
"learning_rate": 9.514156123018258e-06,
"loss": 0.0243,
"step": 1800
},
{
"epoch": 8.153153153153154,
"grad_norm": 0.21968339383602142,
"learning_rate": 9.081596259207109e-06,
"loss": 0.0266,
"step": 1810
},
{
"epoch": 8.198198198198199,
"grad_norm": 0.24987168610095978,
"learning_rate": 8.658115797251676e-06,
"loss": 0.0255,
"step": 1820
},
{
"epoch": 8.243243243243244,
"grad_norm": 0.30064335465431213,
"learning_rate": 8.243808703410177e-06,
"loss": 0.0245,
"step": 1830
},
{
"epoch": 8.288288288288289,
"grad_norm": 0.26207128167152405,
"learning_rate": 7.838766908458339e-06,
"loss": 0.0238,
"step": 1840
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.23758377134799957,
"learning_rate": 7.443080287290782e-06,
"loss": 0.0265,
"step": 1850
},
{
"epoch": 8.378378378378379,
"grad_norm": 0.30203232169151306,
"learning_rate": 7.0568366389786975e-06,
"loss": 0.0283,
"step": 1860
},
{
"epoch": 8.423423423423424,
"grad_norm": 0.23233330249786377,
"learning_rate": 6.680121667288025e-06,
"loss": 0.0249,
"step": 1870
},
{
"epoch": 8.468468468468469,
"grad_norm": 0.3445972204208374,
"learning_rate": 6.3130189616626474e-06,
"loss": 0.0236,
"step": 1880
},
{
"epoch": 8.513513513513514,
"grad_norm": 0.31303051114082336,
"learning_rate": 5.955609978676652e-06,
"loss": 0.0243,
"step": 1890
},
{
"epoch": 8.558558558558559,
"grad_norm": 0.2929190397262573,
"learning_rate": 5.607974023959978e-06,
"loss": 0.0314,
"step": 1900
},
{
"epoch": 8.603603603603604,
"grad_norm": 0.4855507016181946,
"learning_rate": 5.270188234601142e-06,
"loss": 0.0267,
"step": 1910
},
{
"epoch": 8.64864864864865,
"grad_norm": 0.3386547863483429,
"learning_rate": 4.942327562031357e-06,
"loss": 0.0247,
"step": 1920
},
{
"epoch": 8.693693693693694,
"grad_norm": 0.18032731115818024,
"learning_rate": 4.624464755393459e-06,
"loss": 0.0241,
"step": 1930
},
{
"epoch": 8.73873873873874,
"grad_norm": 0.2196054905653,
"learning_rate": 4.316670345399626e-06,
"loss": 0.029,
"step": 1940
},
{
"epoch": 8.783783783783784,
"grad_norm": 0.1823672503232956,
"learning_rate": 4.019012628681234e-06,
"loss": 0.0228,
"step": 1950
},
{
"epoch": 8.82882882882883,
"grad_norm": 0.25816360116004944,
"learning_rate": 3.731557652634543e-06,
"loss": 0.0239,
"step": 1960
},
{
"epoch": 8.873873873873874,
"grad_norm": 0.21811510622501373,
"learning_rate": 3.454369200765356e-06,
"loss": 0.0244,
"step": 1970
},
{
"epoch": 8.91891891891892,
"grad_norm": 0.19826571643352509,
"learning_rate": 3.1875087785361137e-06,
"loss": 0.0236,
"step": 1980
},
{
"epoch": 8.963963963963964,
"grad_norm": 0.1923084408044815,
"learning_rate": 2.931035599718396e-06,
"loss": 0.0235,
"step": 1990
},
{
"epoch": 9.00900900900901,
"grad_norm": 0.16076907515525818,
"learning_rate": 2.6850065732539842e-06,
"loss": 0.0262,
"step": 2000
},
{
"epoch": 9.054054054054054,
"grad_norm": 0.2307383120059967,
"learning_rate": 2.449476290627273e-06,
"loss": 0.0245,
"step": 2010
},
{
"epoch": 9.0990990990991,
"grad_norm": 0.21985507011413574,
"learning_rate": 2.2244970137519583e-06,
"loss": 0.0266,
"step": 2020
},
{
"epoch": 9.144144144144144,
"grad_norm": 0.26605159044265747,
"learning_rate": 2.010118663374627e-06,
"loss": 0.0225,
"step": 2030
},
{
"epoch": 9.18918918918919,
"grad_norm": 0.3474940359592438,
"learning_rate": 1.8063888079978331e-06,
"loss": 0.0255,
"step": 2040
},
{
"epoch": 9.234234234234235,
"grad_norm": 0.1781226396560669,
"learning_rate": 1.6133526533250565e-06,
"loss": 0.0274,
"step": 2050
},
{
"epoch": 9.27927927927928,
"grad_norm": 0.2931009829044342,
"learning_rate": 1.4310530322300453e-06,
"loss": 0.0253,
"step": 2060
},
{
"epoch": 9.324324324324325,
"grad_norm": 0.21503065526485443,
"learning_rate": 1.2595303952525672e-06,
"loss": 0.0223,
"step": 2070
},
{
"epoch": 9.36936936936937,
"grad_norm": 0.23624898493289948,
"learning_rate": 1.0988228016228508e-06,
"loss": 0.0252,
"step": 2080
},
{
"epoch": 9.414414414414415,
"grad_norm": 0.2636685073375702,
"learning_rate": 9.48965910816596e-07,
"loss": 0.0302,
"step": 2090
},
{
"epoch": 9.45945945945946,
"grad_norm": 0.23757557570934296,
"learning_rate": 8.099929746424706e-07,
"loss": 0.0248,
"step": 2100
},
{
"epoch": 9.504504504504505,
"grad_norm": 0.3245598077774048,
"learning_rate": 6.819348298638839e-07,
"loss": 0.0219,
"step": 2110
},
{
"epoch": 9.54954954954955,
"grad_norm": 0.260775089263916,
"learning_rate": 5.648198913565494e-07,
"loss": 0.0198,
"step": 2120
},
{
"epoch": 9.594594594594595,
"grad_norm": 0.18040095269680023,
"learning_rate": 4.5867414580355593e-07,
"loss": 0.0263,
"step": 2130
},
{
"epoch": 9.63963963963964,
"grad_norm": 0.17358094453811646,
"learning_rate": 3.635211459291188e-07,
"loss": 0.0236,
"step": 2140
},
{
"epoch": 9.684684684684685,
"grad_norm": 0.1593407541513443,
"learning_rate": 2.793820052725049e-07,
"loss": 0.0233,
"step": 2150
},
{
"epoch": 9.72972972972973,
"grad_norm": 0.24278421700000763,
"learning_rate": 2.06275393503097e-07,
"loss": 0.0201,
"step": 2160
},
{
"epoch": 9.774774774774775,
"grad_norm": 0.1488327831029892,
"learning_rate": 1.4421753227780722e-07,
"loss": 0.0234,
"step": 2170
},
{
"epoch": 9.81981981981982,
"grad_norm": 0.2808060050010681,
"learning_rate": 9.32221916416176e-08,
"loss": 0.0248,
"step": 2180
},
{
"epoch": 9.864864864864865,
"grad_norm": 0.2229873239994049,
"learning_rate": 5.330068697215751e-08,
"loss": 0.0259,
"step": 2190
},
{
"epoch": 9.90990990990991,
"grad_norm": 0.2598866820335388,
"learning_rate": 2.4461876468934163e-08,
"loss": 0.0273,
"step": 2200
},
{
"epoch": 9.954954954954955,
"grad_norm": 0.29245129227638245,
"learning_rate": 6.712159187766131e-09,
"loss": 0.0245,
"step": 2210
},
{
"epoch": 10.0,
"grad_norm": 0.2520624101161957,
"learning_rate": 5.547362090241315e-11,
"loss": 0.0271,
"step": 2220
},
{
"epoch": 10.0,
"step": 2220,
"total_flos": 0.0,
"train_loss": 0.055370314842140354,
"train_runtime": 2269.5641,
"train_samples_per_second": 47.895,
"train_steps_per_second": 0.978
}
],
"logging_steps": 10,
"max_steps": 2220,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 49,
"trial_name": null,
"trial_params": null
}