{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 747,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004016064257028112,
"grad_norm": 6.783391952514648,
"learning_rate": 0.0002,
"loss": 17.0626,
"step": 1
},
{
"epoch": 0.008032128514056224,
"grad_norm": 6.832010269165039,
"learning_rate": 0.0001997322623828648,
"loss": 16.3736,
"step": 2
},
{
"epoch": 0.012048192771084338,
"grad_norm": 4.412657260894775,
"learning_rate": 0.0001994645247657296,
"loss": 13.7202,
"step": 3
},
{
"epoch": 0.01606425702811245,
"grad_norm": 4.6994500160217285,
"learning_rate": 0.0001991967871485944,
"loss": 12.1103,
"step": 4
},
{
"epoch": 0.020080321285140562,
"grad_norm": 5.078355312347412,
"learning_rate": 0.00019892904953145918,
"loss": 11.9491,
"step": 5
},
{
"epoch": 0.024096385542168676,
"grad_norm": 5.82587194442749,
"learning_rate": 0.00019866131191432397,
"loss": 10.24,
"step": 6
},
{
"epoch": 0.028112449799196786,
"grad_norm": 5.521396160125732,
"learning_rate": 0.00019839357429718877,
"loss": 9.7617,
"step": 7
},
{
"epoch": 0.0321285140562249,
"grad_norm": 5.55628776550293,
"learning_rate": 0.00019812583668005356,
"loss": 8.9588,
"step": 8
},
{
"epoch": 0.03614457831325301,
"grad_norm": 4.77673864364624,
"learning_rate": 0.00019785809906291835,
"loss": 7.413,
"step": 9
},
{
"epoch": 0.040160642570281124,
"grad_norm": 3.045475482940674,
"learning_rate": 0.00019759036144578314,
"loss": 8.4555,
"step": 10
},
{
"epoch": 0.04417670682730924,
"grad_norm": 2.4188013076782227,
"learning_rate": 0.0001973226238286479,
"loss": 6.3816,
"step": 11
},
{
"epoch": 0.04819277108433735,
"grad_norm": 2.483142852783203,
"learning_rate": 0.00019705488621151273,
"loss": 6.0486,
"step": 12
},
{
"epoch": 0.05220883534136546,
"grad_norm": 2.7488200664520264,
"learning_rate": 0.00019678714859437752,
"loss": 6.0559,
"step": 13
},
{
"epoch": 0.05622489959839357,
"grad_norm": 3.509127140045166,
"learning_rate": 0.00019651941097724232,
"loss": 6.5013,
"step": 14
},
{
"epoch": 0.060240963855421686,
"grad_norm": 4.097210884094238,
"learning_rate": 0.0001962516733601071,
"loss": 6.6959,
"step": 15
},
{
"epoch": 0.0642570281124498,
"grad_norm": 5.211580753326416,
"learning_rate": 0.0001959839357429719,
"loss": 7.4451,
"step": 16
},
{
"epoch": 0.06827309236947791,
"grad_norm": 4.360202312469482,
"learning_rate": 0.00019571619812583667,
"loss": 7.5475,
"step": 17
},
{
"epoch": 0.07228915662650602,
"grad_norm": 4.646812915802002,
"learning_rate": 0.0001954484605087015,
"loss": 5.9117,
"step": 18
},
{
"epoch": 0.07630522088353414,
"grad_norm": 4.076641082763672,
"learning_rate": 0.00019518072289156628,
"loss": 6.5152,
"step": 19
},
{
"epoch": 0.08032128514056225,
"grad_norm": 4.571013450622559,
"learning_rate": 0.00019491298527443107,
"loss": 7.7192,
"step": 20
},
{
"epoch": 0.08433734939759036,
"grad_norm": 3.786604881286621,
"learning_rate": 0.00019464524765729587,
"loss": 6.0262,
"step": 21
},
{
"epoch": 0.08835341365461848,
"grad_norm": 3.7632923126220703,
"learning_rate": 0.00019437751004016066,
"loss": 5.515,
"step": 22
},
{
"epoch": 0.09236947791164658,
"grad_norm": 3.142625093460083,
"learning_rate": 0.00019410977242302542,
"loss": 5.5428,
"step": 23
},
{
"epoch": 0.0963855421686747,
"grad_norm": 4.195131778717041,
"learning_rate": 0.00019384203480589022,
"loss": 5.0073,
"step": 24
},
{
"epoch": 0.10040160642570281,
"grad_norm": 7.452038764953613,
"learning_rate": 0.00019357429718875504,
"loss": 5.6765,
"step": 25
},
{
"epoch": 0.10441767068273092,
"grad_norm": 9.708063125610352,
"learning_rate": 0.00019330655957161983,
"loss": 5.6149,
"step": 26
},
{
"epoch": 0.10843373493975904,
"grad_norm": 19.072011947631836,
"learning_rate": 0.00019303882195448462,
"loss": 5.4365,
"step": 27
},
{
"epoch": 0.11244979919678715,
"grad_norm": 6.726373195648193,
"learning_rate": 0.00019277108433734942,
"loss": 4.858,
"step": 28
},
{
"epoch": 0.11646586345381527,
"grad_norm": 3.187056064605713,
"learning_rate": 0.0001925033467202142,
"loss": 5.3406,
"step": 29
},
{
"epoch": 0.12048192771084337,
"grad_norm": 3.364069700241089,
"learning_rate": 0.00019223560910307897,
"loss": 5.5143,
"step": 30
},
{
"epoch": 0.12449799196787148,
"grad_norm": 2.4620518684387207,
"learning_rate": 0.00019196787148594377,
"loss": 4.638,
"step": 31
},
{
"epoch": 0.1285140562248996,
"grad_norm": 3.9363696575164795,
"learning_rate": 0.0001917001338688086,
"loss": 4.6009,
"step": 32
},
{
"epoch": 0.13253012048192772,
"grad_norm": 3.230189561843872,
"learning_rate": 0.00019143239625167338,
"loss": 4.7928,
"step": 33
},
{
"epoch": 0.13654618473895583,
"grad_norm": 2.873898983001709,
"learning_rate": 0.00019116465863453817,
"loss": 3.7444,
"step": 34
},
{
"epoch": 0.14056224899598393,
"grad_norm": 3.2136387825012207,
"learning_rate": 0.00019089692101740297,
"loss": 4.452,
"step": 35
},
{
"epoch": 0.14457831325301204,
"grad_norm": 2.8411664962768555,
"learning_rate": 0.00019062918340026773,
"loss": 4.483,
"step": 36
},
{
"epoch": 0.14859437751004015,
"grad_norm": 2.68854022026062,
"learning_rate": 0.00019036144578313252,
"loss": 3.92,
"step": 37
},
{
"epoch": 0.15261044176706828,
"grad_norm": 3.324504852294922,
"learning_rate": 0.00019009370816599734,
"loss": 4.4238,
"step": 38
},
{
"epoch": 0.1566265060240964,
"grad_norm": 3.0757510662078857,
"learning_rate": 0.00018982597054886214,
"loss": 4.0354,
"step": 39
},
{
"epoch": 0.1606425702811245,
"grad_norm": 3.1478559970855713,
"learning_rate": 0.00018955823293172693,
"loss": 4.7587,
"step": 40
},
{
"epoch": 0.1646586345381526,
"grad_norm": 2.923387050628662,
"learning_rate": 0.00018929049531459172,
"loss": 4.1713,
"step": 41
},
{
"epoch": 0.1686746987951807,
"grad_norm": 3.3262710571289062,
"learning_rate": 0.0001890227576974565,
"loss": 5.7246,
"step": 42
},
{
"epoch": 0.17269076305220885,
"grad_norm": 2.9940414428710938,
"learning_rate": 0.00018875502008032128,
"loss": 3.9502,
"step": 43
},
{
"epoch": 0.17670682730923695,
"grad_norm": 2.4215221405029297,
"learning_rate": 0.00018848728246318607,
"loss": 3.3469,
"step": 44
},
{
"epoch": 0.18072289156626506,
"grad_norm": 4.08881139755249,
"learning_rate": 0.0001882195448460509,
"loss": 3.6203,
"step": 45
},
{
"epoch": 0.18473895582329317,
"grad_norm": 2.550448417663574,
"learning_rate": 0.00018795180722891569,
"loss": 3.9986,
"step": 46
},
{
"epoch": 0.18875502008032127,
"grad_norm": 2.3286774158477783,
"learning_rate": 0.00018768406961178048,
"loss": 3.3749,
"step": 47
},
{
"epoch": 0.1927710843373494,
"grad_norm": 2.724431276321411,
"learning_rate": 0.00018741633199464524,
"loss": 3.4734,
"step": 48
},
{
"epoch": 0.19678714859437751,
"grad_norm": 2.961087226867676,
"learning_rate": 0.00018714859437751004,
"loss": 4.242,
"step": 49
},
{
"epoch": 0.20080321285140562,
"grad_norm": 2.4245645999908447,
"learning_rate": 0.00018688085676037483,
"loss": 3.7956,
"step": 50
},
{
"epoch": 0.20481927710843373,
"grad_norm": 2.141226053237915,
"learning_rate": 0.00018661311914323962,
"loss": 3.0041,
"step": 51
},
{
"epoch": 0.20883534136546184,
"grad_norm": 2.7774155139923096,
"learning_rate": 0.00018634538152610444,
"loss": 3.5062,
"step": 52
},
{
"epoch": 0.21285140562248997,
"grad_norm": 2.6332597732543945,
"learning_rate": 0.00018607764390896924,
"loss": 3.9305,
"step": 53
},
{
"epoch": 0.21686746987951808,
"grad_norm": 3.4417197704315186,
"learning_rate": 0.000185809906291834,
"loss": 5.1481,
"step": 54
},
{
"epoch": 0.22088353413654618,
"grad_norm": 2.576704978942871,
"learning_rate": 0.0001855421686746988,
"loss": 3.6137,
"step": 55
},
{
"epoch": 0.2248995983935743,
"grad_norm": 2.816452980041504,
"learning_rate": 0.0001852744310575636,
"loss": 3.5015,
"step": 56
},
{
"epoch": 0.2289156626506024,
"grad_norm": 3.5300023555755615,
"learning_rate": 0.00018500669344042838,
"loss": 4.7758,
"step": 57
},
{
"epoch": 0.23293172690763053,
"grad_norm": 2.594787120819092,
"learning_rate": 0.0001847389558232932,
"loss": 4.0104,
"step": 58
},
{
"epoch": 0.23694779116465864,
"grad_norm": 3.472842216491699,
"learning_rate": 0.000184471218206158,
"loss": 4.2051,
"step": 59
},
{
"epoch": 0.24096385542168675,
"grad_norm": 2.195838212966919,
"learning_rate": 0.00018420348058902276,
"loss": 3.4561,
"step": 60
},
{
"epoch": 0.24497991967871485,
"grad_norm": 2.6737020015716553,
"learning_rate": 0.00018393574297188755,
"loss": 5.4281,
"step": 61
},
{
"epoch": 0.24899598393574296,
"grad_norm": 3.128307342529297,
"learning_rate": 0.00018366800535475234,
"loss": 4.835,
"step": 62
},
{
"epoch": 0.25301204819277107,
"grad_norm": 2.8915627002716064,
"learning_rate": 0.00018340026773761714,
"loss": 5.6513,
"step": 63
},
{
"epoch": 0.2570281124497992,
"grad_norm": 2.4325616359710693,
"learning_rate": 0.00018313253012048193,
"loss": 3.8769,
"step": 64
},
{
"epoch": 0.26104417670682734,
"grad_norm": 2.717306613922119,
"learning_rate": 0.00018286479250334675,
"loss": 4.7258,
"step": 65
},
{
"epoch": 0.26506024096385544,
"grad_norm": 2.6178746223449707,
"learning_rate": 0.00018259705488621152,
"loss": 4.0424,
"step": 66
},
{
"epoch": 0.26907630522088355,
"grad_norm": 2.382551431655884,
"learning_rate": 0.0001823293172690763,
"loss": 3.547,
"step": 67
},
{
"epoch": 0.27309236947791166,
"grad_norm": 2.546783685684204,
"learning_rate": 0.0001820615796519411,
"loss": 4.2495,
"step": 68
},
{
"epoch": 0.27710843373493976,
"grad_norm": 2.4738221168518066,
"learning_rate": 0.0001817938420348059,
"loss": 3.69,
"step": 69
},
{
"epoch": 0.28112449799196787,
"grad_norm": 2.2191786766052246,
"learning_rate": 0.0001815261044176707,
"loss": 3.1576,
"step": 70
},
{
"epoch": 0.285140562248996,
"grad_norm": 2.4891932010650635,
"learning_rate": 0.00018125836680053548,
"loss": 3.7767,
"step": 71
},
{
"epoch": 0.2891566265060241,
"grad_norm": 2.0602684020996094,
"learning_rate": 0.00018099062918340027,
"loss": 3.1497,
"step": 72
},
{
"epoch": 0.2931726907630522,
"grad_norm": 2.435455560684204,
"learning_rate": 0.00018072289156626507,
"loss": 4.3061,
"step": 73
},
{
"epoch": 0.2971887550200803,
"grad_norm": 2.7304036617279053,
"learning_rate": 0.00018045515394912986,
"loss": 3.6995,
"step": 74
},
{
"epoch": 0.30120481927710846,
"grad_norm": 2.6375226974487305,
"learning_rate": 0.00018018741633199465,
"loss": 3.3922,
"step": 75
},
{
"epoch": 0.30522088353413657,
"grad_norm": 2.097759246826172,
"learning_rate": 0.00017991967871485944,
"loss": 3.1887,
"step": 76
},
{
"epoch": 0.3092369477911647,
"grad_norm": 2.600724458694458,
"learning_rate": 0.00017965194109772424,
"loss": 3.8532,
"step": 77
},
{
"epoch": 0.3132530120481928,
"grad_norm": 3.0356369018554688,
"learning_rate": 0.00017938420348058903,
"loss": 4.6221,
"step": 78
},
{
"epoch": 0.3172690763052209,
"grad_norm": 2.1509416103363037,
"learning_rate": 0.00017911646586345382,
"loss": 3.5473,
"step": 79
},
{
"epoch": 0.321285140562249,
"grad_norm": 2.7542128562927246,
"learning_rate": 0.00017884872824631862,
"loss": 4.3206,
"step": 80
},
{
"epoch": 0.3253012048192771,
"grad_norm": 2.7480881214141846,
"learning_rate": 0.0001785809906291834,
"loss": 3.4596,
"step": 81
},
{
"epoch": 0.3293172690763052,
"grad_norm": 2.8787624835968018,
"learning_rate": 0.0001783132530120482,
"loss": 4.0409,
"step": 82
},
{
"epoch": 0.3333333333333333,
"grad_norm": 2.234320878982544,
"learning_rate": 0.000178045515394913,
"loss": 3.6684,
"step": 83
},
{
"epoch": 0.3373493975903614,
"grad_norm": 2.174452781677246,
"learning_rate": 0.00017777777777777779,
"loss": 3.8964,
"step": 84
},
{
"epoch": 0.3413654618473896,
"grad_norm": 2.25730299949646,
"learning_rate": 0.00017751004016064258,
"loss": 3.3793,
"step": 85
},
{
"epoch": 0.3453815261044177,
"grad_norm": 2.3120176792144775,
"learning_rate": 0.00017724230254350737,
"loss": 3.9183,
"step": 86
},
{
"epoch": 0.3493975903614458,
"grad_norm": 2.696288824081421,
"learning_rate": 0.00017697456492637216,
"loss": 4.1063,
"step": 87
},
{
"epoch": 0.3534136546184739,
"grad_norm": 3.9386634826660156,
"learning_rate": 0.00017670682730923696,
"loss": 4.599,
"step": 88
},
{
"epoch": 0.357429718875502,
"grad_norm": 2.7136473655700684,
"learning_rate": 0.00017643908969210175,
"loss": 4.1535,
"step": 89
},
{
"epoch": 0.3614457831325301,
"grad_norm": 2.4276645183563232,
"learning_rate": 0.00017617135207496654,
"loss": 4.4834,
"step": 90
},
{
"epoch": 0.3654618473895582,
"grad_norm": 2.6002511978149414,
"learning_rate": 0.00017590361445783134,
"loss": 4.0748,
"step": 91
},
{
"epoch": 0.36947791164658633,
"grad_norm": 2.682366132736206,
"learning_rate": 0.00017563587684069613,
"loss": 4.4142,
"step": 92
},
{
"epoch": 0.37349397590361444,
"grad_norm": 2.108722686767578,
"learning_rate": 0.00017536813922356092,
"loss": 4.4304,
"step": 93
},
{
"epoch": 0.37751004016064255,
"grad_norm": 2.0732803344726562,
"learning_rate": 0.00017510040160642571,
"loss": 3.2521,
"step": 94
},
{
"epoch": 0.3815261044176707,
"grad_norm": 2.3038790225982666,
"learning_rate": 0.0001748326639892905,
"loss": 4.3167,
"step": 95
},
{
"epoch": 0.3855421686746988,
"grad_norm": 2.623572587966919,
"learning_rate": 0.0001745649263721553,
"loss": 5.3465,
"step": 96
},
{
"epoch": 0.3895582329317269,
"grad_norm": 2.4543046951293945,
"learning_rate": 0.0001742971887550201,
"loss": 3.4479,
"step": 97
},
{
"epoch": 0.39357429718875503,
"grad_norm": 2.291369915008545,
"learning_rate": 0.00017402945113788489,
"loss": 4.0893,
"step": 98
},
{
"epoch": 0.39759036144578314,
"grad_norm": 2.4371914863586426,
"learning_rate": 0.00017376171352074968,
"loss": 3.7132,
"step": 99
},
{
"epoch": 0.40160642570281124,
"grad_norm": 2.1401989459991455,
"learning_rate": 0.00017349397590361447,
"loss": 2.9892,
"step": 100
},
{
"epoch": 0.40562248995983935,
"grad_norm": 2.1574857234954834,
"learning_rate": 0.00017322623828647926,
"loss": 3.3145,
"step": 101
},
{
"epoch": 0.40963855421686746,
"grad_norm": 2.7298076152801514,
"learning_rate": 0.00017295850066934406,
"loss": 4.2365,
"step": 102
},
{
"epoch": 0.41365461847389556,
"grad_norm": 2.5634846687316895,
"learning_rate": 0.00017269076305220885,
"loss": 3.4466,
"step": 103
},
{
"epoch": 0.41767068273092367,
"grad_norm": 2.573195695877075,
"learning_rate": 0.00017242302543507362,
"loss": 3.3283,
"step": 104
},
{
"epoch": 0.42168674698795183,
"grad_norm": 2.205293655395508,
"learning_rate": 0.00017215528781793844,
"loss": 3.7288,
"step": 105
},
{
"epoch": 0.42570281124497994,
"grad_norm": 3.3177073001861572,
"learning_rate": 0.00017188755020080323,
"loss": 3.9341,
"step": 106
},
{
"epoch": 0.42971887550200805,
"grad_norm": 2.601710557937622,
"learning_rate": 0.00017161981258366802,
"loss": 4.3724,
"step": 107
},
{
"epoch": 0.43373493975903615,
"grad_norm": 2.490556478500366,
"learning_rate": 0.00017135207496653281,
"loss": 3.0784,
"step": 108
},
{
"epoch": 0.43775100401606426,
"grad_norm": 2.7771122455596924,
"learning_rate": 0.0001710843373493976,
"loss": 3.7125,
"step": 109
},
{
"epoch": 0.44176706827309237,
"grad_norm": 2.9865031242370605,
"learning_rate": 0.00017081659973226237,
"loss": 4.9747,
"step": 110
},
{
"epoch": 0.4457831325301205,
"grad_norm": 3.2922353744506836,
"learning_rate": 0.00017054886211512717,
"loss": 4.229,
"step": 111
},
{
"epoch": 0.4497991967871486,
"grad_norm": 2.2360899448394775,
"learning_rate": 0.00017028112449799199,
"loss": 3.1859,
"step": 112
},
{
"epoch": 0.4538152610441767,
"grad_norm": 2.4282941818237305,
"learning_rate": 0.00017001338688085678,
"loss": 4.4577,
"step": 113
},
{
"epoch": 0.4578313253012048,
"grad_norm": 2.2384181022644043,
"learning_rate": 0.00016974564926372157,
"loss": 3.435,
"step": 114
},
{
"epoch": 0.46184738955823296,
"grad_norm": 2.586678981781006,
"learning_rate": 0.00016947791164658636,
"loss": 3.7974,
"step": 115
},
{
"epoch": 0.46586345381526106,
"grad_norm": 2.2473366260528564,
"learning_rate": 0.00016921017402945113,
"loss": 3.2193,
"step": 116
},
{
"epoch": 0.46987951807228917,
"grad_norm": 2.2137515544891357,
"learning_rate": 0.00016894243641231592,
"loss": 3.2774,
"step": 117
},
{
"epoch": 0.4738955823293173,
"grad_norm": 2.6827173233032227,
"learning_rate": 0.00016867469879518074,
"loss": 3.843,
"step": 118
},
{
"epoch": 0.4779116465863454,
"grad_norm": 2.499166250228882,
"learning_rate": 0.00016840696117804553,
"loss": 3.1818,
"step": 119
},
{
"epoch": 0.4819277108433735,
"grad_norm": 2.609964609146118,
"learning_rate": 0.00016813922356091033,
"loss": 3.6292,
"step": 120
},
{
"epoch": 0.4859437751004016,
"grad_norm": 2.697786808013916,
"learning_rate": 0.00016787148594377512,
"loss": 3.7501,
"step": 121
},
{
"epoch": 0.4899598393574297,
"grad_norm": 2.834494113922119,
"learning_rate": 0.00016760374832663989,
"loss": 3.9265,
"step": 122
},
{
"epoch": 0.4939759036144578,
"grad_norm": 2.3431777954101562,
"learning_rate": 0.00016733601070950468,
"loss": 3.7916,
"step": 123
},
{
"epoch": 0.4979919678714859,
"grad_norm": 2.434953212738037,
"learning_rate": 0.00016706827309236947,
"loss": 3.4279,
"step": 124
},
{
"epoch": 0.5020080321285141,
"grad_norm": 2.3629250526428223,
"learning_rate": 0.0001668005354752343,
"loss": 3.4382,
"step": 125
},
{
"epoch": 0.5060240963855421,
"grad_norm": 2.7543423175811768,
"learning_rate": 0.00016653279785809908,
"loss": 4.8146,
"step": 126
},
{
"epoch": 0.5100401606425703,
"grad_norm": 3.149775981903076,
"learning_rate": 0.00016626506024096388,
"loss": 5.365,
"step": 127
},
{
"epoch": 0.5140562248995983,
"grad_norm": 2.640326499938965,
"learning_rate": 0.00016599732262382864,
"loss": 4.2036,
"step": 128
},
{
"epoch": 0.5180722891566265,
"grad_norm": 2.6297357082366943,
"learning_rate": 0.00016572958500669344,
"loss": 3.7331,
"step": 129
},
{
"epoch": 0.5220883534136547,
"grad_norm": 2.9165263175964355,
"learning_rate": 0.00016546184738955823,
"loss": 4.2224,
"step": 130
},
{
"epoch": 0.5261044176706827,
"grad_norm": 2.003908634185791,
"learning_rate": 0.00016519410977242302,
"loss": 3.5818,
"step": 131
},
{
"epoch": 0.5301204819277109,
"grad_norm": 2.3137078285217285,
"learning_rate": 0.00016492637215528784,
"loss": 3.4726,
"step": 132
},
{
"epoch": 0.5341365461847389,
"grad_norm": 2.69950795173645,
"learning_rate": 0.00016465863453815263,
"loss": 4.0059,
"step": 133
},
{
"epoch": 0.5381526104417671,
"grad_norm": 2.1858394145965576,
"learning_rate": 0.0001643908969210174,
"loss": 3.6957,
"step": 134
},
{
"epoch": 0.5421686746987951,
"grad_norm": 2.423802137374878,
"learning_rate": 0.0001641231593038822,
"loss": 4.1535,
"step": 135
},
{
"epoch": 0.5461847389558233,
"grad_norm": 2.244253158569336,
"learning_rate": 0.00016385542168674699,
"loss": 3.3276,
"step": 136
},
{
"epoch": 0.5502008032128514,
"grad_norm": 2.2932465076446533,
"learning_rate": 0.00016358768406961178,
"loss": 3.6498,
"step": 137
},
{
"epoch": 0.5542168674698795,
"grad_norm": 2.0782933235168457,
"learning_rate": 0.0001633199464524766,
"loss": 4.007,
"step": 138
},
{
"epoch": 0.5582329317269076,
"grad_norm": 2.778797149658203,
"learning_rate": 0.0001630522088353414,
"loss": 3.8436,
"step": 139
},
{
"epoch": 0.5622489959839357,
"grad_norm": 2.7823002338409424,
"learning_rate": 0.00016278447121820616,
"loss": 5.5985,
"step": 140
},
{
"epoch": 0.5662650602409639,
"grad_norm": 3.124753475189209,
"learning_rate": 0.00016251673360107095,
"loss": 3.8402,
"step": 141
},
{
"epoch": 0.570281124497992,
"grad_norm": 2.999889612197876,
"learning_rate": 0.00016224899598393574,
"loss": 4.8463,
"step": 142
},
{
"epoch": 0.5742971887550201,
"grad_norm": 2.2176406383514404,
"learning_rate": 0.00016198125836680054,
"loss": 3.6488,
"step": 143
},
{
"epoch": 0.5783132530120482,
"grad_norm": 2.334336757659912,
"learning_rate": 0.00016171352074966533,
"loss": 3.4351,
"step": 144
},
{
"epoch": 0.5823293172690763,
"grad_norm": 2.1625120639801025,
"learning_rate": 0.00016144578313253015,
"loss": 3.4423,
"step": 145
},
{
"epoch": 0.5863453815261044,
"grad_norm": 2.3950042724609375,
"learning_rate": 0.00016117804551539491,
"loss": 3.4302,
"step": 146
},
{
"epoch": 0.5903614457831325,
"grad_norm": 1.968996524810791,
"learning_rate": 0.0001609103078982597,
"loss": 3.3924,
"step": 147
},
{
"epoch": 0.5943775100401606,
"grad_norm": 2.259298801422119,
"learning_rate": 0.0001606425702811245,
"loss": 3.4544,
"step": 148
},
{
"epoch": 0.5983935742971888,
"grad_norm": 2.5227410793304443,
"learning_rate": 0.0001603748326639893,
"loss": 3.6276,
"step": 149
},
{
"epoch": 0.6024096385542169,
"grad_norm": 2.4112424850463867,
"learning_rate": 0.00016010709504685409,
"loss": 3.8806,
"step": 150
},
{
"epoch": 0.606425702811245,
"grad_norm": 2.5478017330169678,
"learning_rate": 0.00015983935742971888,
"loss": 4.1461,
"step": 151
},
{
"epoch": 0.6104417670682731,
"grad_norm": 2.832744836807251,
"learning_rate": 0.00015957161981258367,
"loss": 5.0162,
"step": 152
},
{
"epoch": 0.6144578313253012,
"grad_norm": 2.7249608039855957,
"learning_rate": 0.00015930388219544846,
"loss": 3.2521,
"step": 153
},
{
"epoch": 0.6184738955823293,
"grad_norm": 2.579235315322876,
"learning_rate": 0.00015903614457831326,
"loss": 4.0444,
"step": 154
},
{
"epoch": 0.6224899598393574,
"grad_norm": 2.719031572341919,
"learning_rate": 0.00015876840696117805,
"loss": 3.8091,
"step": 155
},
{
"epoch": 0.6265060240963856,
"grad_norm": 2.9060187339782715,
"learning_rate": 0.00015850066934404284,
"loss": 3.574,
"step": 156
},
{
"epoch": 0.6305220883534136,
"grad_norm": 2.3890836238861084,
"learning_rate": 0.00015823293172690763,
"loss": 3.0126,
"step": 157
},
{
"epoch": 0.6345381526104418,
"grad_norm": 2.4875965118408203,
"learning_rate": 0.00015796519410977243,
"loss": 3.8722,
"step": 158
},
{
"epoch": 0.6385542168674698,
"grad_norm": 2.452133893966675,
"learning_rate": 0.00015769745649263722,
"loss": 3.1996,
"step": 159
},
{
"epoch": 0.642570281124498,
"grad_norm": 2.644927740097046,
"learning_rate": 0.000157429718875502,
"loss": 4.5955,
"step": 160
},
{
"epoch": 0.6465863453815262,
"grad_norm": 2.4523508548736572,
"learning_rate": 0.0001571619812583668,
"loss": 3.3654,
"step": 161
},
{
"epoch": 0.6506024096385542,
"grad_norm": 2.5598349571228027,
"learning_rate": 0.0001568942436412316,
"loss": 3.0078,
"step": 162
},
{
"epoch": 0.6546184738955824,
"grad_norm": 3.0518641471862793,
"learning_rate": 0.0001566265060240964,
"loss": 4.5464,
"step": 163
},
{
"epoch": 0.6586345381526104,
"grad_norm": 2.8101203441619873,
"learning_rate": 0.00015635876840696118,
"loss": 3.4404,
"step": 164
},
{
"epoch": 0.6626506024096386,
"grad_norm": 2.7174525260925293,
"learning_rate": 0.00015609103078982598,
"loss": 3.6615,
"step": 165
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.620638608932495,
"learning_rate": 0.00015582329317269077,
"loss": 3.448,
"step": 166
},
{
"epoch": 0.6706827309236948,
"grad_norm": 2.9395246505737305,
"learning_rate": 0.00015555555555555556,
"loss": 3.6454,
"step": 167
},
{
"epoch": 0.6746987951807228,
"grad_norm": 3.050710916519165,
"learning_rate": 0.00015528781793842036,
"loss": 4.0765,
"step": 168
},
{
"epoch": 0.678714859437751,
"grad_norm": 2.2552433013916016,
"learning_rate": 0.00015502008032128515,
"loss": 3.1558,
"step": 169
},
{
"epoch": 0.6827309236947792,
"grad_norm": 2.1489574909210205,
"learning_rate": 0.00015475234270414994,
"loss": 4.2047,
"step": 170
},
{
"epoch": 0.6867469879518072,
"grad_norm": 2.172776937484741,
"learning_rate": 0.00015448460508701473,
"loss": 3.4285,
"step": 171
},
{
"epoch": 0.6907630522088354,
"grad_norm": 2.1401731967926025,
"learning_rate": 0.00015421686746987953,
"loss": 3.2497,
"step": 172
},
{
"epoch": 0.6947791164658634,
"grad_norm": 2.7701947689056396,
"learning_rate": 0.00015394912985274432,
"loss": 3.9331,
"step": 173
},
{
"epoch": 0.6987951807228916,
"grad_norm": 2.319415330886841,
"learning_rate": 0.0001536813922356091,
"loss": 3.176,
"step": 174
},
{
"epoch": 0.7028112449799196,
"grad_norm": 2.428131341934204,
"learning_rate": 0.0001534136546184739,
"loss": 3.1192,
"step": 175
},
{
"epoch": 0.7068273092369478,
"grad_norm": 2.135892868041992,
"learning_rate": 0.0001531459170013387,
"loss": 3.0222,
"step": 176
},
{
"epoch": 0.7108433734939759,
"grad_norm": 2.7550647258758545,
"learning_rate": 0.0001528781793842035,
"loss": 4.6775,
"step": 177
},
{
"epoch": 0.714859437751004,
"grad_norm": 2.2021191120147705,
"learning_rate": 0.00015261044176706828,
"loss": 2.7476,
"step": 178
},
{
"epoch": 0.7188755020080321,
"grad_norm": 2.686431407928467,
"learning_rate": 0.00015234270414993308,
"loss": 4.1621,
"step": 179
},
{
"epoch": 0.7228915662650602,
"grad_norm": 2.827143669128418,
"learning_rate": 0.00015207496653279787,
"loss": 4.4613,
"step": 180
},
{
"epoch": 0.7269076305220884,
"grad_norm": 3.090308904647827,
"learning_rate": 0.00015180722891566266,
"loss": 4.6863,
"step": 181
},
{
"epoch": 0.7309236947791165,
"grad_norm": 2.492013454437256,
"learning_rate": 0.00015153949129852746,
"loss": 3.2319,
"step": 182
},
{
"epoch": 0.7349397590361446,
"grad_norm": 2.6304264068603516,
"learning_rate": 0.00015127175368139225,
"loss": 3.3099,
"step": 183
},
{
"epoch": 0.7389558232931727,
"grad_norm": 2.270024299621582,
"learning_rate": 0.00015100401606425701,
"loss": 3.8332,
"step": 184
},
{
"epoch": 0.7429718875502008,
"grad_norm": 2.2107675075531006,
"learning_rate": 0.00015073627844712183,
"loss": 3.4966,
"step": 185
},
{
"epoch": 0.7469879518072289,
"grad_norm": 1.804654598236084,
"learning_rate": 0.00015046854082998663,
"loss": 2.7441,
"step": 186
},
{
"epoch": 0.751004016064257,
"grad_norm": 2.8919899463653564,
"learning_rate": 0.00015020080321285142,
"loss": 3.7274,
"step": 187
},
{
"epoch": 0.7550200803212851,
"grad_norm": 2.4757237434387207,
"learning_rate": 0.0001499330655957162,
"loss": 3.6959,
"step": 188
},
{
"epoch": 0.7590361445783133,
"grad_norm": 2.037745952606201,
"learning_rate": 0.000149665327978581,
"loss": 3.0673,
"step": 189
},
{
"epoch": 0.7630522088353414,
"grad_norm": 2.479806423187256,
"learning_rate": 0.00014939759036144577,
"loss": 3.5497,
"step": 190
},
{
"epoch": 0.7670682730923695,
"grad_norm": 2.532616138458252,
"learning_rate": 0.00014912985274431056,
"loss": 4.4538,
"step": 191
},
{
"epoch": 0.7710843373493976,
"grad_norm": 2.2965128421783447,
"learning_rate": 0.00014886211512717538,
"loss": 3.8924,
"step": 192
},
{
"epoch": 0.7751004016064257,
"grad_norm": 2.569096088409424,
"learning_rate": 0.00014859437751004018,
"loss": 4.3112,
"step": 193
},
{
"epoch": 0.7791164658634538,
"grad_norm": 2.3299782276153564,
"learning_rate": 0.00014832663989290497,
"loss": 3.4171,
"step": 194
},
{
"epoch": 0.7831325301204819,
"grad_norm": 2.4750306606292725,
"learning_rate": 0.00014805890227576976,
"loss": 4.2418,
"step": 195
},
{
"epoch": 0.7871485943775101,
"grad_norm": 2.34830904006958,
"learning_rate": 0.00014779116465863453,
"loss": 4.7654,
"step": 196
},
{
"epoch": 0.7911646586345381,
"grad_norm": 2.3084421157836914,
"learning_rate": 0.00014752342704149932,
"loss": 3.5955,
"step": 197
},
{
"epoch": 0.7951807228915663,
"grad_norm": 2.088836431503296,
"learning_rate": 0.00014725568942436414,
"loss": 3.4426,
"step": 198
},
{
"epoch": 0.7991967871485943,
"grad_norm": 2.387511968612671,
"learning_rate": 0.00014698795180722893,
"loss": 3.4799,
"step": 199
},
{
"epoch": 0.8032128514056225,
"grad_norm": 2.173638343811035,
"learning_rate": 0.00014672021419009373,
"loss": 3.1073,
"step": 200
},
{
"epoch": 0.8072289156626506,
"grad_norm": 2.4268410205841064,
"learning_rate": 0.00014645247657295852,
"loss": 3.895,
"step": 201
},
{
"epoch": 0.8112449799196787,
"grad_norm": 2.298238515853882,
"learning_rate": 0.00014618473895582328,
"loss": 3.1374,
"step": 202
},
{
"epoch": 0.8152610441767069,
"grad_norm": 2.5447280406951904,
"learning_rate": 0.00014591700133868808,
"loss": 4.201,
"step": 203
},
{
"epoch": 0.8192771084337349,
"grad_norm": 2.2700531482696533,
"learning_rate": 0.00014564926372155287,
"loss": 3.3756,
"step": 204
},
{
"epoch": 0.8232931726907631,
"grad_norm": 2.2147793769836426,
"learning_rate": 0.0001453815261044177,
"loss": 2.8677,
"step": 205
},
{
"epoch": 0.8273092369477911,
"grad_norm": 2.820615768432617,
"learning_rate": 0.00014511378848728248,
"loss": 3.8278,
"step": 206
},
{
"epoch": 0.8313253012048193,
"grad_norm": 2.214066743850708,
"learning_rate": 0.00014484605087014728,
"loss": 2.8015,
"step": 207
},
{
"epoch": 0.8353413654618473,
"grad_norm": 2.7223362922668457,
"learning_rate": 0.00014457831325301204,
"loss": 4.5482,
"step": 208
},
{
"epoch": 0.8393574297188755,
"grad_norm": 2.6131458282470703,
"learning_rate": 0.00014431057563587683,
"loss": 3.258,
"step": 209
},
{
"epoch": 0.8433734939759037,
"grad_norm": 2.378821611404419,
"learning_rate": 0.00014404283801874163,
"loss": 3.4395,
"step": 210
},
{
"epoch": 0.8473895582329317,
"grad_norm": 2.5394039154052734,
"learning_rate": 0.00014377510040160642,
"loss": 3.5583,
"step": 211
},
{
"epoch": 0.8514056224899599,
"grad_norm": 2.8768603801727295,
"learning_rate": 0.00014350736278447124,
"loss": 4.1826,
"step": 212
},
{
"epoch": 0.8554216867469879,
"grad_norm": 2.325242757797241,
"learning_rate": 0.00014323962516733603,
"loss": 3.2996,
"step": 213
},
{
"epoch": 0.8594377510040161,
"grad_norm": 2.847722053527832,
"learning_rate": 0.0001429718875502008,
"loss": 3.7535,
"step": 214
},
{
"epoch": 0.8634538152610441,
"grad_norm": 2.3787224292755127,
"learning_rate": 0.0001427041499330656,
"loss": 2.989,
"step": 215
},
{
"epoch": 0.8674698795180723,
"grad_norm": 2.3759453296661377,
"learning_rate": 0.00014243641231593038,
"loss": 3.2181,
"step": 216
},
{
"epoch": 0.8714859437751004,
"grad_norm": 2.48319411277771,
"learning_rate": 0.00014216867469879518,
"loss": 4.0624,
"step": 217
},
{
"epoch": 0.8755020080321285,
"grad_norm": 2.75231671333313,
"learning_rate": 0.00014190093708166,
"loss": 4.2616,
"step": 218
},
{
"epoch": 0.8795180722891566,
"grad_norm": 2.165195941925049,
"learning_rate": 0.0001416331994645248,
"loss": 2.773,
"step": 219
},
{
"epoch": 0.8835341365461847,
"grad_norm": 2.9390523433685303,
"learning_rate": 0.00014136546184738956,
"loss": 5.3133,
"step": 220
},
{
"epoch": 0.8875502008032129,
"grad_norm": 2.4109458923339844,
"learning_rate": 0.00014109772423025435,
"loss": 3.8292,
"step": 221
},
{
"epoch": 0.891566265060241,
"grad_norm": 2.5037901401519775,
"learning_rate": 0.00014082998661311914,
"loss": 4.0122,
"step": 222
},
{
"epoch": 0.8955823293172691,
"grad_norm": 2.985944986343384,
"learning_rate": 0.00014056224899598393,
"loss": 3.7539,
"step": 223
},
{
"epoch": 0.8995983935742972,
"grad_norm": 2.2456915378570557,
"learning_rate": 0.00014029451137884873,
"loss": 3.4707,
"step": 224
},
{
"epoch": 0.9036144578313253,
"grad_norm": 2.0935449600219727,
"learning_rate": 0.00014002677376171355,
"loss": 2.7515,
"step": 225
},
{
"epoch": 0.9076305220883534,
"grad_norm": 2.4609766006469727,
"learning_rate": 0.00013975903614457834,
"loss": 3.8227,
"step": 226
},
{
"epoch": 0.9116465863453815,
"grad_norm": 2.2097980976104736,
"learning_rate": 0.0001394912985274431,
"loss": 3.2733,
"step": 227
},
{
"epoch": 0.9156626506024096,
"grad_norm": 2.0642688274383545,
"learning_rate": 0.0001392235609103079,
"loss": 3.0938,
"step": 228
},
{
"epoch": 0.9196787148594378,
"grad_norm": 2.3710100650787354,
"learning_rate": 0.0001389558232931727,
"loss": 4.2002,
"step": 229
},
{
"epoch": 0.9236947791164659,
"grad_norm": 2.6360647678375244,
"learning_rate": 0.00013868808567603748,
"loss": 3.8326,
"step": 230
},
{
"epoch": 0.927710843373494,
"grad_norm": 2.2522687911987305,
"learning_rate": 0.00013842034805890228,
"loss": 4.0576,
"step": 231
},
{
"epoch": 0.9317269076305221,
"grad_norm": 2.3965373039245605,
"learning_rate": 0.0001381526104417671,
"loss": 2.551,
"step": 232
},
{
"epoch": 0.9357429718875502,
"grad_norm": 2.160850763320923,
"learning_rate": 0.00013788487282463186,
"loss": 3.0346,
"step": 233
},
{
"epoch": 0.9397590361445783,
"grad_norm": 2.7340362071990967,
"learning_rate": 0.00013761713520749665,
"loss": 3.8792,
"step": 234
},
{
"epoch": 0.9437751004016064,
"grad_norm": 2.373431921005249,
"learning_rate": 0.00013734939759036145,
"loss": 3.4563,
"step": 235
},
{
"epoch": 0.9477911646586346,
"grad_norm": 2.887669801712036,
"learning_rate": 0.00013708165997322624,
"loss": 3.4205,
"step": 236
},
{
"epoch": 0.9518072289156626,
"grad_norm": 2.47088360786438,
"learning_rate": 0.00013681392235609103,
"loss": 3.7738,
"step": 237
},
{
"epoch": 0.9558232931726908,
"grad_norm": 2.7040438652038574,
"learning_rate": 0.00013654618473895585,
"loss": 3.5389,
"step": 238
},
{
"epoch": 0.9598393574297188,
"grad_norm": 2.2656071186065674,
"learning_rate": 0.00013627844712182062,
"loss": 2.5192,
"step": 239
},
{
"epoch": 0.963855421686747,
"grad_norm": 2.0689640045166016,
"learning_rate": 0.0001360107095046854,
"loss": 3.2038,
"step": 240
},
{
"epoch": 0.9678714859437751,
"grad_norm": 2.456049680709839,
"learning_rate": 0.0001357429718875502,
"loss": 3.3779,
"step": 241
},
{
"epoch": 0.9718875502008032,
"grad_norm": 3.6520512104034424,
"learning_rate": 0.000135475234270415,
"loss": 6.3828,
"step": 242
},
{
"epoch": 0.9759036144578314,
"grad_norm": 2.9019930362701416,
"learning_rate": 0.0001352074966532798,
"loss": 4.4033,
"step": 243
},
{
"epoch": 0.9799196787148594,
"grad_norm": 2.688805103302002,
"learning_rate": 0.00013493975903614458,
"loss": 3.7718,
"step": 244
},
{
"epoch": 0.9839357429718876,
"grad_norm": 2.3583173751831055,
"learning_rate": 0.00013467202141900938,
"loss": 2.8558,
"step": 245
},
{
"epoch": 0.9879518072289156,
"grad_norm": 2.2991857528686523,
"learning_rate": 0.00013440428380187417,
"loss": 3.3544,
"step": 246
},
{
"epoch": 0.9919678714859438,
"grad_norm": 2.3462352752685547,
"learning_rate": 0.00013413654618473896,
"loss": 3.4804,
"step": 247
},
{
"epoch": 0.9959839357429718,
"grad_norm": 2.375304698944092,
"learning_rate": 0.00013386880856760375,
"loss": 3.9284,
"step": 248
},
{
"epoch": 1.0,
"grad_norm": 2.3574721813201904,
"learning_rate": 0.00013360107095046855,
"loss": 3.5948,
"step": 249
},
{
"epoch": 1.0,
"eval_loss": 0.906198263168335,
"eval_runtime": 202.0311,
"eval_samples_per_second": 2.47,
"eval_steps_per_second": 1.237,
"step": 249
},
{
"epoch": 1.0040160642570282,
"grad_norm": 2.329230546951294,
"learning_rate": 0.00013333333333333334,
"loss": 3.8794,
"step": 250
},
{
"epoch": 1.0080321285140563,
"grad_norm": 2.304131507873535,
"learning_rate": 0.00013306559571619813,
"loss": 2.618,
"step": 251
},
{
"epoch": 1.0120481927710843,
"grad_norm": 2.258854389190674,
"learning_rate": 0.00013279785809906293,
"loss": 4.5112,
"step": 252
},
{
"epoch": 1.0160642570281124,
"grad_norm": 1.9307198524475098,
"learning_rate": 0.00013253012048192772,
"loss": 2.8023,
"step": 253
},
{
"epoch": 1.0200803212851406,
"grad_norm": 2.070939540863037,
"learning_rate": 0.0001322623828647925,
"loss": 2.9067,
"step": 254
},
{
"epoch": 1.0240963855421688,
"grad_norm": 2.1403632164001465,
"learning_rate": 0.0001319946452476573,
"loss": 3.0498,
"step": 255
},
{
"epoch": 1.0281124497991967,
"grad_norm": 1.9982527494430542,
"learning_rate": 0.0001317269076305221,
"loss": 2.7652,
"step": 256
},
{
"epoch": 1.0321285140562249,
"grad_norm": 2.3440232276916504,
"learning_rate": 0.0001314591700133869,
"loss": 3.8854,
"step": 257
},
{
"epoch": 1.036144578313253,
"grad_norm": 2.3406286239624023,
"learning_rate": 0.00013119143239625168,
"loss": 2.9114,
"step": 258
},
{
"epoch": 1.0401606425702812,
"grad_norm": 2.673793077468872,
"learning_rate": 0.00013092369477911648,
"loss": 3.0531,
"step": 259
},
{
"epoch": 1.0441767068273093,
"grad_norm": 2.2808480262756348,
"learning_rate": 0.00013065595716198127,
"loss": 2.9484,
"step": 260
},
{
"epoch": 1.0481927710843373,
"grad_norm": 2.513705253601074,
"learning_rate": 0.00013038821954484606,
"loss": 2.6625,
"step": 261
},
{
"epoch": 1.0522088353413654,
"grad_norm": 2.7780377864837646,
"learning_rate": 0.00013012048192771085,
"loss": 3.1793,
"step": 262
},
{
"epoch": 1.0562248995983936,
"grad_norm": 2.522724151611328,
"learning_rate": 0.00012985274431057565,
"loss": 3.1926,
"step": 263
},
{
"epoch": 1.0602409638554218,
"grad_norm": 3.2487499713897705,
"learning_rate": 0.0001295850066934404,
"loss": 3.9779,
"step": 264
},
{
"epoch": 1.0642570281124497,
"grad_norm": 2.4341378211975098,
"learning_rate": 0.00012931726907630523,
"loss": 2.9064,
"step": 265
},
{
"epoch": 1.0682730923694779,
"grad_norm": 2.5539276599884033,
"learning_rate": 0.00012904953145917002,
"loss": 3.4219,
"step": 266
},
{
"epoch": 1.072289156626506,
"grad_norm": 2.0425596237182617,
"learning_rate": 0.00012878179384203482,
"loss": 2.5395,
"step": 267
},
{
"epoch": 1.0763052208835342,
"grad_norm": 2.3625378608703613,
"learning_rate": 0.0001285140562248996,
"loss": 2.757,
"step": 268
},
{
"epoch": 1.0803212851405624,
"grad_norm": 2.0414483547210693,
"learning_rate": 0.0001282463186077644,
"loss": 2.7764,
"step": 269
},
{
"epoch": 1.0843373493975903,
"grad_norm": 3.544743061065674,
"learning_rate": 0.00012797858099062917,
"loss": 3.6176,
"step": 270
},
{
"epoch": 1.0883534136546185,
"grad_norm": 2.4814655780792236,
"learning_rate": 0.00012771084337349396,
"loss": 3.2284,
"step": 271
},
{
"epoch": 1.0923694779116466,
"grad_norm": 2.364025592803955,
"learning_rate": 0.00012744310575635878,
"loss": 3.6178,
"step": 272
},
{
"epoch": 1.0963855421686748,
"grad_norm": 1.989912748336792,
"learning_rate": 0.00012717536813922357,
"loss": 2.5839,
"step": 273
},
{
"epoch": 1.1004016064257027,
"grad_norm": 2.413421154022217,
"learning_rate": 0.00012690763052208837,
"loss": 3.5416,
"step": 274
},
{
"epoch": 1.104417670682731,
"grad_norm": 2.679314613342285,
"learning_rate": 0.00012663989290495316,
"loss": 3.0015,
"step": 275
},
{
"epoch": 1.108433734939759,
"grad_norm": 2.2354209423065186,
"learning_rate": 0.00012637215528781793,
"loss": 3.3867,
"step": 276
},
{
"epoch": 1.1124497991967872,
"grad_norm": 2.4003982543945312,
"learning_rate": 0.00012610441767068272,
"loss": 3.0927,
"step": 277
},
{
"epoch": 1.1164658634538154,
"grad_norm": 2.2922661304473877,
"learning_rate": 0.00012583668005354754,
"loss": 2.835,
"step": 278
},
{
"epoch": 1.1204819277108433,
"grad_norm": 2.1880528926849365,
"learning_rate": 0.00012556894243641233,
"loss": 2.9581,
"step": 279
},
{
"epoch": 1.1244979919678715,
"grad_norm": 2.5255534648895264,
"learning_rate": 0.00012530120481927712,
"loss": 2.7931,
"step": 280
},
{
"epoch": 1.1285140562248996,
"grad_norm": 2.2529118061065674,
"learning_rate": 0.00012503346720214192,
"loss": 2.6831,
"step": 281
},
{
"epoch": 1.1325301204819278,
"grad_norm": 2.2123444080352783,
"learning_rate": 0.0001247657295850067,
"loss": 2.8091,
"step": 282
},
{
"epoch": 1.1365461847389557,
"grad_norm": 2.538160800933838,
"learning_rate": 0.00012449799196787148,
"loss": 3.0089,
"step": 283
},
{
"epoch": 1.140562248995984,
"grad_norm": 3.0052592754364014,
"learning_rate": 0.00012423025435073627,
"loss": 3.9042,
"step": 284
},
{
"epoch": 1.144578313253012,
"grad_norm": 2.691096067428589,
"learning_rate": 0.0001239625167336011,
"loss": 3.9491,
"step": 285
},
{
"epoch": 1.1485943775100402,
"grad_norm": 2.6101088523864746,
"learning_rate": 0.00012369477911646588,
"loss": 2.9432,
"step": 286
},
{
"epoch": 1.1526104417670684,
"grad_norm": 2.368319511413574,
"learning_rate": 0.00012342704149933067,
"loss": 2.966,
"step": 287
},
{
"epoch": 1.1566265060240963,
"grad_norm": 2.4615232944488525,
"learning_rate": 0.00012315930388219547,
"loss": 3.4359,
"step": 288
},
{
"epoch": 1.1606425702811245,
"grad_norm": 2.3296902179718018,
"learning_rate": 0.00012289156626506023,
"loss": 3.0168,
"step": 289
},
{
"epoch": 1.1646586345381527,
"grad_norm": 2.7844183444976807,
"learning_rate": 0.00012262382864792503,
"loss": 3.1574,
"step": 290
},
{
"epoch": 1.1686746987951806,
"grad_norm": 2.486553430557251,
"learning_rate": 0.00012235609103078982,
"loss": 3.1044,
"step": 291
},
{
"epoch": 1.1726907630522088,
"grad_norm": 2.4482836723327637,
"learning_rate": 0.00012208835341365464,
"loss": 3.2606,
"step": 292
},
{
"epoch": 1.176706827309237,
"grad_norm": 2.393049955368042,
"learning_rate": 0.00012182061579651942,
"loss": 2.9026,
"step": 293
},
{
"epoch": 1.180722891566265,
"grad_norm": 2.8396050930023193,
"learning_rate": 0.00012155287817938421,
"loss": 2.9787,
"step": 294
},
{
"epoch": 1.1847389558232932,
"grad_norm": 2.447458028793335,
"learning_rate": 0.000121285140562249,
"loss": 2.6885,
"step": 295
},
{
"epoch": 1.1887550200803212,
"grad_norm": 2.3094258308410645,
"learning_rate": 0.0001210174029451138,
"loss": 2.9401,
"step": 296
},
{
"epoch": 1.1927710843373494,
"grad_norm": 2.5315654277801514,
"learning_rate": 0.00012074966532797858,
"loss": 3.2829,
"step": 297
},
{
"epoch": 1.1967871485943775,
"grad_norm": 2.4781811237335205,
"learning_rate": 0.0001204819277108434,
"loss": 2.9542,
"step": 298
},
{
"epoch": 1.2008032128514057,
"grad_norm": 2.759524345397949,
"learning_rate": 0.00012021419009370817,
"loss": 3.5029,
"step": 299
},
{
"epoch": 1.2048192771084336,
"grad_norm": 2.388485908508301,
"learning_rate": 0.00011994645247657297,
"loss": 2.6706,
"step": 300
},
{
"epoch": 1.2088353413654618,
"grad_norm": 2.5414671897888184,
"learning_rate": 0.00011967871485943776,
"loss": 2.7898,
"step": 301
},
{
"epoch": 1.21285140562249,
"grad_norm": 3.36741042137146,
"learning_rate": 0.00011941097724230255,
"loss": 2.7475,
"step": 302
},
{
"epoch": 1.216867469879518,
"grad_norm": 2.7749950885772705,
"learning_rate": 0.00011914323962516733,
"loss": 2.9617,
"step": 303
},
{
"epoch": 1.2208835341365463,
"grad_norm": 2.685976505279541,
"learning_rate": 0.00011887550200803212,
"loss": 3.2493,
"step": 304
},
{
"epoch": 1.2248995983935742,
"grad_norm": 2.7357215881347656,
"learning_rate": 0.00011860776439089693,
"loss": 2.7249,
"step": 305
},
{
"epoch": 1.2289156626506024,
"grad_norm": 2.962019443511963,
"learning_rate": 0.00011834002677376172,
"loss": 3.4647,
"step": 306
},
{
"epoch": 1.2329317269076305,
"grad_norm": 2.891343832015991,
"learning_rate": 0.00011807228915662652,
"loss": 3.5527,
"step": 307
},
{
"epoch": 1.2369477911646587,
"grad_norm": 2.7382125854492188,
"learning_rate": 0.00011780455153949131,
"loss": 3.1955,
"step": 308
},
{
"epoch": 1.2409638554216866,
"grad_norm": 2.385486602783203,
"learning_rate": 0.00011753681392235609,
"loss": 3.022,
"step": 309
},
{
"epoch": 1.2449799196787148,
"grad_norm": 2.553295612335205,
"learning_rate": 0.00011726907630522088,
"loss": 2.801,
"step": 310
},
{
"epoch": 1.248995983935743,
"grad_norm": 2.9965014457702637,
"learning_rate": 0.00011700133868808567,
"loss": 2.4453,
"step": 311
},
{
"epoch": 1.2530120481927711,
"grad_norm": 2.327629566192627,
"learning_rate": 0.00011673360107095048,
"loss": 2.2897,
"step": 312
},
{
"epoch": 1.2570281124497993,
"grad_norm": 2.7544825077056885,
"learning_rate": 0.00011646586345381527,
"loss": 3.2796,
"step": 313
},
{
"epoch": 1.2610441767068274,
"grad_norm": 2.590733051300049,
"learning_rate": 0.00011619812583668007,
"loss": 2.9126,
"step": 314
},
{
"epoch": 1.2650602409638554,
"grad_norm": 3.3064663410186768,
"learning_rate": 0.00011593038821954485,
"loss": 3.6784,
"step": 315
},
{
"epoch": 1.2690763052208835,
"grad_norm": 3.3928616046905518,
"learning_rate": 0.00011566265060240964,
"loss": 3.3292,
"step": 316
},
{
"epoch": 1.2730923694779117,
"grad_norm": 2.6576473712921143,
"learning_rate": 0.00011539491298527443,
"loss": 3.0617,
"step": 317
},
{
"epoch": 1.2771084337349397,
"grad_norm": 2.5956337451934814,
"learning_rate": 0.00011512717536813924,
"loss": 2.9754,
"step": 318
},
{
"epoch": 1.2811244979919678,
"grad_norm": 2.8080995082855225,
"learning_rate": 0.00011485943775100403,
"loss": 3.1712,
"step": 319
},
{
"epoch": 1.285140562248996,
"grad_norm": 2.4304864406585693,
"learning_rate": 0.00011459170013386882,
"loss": 3.0387,
"step": 320
},
{
"epoch": 1.2891566265060241,
"grad_norm": 2.2777411937713623,
"learning_rate": 0.0001143239625167336,
"loss": 2.8357,
"step": 321
},
{
"epoch": 1.2931726907630523,
"grad_norm": 2.370192289352417,
"learning_rate": 0.0001140562248995984,
"loss": 2.5937,
"step": 322
},
{
"epoch": 1.2971887550200802,
"grad_norm": 3.0521585941314697,
"learning_rate": 0.00011378848728246319,
"loss": 4.4271,
"step": 323
},
{
"epoch": 1.3012048192771084,
"grad_norm": 2.4153242111206055,
"learning_rate": 0.00011352074966532798,
"loss": 2.7952,
"step": 324
},
{
"epoch": 1.3052208835341366,
"grad_norm": 2.629312038421631,
"learning_rate": 0.00011325301204819279,
"loss": 3.6324,
"step": 325
},
{
"epoch": 1.3092369477911647,
"grad_norm": 2.0146517753601074,
"learning_rate": 0.00011298527443105758,
"loss": 2.3154,
"step": 326
},
{
"epoch": 1.3132530120481927,
"grad_norm": 2.3414394855499268,
"learning_rate": 0.00011271753681392236,
"loss": 2.809,
"step": 327
},
{
"epoch": 1.3172690763052208,
"grad_norm": 2.366577386856079,
"learning_rate": 0.00011244979919678715,
"loss": 3.7852,
"step": 328
},
{
"epoch": 1.321285140562249,
"grad_norm": 2.661543130874634,
"learning_rate": 0.00011218206157965195,
"loss": 2.818,
"step": 329
},
{
"epoch": 1.3253012048192772,
"grad_norm": 2.51835036277771,
"learning_rate": 0.00011191432396251674,
"loss": 2.8359,
"step": 330
},
{
"epoch": 1.3293172690763053,
"grad_norm": 2.473179817199707,
"learning_rate": 0.00011164658634538152,
"loss": 2.8498,
"step": 331
},
{
"epoch": 1.3333333333333333,
"grad_norm": 2.9637928009033203,
"learning_rate": 0.00011137884872824634,
"loss": 4.164,
"step": 332
},
{
"epoch": 1.3373493975903614,
"grad_norm": 2.5028486251831055,
"learning_rate": 0.00011111111111111112,
"loss": 3.6701,
"step": 333
},
{
"epoch": 1.3413654618473896,
"grad_norm": 3.149928092956543,
"learning_rate": 0.00011084337349397591,
"loss": 3.7949,
"step": 334
},
{
"epoch": 1.3453815261044177,
"grad_norm": 2.7405877113342285,
"learning_rate": 0.0001105756358768407,
"loss": 3.2064,
"step": 335
},
{
"epoch": 1.3493975903614457,
"grad_norm": 2.830744743347168,
"learning_rate": 0.0001103078982597055,
"loss": 2.8919,
"step": 336
},
{
"epoch": 1.3534136546184738,
"grad_norm": 2.9335427284240723,
"learning_rate": 0.00011004016064257027,
"loss": 3.1013,
"step": 337
},
{
"epoch": 1.357429718875502,
"grad_norm": 2.505171537399292,
"learning_rate": 0.0001097724230254351,
"loss": 3.206,
"step": 338
},
{
"epoch": 1.3614457831325302,
"grad_norm": 3.127634286880493,
"learning_rate": 0.00010950468540829987,
"loss": 3.2454,
"step": 339
},
{
"epoch": 1.3654618473895583,
"grad_norm": 2.7009451389312744,
"learning_rate": 0.00010923694779116467,
"loss": 3.0679,
"step": 340
},
{
"epoch": 1.3694779116465863,
"grad_norm": 2.3906707763671875,
"learning_rate": 0.00010896921017402946,
"loss": 3.7267,
"step": 341
},
{
"epoch": 1.3734939759036144,
"grad_norm": 2.4884233474731445,
"learning_rate": 0.00010870147255689425,
"loss": 3.2707,
"step": 342
},
{
"epoch": 1.3775100401606426,
"grad_norm": 2.514148712158203,
"learning_rate": 0.00010843373493975903,
"loss": 3.0734,
"step": 343
},
{
"epoch": 1.3815261044176708,
"grad_norm": 2.450438976287842,
"learning_rate": 0.00010816599732262382,
"loss": 2.7529,
"step": 344
},
{
"epoch": 1.3855421686746987,
"grad_norm": 2.5931103229522705,
"learning_rate": 0.00010789825970548863,
"loss": 3.8578,
"step": 345
},
{
"epoch": 1.3895582329317269,
"grad_norm": 2.386543035507202,
"learning_rate": 0.00010763052208835342,
"loss": 3.2145,
"step": 346
},
{
"epoch": 1.393574297188755,
"grad_norm": 2.643378973007202,
"learning_rate": 0.00010736278447121822,
"loss": 2.7853,
"step": 347
},
{
"epoch": 1.3975903614457832,
"grad_norm": 1.9885903596878052,
"learning_rate": 0.00010709504685408301,
"loss": 2.2022,
"step": 348
},
{
"epoch": 1.4016064257028114,
"grad_norm": 2.6465091705322266,
"learning_rate": 0.00010682730923694779,
"loss": 3.5565,
"step": 349
},
{
"epoch": 1.4056224899598393,
"grad_norm": 2.6052937507629395,
"learning_rate": 0.00010655957161981258,
"loss": 2.9741,
"step": 350
},
{
"epoch": 1.4096385542168675,
"grad_norm": 2.7112314701080322,
"learning_rate": 0.00010629183400267737,
"loss": 4.0259,
"step": 351
},
{
"epoch": 1.4136546184738956,
"grad_norm": 2.5356833934783936,
"learning_rate": 0.00010602409638554218,
"loss": 2.6879,
"step": 352
},
{
"epoch": 1.4176706827309236,
"grad_norm": 2.745176315307617,
"learning_rate": 0.00010575635876840697,
"loss": 4.0105,
"step": 353
},
{
"epoch": 1.4216867469879517,
"grad_norm": 2.5344765186309814,
"learning_rate": 0.00010548862115127177,
"loss": 2.9797,
"step": 354
},
{
"epoch": 1.4257028112449799,
"grad_norm": 2.680912733078003,
"learning_rate": 0.00010522088353413654,
"loss": 3.3971,
"step": 355
},
{
"epoch": 1.429718875502008,
"grad_norm": 3.498023271560669,
"learning_rate": 0.00010495314591700134,
"loss": 3.6706,
"step": 356
},
{
"epoch": 1.4337349397590362,
"grad_norm": 2.4419398307800293,
"learning_rate": 0.00010468540829986613,
"loss": 2.6477,
"step": 357
},
{
"epoch": 1.4377510040160644,
"grad_norm": 3.2264997959136963,
"learning_rate": 0.00010441767068273094,
"loss": 4.5181,
"step": 358
},
{
"epoch": 1.4417670682730923,
"grad_norm": 2.5578315258026123,
"learning_rate": 0.00010414993306559573,
"loss": 2.6282,
"step": 359
},
{
"epoch": 1.4457831325301205,
"grad_norm": 2.539045572280884,
"learning_rate": 0.00010388219544846052,
"loss": 2.6435,
"step": 360
},
{
"epoch": 1.4497991967871486,
"grad_norm": 2.9697344303131104,
"learning_rate": 0.0001036144578313253,
"loss": 2.6676,
"step": 361
},
{
"epoch": 1.4538152610441766,
"grad_norm": 2.606131076812744,
"learning_rate": 0.0001033467202141901,
"loss": 2.9316,
"step": 362
},
{
"epoch": 1.4578313253012047,
"grad_norm": 3.290837049484253,
"learning_rate": 0.00010307898259705489,
"loss": 3.0869,
"step": 363
},
{
"epoch": 1.461847389558233,
"grad_norm": 2.331320285797119,
"learning_rate": 0.00010281124497991968,
"loss": 2.555,
"step": 364
},
{
"epoch": 1.465863453815261,
"grad_norm": 2.8447391986846924,
"learning_rate": 0.00010254350736278449,
"loss": 2.6998,
"step": 365
},
{
"epoch": 1.4698795180722892,
"grad_norm": 2.6170618534088135,
"learning_rate": 0.00010227576974564928,
"loss": 2.7688,
"step": 366
},
{
"epoch": 1.4738955823293174,
"grad_norm": 2.933560609817505,
"learning_rate": 0.00010200803212851406,
"loss": 3.0291,
"step": 367
},
{
"epoch": 1.4779116465863453,
"grad_norm": 2.6285972595214844,
"learning_rate": 0.00010174029451137885,
"loss": 2.8629,
"step": 368
},
{
"epoch": 1.4819277108433735,
"grad_norm": 3.2716546058654785,
"learning_rate": 0.00010147255689424364,
"loss": 3.1994,
"step": 369
},
{
"epoch": 1.4859437751004017,
"grad_norm": 2.758296489715576,
"learning_rate": 0.00010120481927710844,
"loss": 2.6734,
"step": 370
},
{
"epoch": 1.4899598393574296,
"grad_norm": 2.3439807891845703,
"learning_rate": 0.00010093708165997322,
"loss": 2.8747,
"step": 371
},
{
"epoch": 1.4939759036144578,
"grad_norm": 2.4199349880218506,
"learning_rate": 0.00010066934404283804,
"loss": 2.7135,
"step": 372
},
{
"epoch": 1.497991967871486,
"grad_norm": 2.8863987922668457,
"learning_rate": 0.00010040160642570282,
"loss": 3.3239,
"step": 373
},
{
"epoch": 1.502008032128514,
"grad_norm": 2.5620765686035156,
"learning_rate": 0.00010013386880856761,
"loss": 2.5748,
"step": 374
},
{
"epoch": 1.5060240963855422,
"grad_norm": 2.5705456733703613,
"learning_rate": 9.98661311914324e-05,
"loss": 3.4645,
"step": 375
},
{
"epoch": 1.5100401606425704,
"grad_norm": 2.75276780128479,
"learning_rate": 9.95983935742972e-05,
"loss": 2.7345,
"step": 376
},
{
"epoch": 1.5140562248995983,
"grad_norm": 2.5206143856048584,
"learning_rate": 9.933065595716199e-05,
"loss": 2.8325,
"step": 377
},
{
"epoch": 1.5180722891566265,
"grad_norm": 2.3054890632629395,
"learning_rate": 9.906291834002678e-05,
"loss": 2.884,
"step": 378
},
{
"epoch": 1.5220883534136547,
"grad_norm": 2.563084125518799,
"learning_rate": 9.879518072289157e-05,
"loss": 3.0262,
"step": 379
},
{
"epoch": 1.5261044176706826,
"grad_norm": 2.575040817260742,
"learning_rate": 9.852744310575637e-05,
"loss": 3.096,
"step": 380
},
{
"epoch": 1.5301204819277108,
"grad_norm": 2.3715319633483887,
"learning_rate": 9.825970548862116e-05,
"loss": 2.7163,
"step": 381
},
{
"epoch": 1.534136546184739,
"grad_norm": 2.7323389053344727,
"learning_rate": 9.799196787148595e-05,
"loss": 2.792,
"step": 382
},
{
"epoch": 1.538152610441767,
"grad_norm": 2.523524522781372,
"learning_rate": 9.772423025435074e-05,
"loss": 3.2821,
"step": 383
},
{
"epoch": 1.5421686746987953,
"grad_norm": 2.533090114593506,
"learning_rate": 9.745649263721554e-05,
"loss": 2.7672,
"step": 384
},
{
"epoch": 1.5461847389558234,
"grad_norm": 2.644031286239624,
"learning_rate": 9.718875502008033e-05,
"loss": 3.0318,
"step": 385
},
{
"epoch": 1.5502008032128514,
"grad_norm": 3.1442739963531494,
"learning_rate": 9.692101740294511e-05,
"loss": 3.6628,
"step": 386
},
{
"epoch": 1.5542168674698795,
"grad_norm": 2.403552532196045,
"learning_rate": 9.665327978580992e-05,
"loss": 2.4332,
"step": 387
},
{
"epoch": 1.5582329317269075,
"grad_norm": 2.478534698486328,
"learning_rate": 9.638554216867471e-05,
"loss": 2.4746,
"step": 388
},
{
"epoch": 1.5622489959839356,
"grad_norm": 2.7873339653015137,
"learning_rate": 9.611780455153949e-05,
"loss": 2.8514,
"step": 389
},
{
"epoch": 1.5662650602409638,
"grad_norm": 2.751532793045044,
"learning_rate": 9.58500669344043e-05,
"loss": 2.9365,
"step": 390
},
{
"epoch": 1.570281124497992,
"grad_norm": 2.8862998485565186,
"learning_rate": 9.558232931726909e-05,
"loss": 3.2632,
"step": 391
},
{
"epoch": 1.5742971887550201,
"grad_norm": 2.5372817516326904,
"learning_rate": 9.531459170013387e-05,
"loss": 2.8649,
"step": 392
},
{
"epoch": 1.5783132530120483,
"grad_norm": 2.428025007247925,
"learning_rate": 9.504685408299867e-05,
"loss": 2.6417,
"step": 393
},
{
"epoch": 1.5823293172690764,
"grad_norm": 3.284771680831909,
"learning_rate": 9.477911646586346e-05,
"loss": 3.4804,
"step": 394
},
{
"epoch": 1.5863453815261044,
"grad_norm": 2.8651950359344482,
"learning_rate": 9.451137884872824e-05,
"loss": 3.1454,
"step": 395
},
{
"epoch": 1.5903614457831325,
"grad_norm": 3.078660011291504,
"learning_rate": 9.424364123159304e-05,
"loss": 3.5961,
"step": 396
},
{
"epoch": 1.5943775100401605,
"grad_norm": 2.2207376956939697,
"learning_rate": 9.397590361445784e-05,
"loss": 2.3121,
"step": 397
},
{
"epoch": 1.5983935742971886,
"grad_norm": 2.4094178676605225,
"learning_rate": 9.370816599732262e-05,
"loss": 2.7138,
"step": 398
},
{
"epoch": 1.6024096385542168,
"grad_norm": 2.759876251220703,
"learning_rate": 9.344042838018742e-05,
"loss": 3.5605,
"step": 399
},
{
"epoch": 1.606425702811245,
"grad_norm": 2.189237117767334,
"learning_rate": 9.317269076305222e-05,
"loss": 2.6023,
"step": 400
},
{
"epoch": 1.6104417670682731,
"grad_norm": 2.585479736328125,
"learning_rate": 9.2904953145917e-05,
"loss": 3.2234,
"step": 401
},
{
"epoch": 1.6144578313253013,
"grad_norm": 2.565342664718628,
"learning_rate": 9.26372155287818e-05,
"loss": 3.0341,
"step": 402
},
{
"epoch": 1.6184738955823295,
"grad_norm": 2.4045302867889404,
"learning_rate": 9.23694779116466e-05,
"loss": 2.7032,
"step": 403
},
{
"epoch": 1.6224899598393574,
"grad_norm": 3.0136139392852783,
"learning_rate": 9.210174029451138e-05,
"loss": 3.1651,
"step": 404
},
{
"epoch": 1.6265060240963856,
"grad_norm": 2.253669261932373,
"learning_rate": 9.183400267737617e-05,
"loss": 2.2507,
"step": 405
},
{
"epoch": 1.6305220883534135,
"grad_norm": 2.734966993331909,
"learning_rate": 9.156626506024096e-05,
"loss": 3.0798,
"step": 406
},
{
"epoch": 1.6345381526104417,
"grad_norm": 2.955502986907959,
"learning_rate": 9.129852744310576e-05,
"loss": 3.086,
"step": 407
},
{
"epoch": 1.6385542168674698,
"grad_norm": 3.2345542907714844,
"learning_rate": 9.103078982597055e-05,
"loss": 3.3553,
"step": 408
},
{
"epoch": 1.642570281124498,
"grad_norm": 2.7762720584869385,
"learning_rate": 9.076305220883534e-05,
"loss": 3.4238,
"step": 409
},
{
"epoch": 1.6465863453815262,
"grad_norm": 2.824641466140747,
"learning_rate": 9.049531459170014e-05,
"loss": 2.8925,
"step": 410
},
{
"epoch": 1.6506024096385543,
"grad_norm": 2.754810094833374,
"learning_rate": 9.022757697456493e-05,
"loss": 2.9022,
"step": 411
},
{
"epoch": 1.6546184738955825,
"grad_norm": 2.5305283069610596,
"learning_rate": 8.995983935742972e-05,
"loss": 2.927,
"step": 412
},
{
"epoch": 1.6586345381526104,
"grad_norm": 2.796165943145752,
"learning_rate": 8.969210174029451e-05,
"loss": 2.9185,
"step": 413
},
{
"epoch": 1.6626506024096386,
"grad_norm": 2.9504239559173584,
"learning_rate": 8.942436412315931e-05,
"loss": 3.3915,
"step": 414
},
{
"epoch": 1.6666666666666665,
"grad_norm": 2.8904786109924316,
"learning_rate": 8.91566265060241e-05,
"loss": 2.8841,
"step": 415
},
{
"epoch": 1.6706827309236947,
"grad_norm": 2.184354305267334,
"learning_rate": 8.888888888888889e-05,
"loss": 2.3859,
"step": 416
},
{
"epoch": 1.6746987951807228,
"grad_norm": 3.1552340984344482,
"learning_rate": 8.862115127175369e-05,
"loss": 3.22,
"step": 417
},
{
"epoch": 1.678714859437751,
"grad_norm": 3.2323250770568848,
"learning_rate": 8.835341365461848e-05,
"loss": 2.8859,
"step": 418
},
{
"epoch": 1.6827309236947792,
"grad_norm": 2.726513147354126,
"learning_rate": 8.808567603748327e-05,
"loss": 3.0969,
"step": 419
},
{
"epoch": 1.6867469879518073,
"grad_norm": 2.7404675483703613,
"learning_rate": 8.781793842034806e-05,
"loss": 2.76,
"step": 420
},
{
"epoch": 1.6907630522088355,
"grad_norm": 3.433872699737549,
"learning_rate": 8.755020080321286e-05,
"loss": 3.1852,
"step": 421
},
{
"epoch": 1.6947791164658634,
"grad_norm": 3.4727306365966797,
"learning_rate": 8.728246318607765e-05,
"loss": 3.6413,
"step": 422
},
{
"epoch": 1.6987951807228916,
"grad_norm": 2.968161106109619,
"learning_rate": 8.701472556894244e-05,
"loss": 3.15,
"step": 423
},
{
"epoch": 1.7028112449799195,
"grad_norm": 2.8164682388305664,
"learning_rate": 8.674698795180724e-05,
"loss": 3.0286,
"step": 424
},
{
"epoch": 1.7068273092369477,
"grad_norm": 2.7942745685577393,
"learning_rate": 8.647925033467203e-05,
"loss": 3.2501,
"step": 425
},
{
"epoch": 1.7108433734939759,
"grad_norm": 3.2419016361236572,
"learning_rate": 8.621151271753681e-05,
"loss": 4.3181,
"step": 426
},
{
"epoch": 1.714859437751004,
"grad_norm": 3.3823928833007812,
"learning_rate": 8.594377510040161e-05,
"loss": 3.2917,
"step": 427
},
{
"epoch": 1.7188755020080322,
"grad_norm": 2.8482446670532227,
"learning_rate": 8.567603748326641e-05,
"loss": 3.0338,
"step": 428
},
{
"epoch": 1.7228915662650603,
"grad_norm": 2.435845375061035,
"learning_rate": 8.540829986613119e-05,
"loss": 2.5519,
"step": 429
},
{
"epoch": 1.7269076305220885,
"grad_norm": 2.9163546562194824,
"learning_rate": 8.514056224899599e-05,
"loss": 3.72,
"step": 430
},
{
"epoch": 1.7309236947791165,
"grad_norm": 2.3660037517547607,
"learning_rate": 8.487282463186079e-05,
"loss": 2.3941,
"step": 431
},
{
"epoch": 1.7349397590361446,
"grad_norm": 2.527449131011963,
"learning_rate": 8.460508701472556e-05,
"loss": 2.9851,
"step": 432
},
{
"epoch": 1.7389558232931726,
"grad_norm": 2.2324576377868652,
"learning_rate": 8.433734939759037e-05,
"loss": 2.6241,
"step": 433
},
{
"epoch": 1.7429718875502007,
"grad_norm": 2.7165253162384033,
"learning_rate": 8.406961178045516e-05,
"loss": 2.7749,
"step": 434
},
{
"epoch": 1.7469879518072289,
"grad_norm": 2.7401411533355713,
"learning_rate": 8.380187416331994e-05,
"loss": 2.9022,
"step": 435
},
{
"epoch": 1.751004016064257,
"grad_norm": 2.518826961517334,
"learning_rate": 8.353413654618474e-05,
"loss": 2.7587,
"step": 436
},
{
"epoch": 1.7550200803212852,
"grad_norm": 2.493936061859131,
"learning_rate": 8.326639892904954e-05,
"loss": 3.1417,
"step": 437
},
{
"epoch": 1.7590361445783134,
"grad_norm": 2.747951030731201,
"learning_rate": 8.299866131191432e-05,
"loss": 2.6913,
"step": 438
},
{
"epoch": 1.7630522088353415,
"grad_norm": 2.8907039165496826,
"learning_rate": 8.273092369477911e-05,
"loss": 2.4416,
"step": 439
},
{
"epoch": 1.7670682730923695,
"grad_norm": 3.6564669609069824,
"learning_rate": 8.246318607764392e-05,
"loss": 3.9361,
"step": 440
},
{
"epoch": 1.7710843373493976,
"grad_norm": 2.4362285137176514,
"learning_rate": 8.21954484605087e-05,
"loss": 2.461,
"step": 441
},
{
"epoch": 1.7751004016064256,
"grad_norm": 3.2182202339172363,
"learning_rate": 8.192771084337349e-05,
"loss": 3.2511,
"step": 442
},
{
"epoch": 1.7791164658634537,
"grad_norm": 3.2106211185455322,
"learning_rate": 8.16599732262383e-05,
"loss": 4.4307,
"step": 443
},
{
"epoch": 1.783132530120482,
"grad_norm": 3.4369003772735596,
"learning_rate": 8.139223560910308e-05,
"loss": 4.08,
"step": 444
},
{
"epoch": 1.78714859437751,
"grad_norm": 2.2681970596313477,
"learning_rate": 8.112449799196787e-05,
"loss": 2.3631,
"step": 445
},
{
"epoch": 1.7911646586345382,
"grad_norm": 2.691133975982666,
"learning_rate": 8.085676037483266e-05,
"loss": 2.6157,
"step": 446
},
{
"epoch": 1.7951807228915664,
"grad_norm": 2.9200479984283447,
"learning_rate": 8.058902275769746e-05,
"loss": 2.6649,
"step": 447
},
{
"epoch": 1.7991967871485943,
"grad_norm": 2.787264108657837,
"learning_rate": 8.032128514056225e-05,
"loss": 2.763,
"step": 448
},
{
"epoch": 1.8032128514056225,
"grad_norm": 2.940075635910034,
"learning_rate": 8.005354752342704e-05,
"loss": 2.9436,
"step": 449
},
{
"epoch": 1.8072289156626506,
"grad_norm": 3.1111507415771484,
"learning_rate": 7.978580990629184e-05,
"loss": 3.1194,
"step": 450
},
{
"epoch": 1.8112449799196786,
"grad_norm": 2.695709228515625,
"learning_rate": 7.951807228915663e-05,
"loss": 2.7517,
"step": 451
},
{
"epoch": 1.8152610441767068,
"grad_norm": 2.939112663269043,
"learning_rate": 7.925033467202142e-05,
"loss": 3.7794,
"step": 452
},
{
"epoch": 1.819277108433735,
"grad_norm": 2.583163022994995,
"learning_rate": 7.898259705488621e-05,
"loss": 3.0265,
"step": 453
},
{
"epoch": 1.823293172690763,
"grad_norm": 2.496131181716919,
"learning_rate": 7.8714859437751e-05,
"loss": 2.5762,
"step": 454
},
{
"epoch": 1.8273092369477912,
"grad_norm": 2.4272570610046387,
"learning_rate": 7.84471218206158e-05,
"loss": 2.758,
"step": 455
},
{
"epoch": 1.8313253012048194,
"grad_norm": 2.4154021739959717,
"learning_rate": 7.817938420348059e-05,
"loss": 2.7325,
"step": 456
},
{
"epoch": 1.8353413654618473,
"grad_norm": 2.5219106674194336,
"learning_rate": 7.791164658634539e-05,
"loss": 2.779,
"step": 457
},
{
"epoch": 1.8393574297188755,
"grad_norm": 2.3390161991119385,
"learning_rate": 7.764390896921018e-05,
"loss": 2.2922,
"step": 458
},
{
"epoch": 1.8433734939759037,
"grad_norm": 2.7101354598999023,
"learning_rate": 7.737617135207497e-05,
"loss": 2.9825,
"step": 459
},
{
"epoch": 1.8473895582329316,
"grad_norm": 2.8510243892669678,
"learning_rate": 7.710843373493976e-05,
"loss": 2.8628,
"step": 460
},
{
"epoch": 1.8514056224899598,
"grad_norm": 2.6924989223480225,
"learning_rate": 7.684069611780456e-05,
"loss": 2.6543,
"step": 461
},
{
"epoch": 1.855421686746988,
"grad_norm": 2.6552584171295166,
"learning_rate": 7.657295850066935e-05,
"loss": 3.0625,
"step": 462
},
{
"epoch": 1.859437751004016,
"grad_norm": 3.2962827682495117,
"learning_rate": 7.630522088353414e-05,
"loss": 3.308,
"step": 463
},
{
"epoch": 1.8634538152610443,
"grad_norm": 3.0845699310302734,
"learning_rate": 7.603748326639893e-05,
"loss": 3.5178,
"step": 464
},
{
"epoch": 1.8674698795180724,
"grad_norm": 2.768254518508911,
"learning_rate": 7.576974564926373e-05,
"loss": 3.6667,
"step": 465
},
{
"epoch": 1.8714859437751004,
"grad_norm": 2.5801167488098145,
"learning_rate": 7.550200803212851e-05,
"loss": 2.7686,
"step": 466
},
{
"epoch": 1.8755020080321285,
"grad_norm": 2.2853081226348877,
"learning_rate": 7.523427041499331e-05,
"loss": 2.2115,
"step": 467
},
{
"epoch": 1.8795180722891565,
"grad_norm": 2.9309747219085693,
"learning_rate": 7.49665327978581e-05,
"loss": 2.9426,
"step": 468
},
{
"epoch": 1.8835341365461846,
"grad_norm": 3.146700143814087,
"learning_rate": 7.469879518072289e-05,
"loss": 3.3903,
"step": 469
},
{
"epoch": 1.8875502008032128,
"grad_norm": 3.3652424812316895,
"learning_rate": 7.443105756358769e-05,
"loss": 3.0085,
"step": 470
},
{
"epoch": 1.891566265060241,
"grad_norm": 2.424377918243408,
"learning_rate": 7.416331994645248e-05,
"loss": 2.5145,
"step": 471
},
{
"epoch": 1.895582329317269,
"grad_norm": 2.5642752647399902,
"learning_rate": 7.389558232931726e-05,
"loss": 3.1927,
"step": 472
},
{
"epoch": 1.8995983935742973,
"grad_norm": 2.7574706077575684,
"learning_rate": 7.362784471218207e-05,
"loss": 2.6753,
"step": 473
},
{
"epoch": 1.9036144578313254,
"grad_norm": 2.6844048500061035,
"learning_rate": 7.336010709504686e-05,
"loss": 2.7126,
"step": 474
},
{
"epoch": 1.9076305220883534,
"grad_norm": 2.3251895904541016,
"learning_rate": 7.309236947791164e-05,
"loss": 2.5947,
"step": 475
},
{
"epoch": 1.9116465863453815,
"grad_norm": 2.1562206745147705,
"learning_rate": 7.282463186077644e-05,
"loss": 2.2137,
"step": 476
},
{
"epoch": 1.9156626506024095,
"grad_norm": 2.400747776031494,
"learning_rate": 7.255689424364124e-05,
"loss": 2.8869,
"step": 477
},
{
"epoch": 1.9196787148594376,
"grad_norm": 3.1380369663238525,
"learning_rate": 7.228915662650602e-05,
"loss": 3.4202,
"step": 478
},
{
"epoch": 1.9236947791164658,
"grad_norm": 2.9858291149139404,
"learning_rate": 7.202141900937081e-05,
"loss": 3.1519,
"step": 479
},
{
"epoch": 1.927710843373494,
"grad_norm": 2.6354973316192627,
"learning_rate": 7.175368139223562e-05,
"loss": 2.8662,
"step": 480
},
{
"epoch": 1.9317269076305221,
"grad_norm": 2.7349445819854736,
"learning_rate": 7.14859437751004e-05,
"loss": 4.2679,
"step": 481
},
{
"epoch": 1.9357429718875503,
"grad_norm": 3.0139505863189697,
"learning_rate": 7.121820615796519e-05,
"loss": 2.9382,
"step": 482
},
{
"epoch": 1.9397590361445785,
"grad_norm": 3.1879093647003174,
"learning_rate": 7.095046854083e-05,
"loss": 3.168,
"step": 483
},
{
"epoch": 1.9437751004016064,
"grad_norm": 3.2778398990631104,
"learning_rate": 7.068273092369478e-05,
"loss": 3.4373,
"step": 484
},
{
"epoch": 1.9477911646586346,
"grad_norm": 3.024111747741699,
"learning_rate": 7.041499330655957e-05,
"loss": 3.7807,
"step": 485
},
{
"epoch": 1.9518072289156625,
"grad_norm": 2.750593423843384,
"learning_rate": 7.014725568942436e-05,
"loss": 3.4546,
"step": 486
},
{
"epoch": 1.9558232931726907,
"grad_norm": 2.9757187366485596,
"learning_rate": 6.987951807228917e-05,
"loss": 3.0145,
"step": 487
},
{
"epoch": 1.9598393574297188,
"grad_norm": 2.867292881011963,
"learning_rate": 6.961178045515395e-05,
"loss": 2.5524,
"step": 488
},
{
"epoch": 1.963855421686747,
"grad_norm": 2.563595771789551,
"learning_rate": 6.934404283801874e-05,
"loss": 2.7503,
"step": 489
},
{
"epoch": 1.9678714859437751,
"grad_norm": 2.52006459236145,
"learning_rate": 6.907630522088355e-05,
"loss": 3.0431,
"step": 490
},
{
"epoch": 1.9718875502008033,
"grad_norm": 3.0700199604034424,
"learning_rate": 6.880856760374833e-05,
"loss": 3.7242,
"step": 491
},
{
"epoch": 1.9759036144578315,
"grad_norm": 2.7504234313964844,
"learning_rate": 6.854082998661312e-05,
"loss": 2.6293,
"step": 492
},
{
"epoch": 1.9799196787148594,
"grad_norm": 2.919828414916992,
"learning_rate": 6.827309236947793e-05,
"loss": 2.6278,
"step": 493
},
{
"epoch": 1.9839357429718876,
"grad_norm": 2.453157663345337,
"learning_rate": 6.80053547523427e-05,
"loss": 2.2764,
"step": 494
},
{
"epoch": 1.9879518072289155,
"grad_norm": 2.635430335998535,
"learning_rate": 6.77376171352075e-05,
"loss": 2.9467,
"step": 495
},
{
"epoch": 1.9919678714859437,
"grad_norm": 2.7158102989196777,
"learning_rate": 6.746987951807229e-05,
"loss": 2.7886,
"step": 496
},
{
"epoch": 1.9959839357429718,
"grad_norm": 2.3272292613983154,
"learning_rate": 6.720214190093708e-05,
"loss": 2.6445,
"step": 497
},
{
"epoch": 2.0,
"grad_norm": 2.2954020500183105,
"learning_rate": 6.693440428380188e-05,
"loss": 2.5719,
"step": 498
},
{
"epoch": 2.0,
"eval_loss": 0.8565791249275208,
"eval_runtime": 200.8505,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 1.245,
"step": 498
},
{
"epoch": 2.004016064257028,
"grad_norm": 2.3647961616516113,
"learning_rate": 6.666666666666667e-05,
"loss": 2.5357,
"step": 499
},
{
"epoch": 2.0080321285140563,
"grad_norm": 2.052393674850464,
"learning_rate": 6.639892904953146e-05,
"loss": 2.1653,
"step": 500
},
{
"epoch": 2.0120481927710845,
"grad_norm": 2.6393344402313232,
"learning_rate": 6.613119143239626e-05,
"loss": 2.2634,
"step": 501
},
{
"epoch": 2.0160642570281126,
"grad_norm": 2.4461183547973633,
"learning_rate": 6.586345381526105e-05,
"loss": 2.7017,
"step": 502
},
{
"epoch": 2.0200803212851404,
"grad_norm": 3.1604115962982178,
"learning_rate": 6.559571619812584e-05,
"loss": 3.6735,
"step": 503
},
{
"epoch": 2.0240963855421685,
"grad_norm": 3.0627472400665283,
"learning_rate": 6.532797858099063e-05,
"loss": 2.9889,
"step": 504
},
{
"epoch": 2.0281124497991967,
"grad_norm": 2.568150520324707,
"learning_rate": 6.506024096385543e-05,
"loss": 2.492,
"step": 505
},
{
"epoch": 2.032128514056225,
"grad_norm": 2.2594618797302246,
"learning_rate": 6.47925033467202e-05,
"loss": 1.8152,
"step": 506
},
{
"epoch": 2.036144578313253,
"grad_norm": 2.544188976287842,
"learning_rate": 6.452476572958501e-05,
"loss": 3.7016,
"step": 507
},
{
"epoch": 2.040160642570281,
"grad_norm": 2.418565511703491,
"learning_rate": 6.42570281124498e-05,
"loss": 2.3062,
"step": 508
},
{
"epoch": 2.0441767068273093,
"grad_norm": 2.3617923259735107,
"learning_rate": 6.398929049531458e-05,
"loss": 2.2887,
"step": 509
},
{
"epoch": 2.0481927710843375,
"grad_norm": 2.4115524291992188,
"learning_rate": 6.372155287817939e-05,
"loss": 2.4596,
"step": 510
},
{
"epoch": 2.0522088353413657,
"grad_norm": 2.763218402862549,
"learning_rate": 6.345381526104418e-05,
"loss": 2.7423,
"step": 511
},
{
"epoch": 2.0562248995983934,
"grad_norm": 2.515378713607788,
"learning_rate": 6.318607764390896e-05,
"loss": 2.4356,
"step": 512
},
{
"epoch": 2.0602409638554215,
"grad_norm": 2.809786796569824,
"learning_rate": 6.291834002677377e-05,
"loss": 3.3361,
"step": 513
},
{
"epoch": 2.0642570281124497,
"grad_norm": 2.3717005252838135,
"learning_rate": 6.265060240963856e-05,
"loss": 3.0205,
"step": 514
},
{
"epoch": 2.068273092369478,
"grad_norm": 2.7689290046691895,
"learning_rate": 6.238286479250335e-05,
"loss": 2.9104,
"step": 515
},
{
"epoch": 2.072289156626506,
"grad_norm": 2.573058843612671,
"learning_rate": 6.211512717536813e-05,
"loss": 2.2966,
"step": 516
},
{
"epoch": 2.076305220883534,
"grad_norm": 2.5662682056427,
"learning_rate": 6.184738955823294e-05,
"loss": 2.4407,
"step": 517
},
{
"epoch": 2.0803212851405624,
"grad_norm": 2.475853681564331,
"learning_rate": 6.157965194109773e-05,
"loss": 2.2512,
"step": 518
},
{
"epoch": 2.0843373493975905,
"grad_norm": 2.426939010620117,
"learning_rate": 6.131191432396251e-05,
"loss": 2.2575,
"step": 519
},
{
"epoch": 2.0883534136546187,
"grad_norm": 2.709951877593994,
"learning_rate": 6.104417670682732e-05,
"loss": 2.2289,
"step": 520
},
{
"epoch": 2.0923694779116464,
"grad_norm": 2.620199680328369,
"learning_rate": 6.0776439089692105e-05,
"loss": 2.6856,
"step": 521
},
{
"epoch": 2.0963855421686746,
"grad_norm": 2.236469030380249,
"learning_rate": 6.05087014725569e-05,
"loss": 2.1652,
"step": 522
},
{
"epoch": 2.1004016064257027,
"grad_norm": 2.4781830310821533,
"learning_rate": 6.02409638554217e-05,
"loss": 2.0519,
"step": 523
},
{
"epoch": 2.104417670682731,
"grad_norm": 2.9179675579071045,
"learning_rate": 5.9973226238286484e-05,
"loss": 2.3534,
"step": 524
},
{
"epoch": 2.108433734939759,
"grad_norm": 2.7088980674743652,
"learning_rate": 5.9705488621151276e-05,
"loss": 2.3717,
"step": 525
},
{
"epoch": 2.112449799196787,
"grad_norm": 2.784228801727295,
"learning_rate": 5.943775100401606e-05,
"loss": 2.7936,
"step": 526
},
{
"epoch": 2.1164658634538154,
"grad_norm": 3.1045587062835693,
"learning_rate": 5.917001338688086e-05,
"loss": 2.1785,
"step": 527
},
{
"epoch": 2.1204819277108435,
"grad_norm": 2.7609670162200928,
"learning_rate": 5.8902275769745655e-05,
"loss": 2.4232,
"step": 528
},
{
"epoch": 2.1244979919678713,
"grad_norm": 2.9791460037231445,
"learning_rate": 5.863453815261044e-05,
"loss": 2.6127,
"step": 529
},
{
"epoch": 2.1285140562248994,
"grad_norm": 2.917396306991577,
"learning_rate": 5.836680053547524e-05,
"loss": 2.5008,
"step": 530
},
{
"epoch": 2.1325301204819276,
"grad_norm": 3.066033124923706,
"learning_rate": 5.809906291834003e-05,
"loss": 2.8997,
"step": 531
},
{
"epoch": 2.1365461847389557,
"grad_norm": 2.570894241333008,
"learning_rate": 5.783132530120482e-05,
"loss": 2.2987,
"step": 532
},
{
"epoch": 2.140562248995984,
"grad_norm": 2.4431967735290527,
"learning_rate": 5.756358768406962e-05,
"loss": 2.1485,
"step": 533
},
{
"epoch": 2.144578313253012,
"grad_norm": 2.789560079574585,
"learning_rate": 5.729585006693441e-05,
"loss": 2.3678,
"step": 534
},
{
"epoch": 2.1485943775100402,
"grad_norm": 2.691913366317749,
"learning_rate": 5.70281124497992e-05,
"loss": 2.3469,
"step": 535
},
{
"epoch": 2.1526104417670684,
"grad_norm": 2.472721815109253,
"learning_rate": 5.676037483266399e-05,
"loss": 2.0741,
"step": 536
},
{
"epoch": 2.1566265060240966,
"grad_norm": 2.705008029937744,
"learning_rate": 5.649263721552879e-05,
"loss": 2.3399,
"step": 537
},
{
"epoch": 2.1606425702811247,
"grad_norm": 2.8036177158355713,
"learning_rate": 5.6224899598393576e-05,
"loss": 2.4336,
"step": 538
},
{
"epoch": 2.1646586345381524,
"grad_norm": 2.8112568855285645,
"learning_rate": 5.595716198125837e-05,
"loss": 2.4039,
"step": 539
},
{
"epoch": 2.1686746987951806,
"grad_norm": 2.932802438735962,
"learning_rate": 5.568942436412317e-05,
"loss": 2.4175,
"step": 540
},
{
"epoch": 2.1726907630522088,
"grad_norm": 3.0952837467193604,
"learning_rate": 5.5421686746987955e-05,
"loss": 2.4552,
"step": 541
},
{
"epoch": 2.176706827309237,
"grad_norm": 2.6719419956207275,
"learning_rate": 5.515394912985275e-05,
"loss": 2.0765,
"step": 542
},
{
"epoch": 2.180722891566265,
"grad_norm": 3.0576534271240234,
"learning_rate": 5.488621151271755e-05,
"loss": 2.417,
"step": 543
},
{
"epoch": 2.1847389558232932,
"grad_norm": 3.0612807273864746,
"learning_rate": 5.461847389558233e-05,
"loss": 2.9868,
"step": 544
},
{
"epoch": 2.1887550200803214,
"grad_norm": 3.5036559104919434,
"learning_rate": 5.4350736278447126e-05,
"loss": 2.7975,
"step": 545
},
{
"epoch": 2.1927710843373496,
"grad_norm": 3.5645198822021484,
"learning_rate": 5.408299866131191e-05,
"loss": 2.8446,
"step": 546
},
{
"epoch": 2.1967871485943773,
"grad_norm": 2.72088360786438,
"learning_rate": 5.381526104417671e-05,
"loss": 2.3907,
"step": 547
},
{
"epoch": 2.2008032128514055,
"grad_norm": 3.901146411895752,
"learning_rate": 5.3547523427041504e-05,
"loss": 3.4091,
"step": 548
},
{
"epoch": 2.2048192771084336,
"grad_norm": 2.9762930870056152,
"learning_rate": 5.327978580990629e-05,
"loss": 2.2808,
"step": 549
},
{
"epoch": 2.208835341365462,
"grad_norm": 3.1252336502075195,
"learning_rate": 5.301204819277109e-05,
"loss": 2.3206,
"step": 550
},
{
"epoch": 2.21285140562249,
"grad_norm": 3.61395525932312,
"learning_rate": 5.274431057563588e-05,
"loss": 2.9899,
"step": 551
},
{
"epoch": 2.216867469879518,
"grad_norm": 3.035787582397461,
"learning_rate": 5.247657295850067e-05,
"loss": 2.2514,
"step": 552
},
{
"epoch": 2.2208835341365463,
"grad_norm": 3.0700008869171143,
"learning_rate": 5.220883534136547e-05,
"loss": 2.7965,
"step": 553
},
{
"epoch": 2.2248995983935744,
"grad_norm": 3.380383253097534,
"learning_rate": 5.194109772423026e-05,
"loss": 2.7258,
"step": 554
},
{
"epoch": 2.2289156626506026,
"grad_norm": 3.3445475101470947,
"learning_rate": 5.167336010709505e-05,
"loss": 3.0532,
"step": 555
},
{
"epoch": 2.2329317269076308,
"grad_norm": 3.305169105529785,
"learning_rate": 5.140562248995984e-05,
"loss": 2.7851,
"step": 556
},
{
"epoch": 2.2369477911646585,
"grad_norm": 3.3952481746673584,
"learning_rate": 5.113788487282464e-05,
"loss": 2.6845,
"step": 557
},
{
"epoch": 2.2409638554216866,
"grad_norm": 2.7673559188842773,
"learning_rate": 5.0870147255689426e-05,
"loss": 2.6067,
"step": 558
},
{
"epoch": 2.244979919678715,
"grad_norm": 3.3448803424835205,
"learning_rate": 5.060240963855422e-05,
"loss": 2.4804,
"step": 559
},
{
"epoch": 2.248995983935743,
"grad_norm": 2.797827959060669,
"learning_rate": 5.033467202141902e-05,
"loss": 2.1237,
"step": 560
},
{
"epoch": 2.253012048192771,
"grad_norm": 2.9383599758148193,
"learning_rate": 5.0066934404283804e-05,
"loss": 2.3107,
"step": 561
},
{
"epoch": 2.2570281124497993,
"grad_norm": 3.0028162002563477,
"learning_rate": 4.97991967871486e-05,
"loss": 3.2211,
"step": 562
},
{
"epoch": 2.2610441767068274,
"grad_norm": 2.928341865539551,
"learning_rate": 4.953145917001339e-05,
"loss": 2.5173,
"step": 563
},
{
"epoch": 2.2650602409638556,
"grad_norm": 2.9720232486724854,
"learning_rate": 4.926372155287818e-05,
"loss": 2.3146,
"step": 564
},
{
"epoch": 2.2690763052208833,
"grad_norm": 3.558094024658203,
"learning_rate": 4.8995983935742975e-05,
"loss": 3.1953,
"step": 565
},
{
"epoch": 2.2730923694779115,
"grad_norm": 3.0352494716644287,
"learning_rate": 4.872824631860777e-05,
"loss": 2.4965,
"step": 566
},
{
"epoch": 2.2771084337349397,
"grad_norm": 2.7428176403045654,
"learning_rate": 4.8460508701472554e-05,
"loss": 2.1514,
"step": 567
},
{
"epoch": 2.281124497991968,
"grad_norm": 2.3594534397125244,
"learning_rate": 4.8192771084337354e-05,
"loss": 1.8075,
"step": 568
},
{
"epoch": 2.285140562248996,
"grad_norm": 3.3449742794036865,
"learning_rate": 4.792503346720215e-05,
"loss": 2.5945,
"step": 569
},
{
"epoch": 2.289156626506024,
"grad_norm": 3.104633331298828,
"learning_rate": 4.765729585006693e-05,
"loss": 2.9666,
"step": 570
},
{
"epoch": 2.2931726907630523,
"grad_norm": 3.094238758087158,
"learning_rate": 4.738955823293173e-05,
"loss": 2.489,
"step": 571
},
{
"epoch": 2.2971887550200805,
"grad_norm": 3.381775379180908,
"learning_rate": 4.712182061579652e-05,
"loss": 2.9042,
"step": 572
},
{
"epoch": 2.3012048192771086,
"grad_norm": 3.2117156982421875,
"learning_rate": 4.685408299866131e-05,
"loss": 2.6925,
"step": 573
},
{
"epoch": 2.305220883534137,
"grad_norm": 2.8267903327941895,
"learning_rate": 4.658634538152611e-05,
"loss": 2.3816,
"step": 574
},
{
"epoch": 2.3092369477911645,
"grad_norm": 3.068437099456787,
"learning_rate": 4.63186077643909e-05,
"loss": 2.3124,
"step": 575
},
{
"epoch": 2.3132530120481927,
"grad_norm": 2.832303762435913,
"learning_rate": 4.605087014725569e-05,
"loss": 2.5169,
"step": 576
},
{
"epoch": 2.317269076305221,
"grad_norm": 2.8893704414367676,
"learning_rate": 4.578313253012048e-05,
"loss": 2.3119,
"step": 577
},
{
"epoch": 2.321285140562249,
"grad_norm": 2.952976703643799,
"learning_rate": 4.5515394912985275e-05,
"loss": 2.3063,
"step": 578
},
{
"epoch": 2.325301204819277,
"grad_norm": 2.7303566932678223,
"learning_rate": 4.524765729585007e-05,
"loss": 2.5834,
"step": 579
},
{
"epoch": 2.3293172690763053,
"grad_norm": 2.9680216312408447,
"learning_rate": 4.497991967871486e-05,
"loss": 2.249,
"step": 580
},
{
"epoch": 2.3333333333333335,
"grad_norm": 2.997044324874878,
"learning_rate": 4.4712182061579654e-05,
"loss": 2.5954,
"step": 581
},
{
"epoch": 2.337349397590361,
"grad_norm": 3.4494729042053223,
"learning_rate": 4.4444444444444447e-05,
"loss": 3.1359,
"step": 582
},
{
"epoch": 2.3413654618473894,
"grad_norm": 3.1353585720062256,
"learning_rate": 4.417670682730924e-05,
"loss": 2.4317,
"step": 583
},
{
"epoch": 2.3453815261044175,
"grad_norm": 2.9816396236419678,
"learning_rate": 4.390896921017403e-05,
"loss": 2.8438,
"step": 584
},
{
"epoch": 2.3493975903614457,
"grad_norm": 2.6249794960021973,
"learning_rate": 4.3641231593038825e-05,
"loss": 2.0497,
"step": 585
},
{
"epoch": 2.353413654618474,
"grad_norm": 2.8994345664978027,
"learning_rate": 4.337349397590362e-05,
"loss": 2.149,
"step": 586
},
{
"epoch": 2.357429718875502,
"grad_norm": 3.8927950859069824,
"learning_rate": 4.3105756358768404e-05,
"loss": 3.0218,
"step": 587
},
{
"epoch": 2.36144578313253,
"grad_norm": 3.120274543762207,
"learning_rate": 4.2838018741633203e-05,
"loss": 2.1973,
"step": 588
},
{
"epoch": 2.3654618473895583,
"grad_norm": 3.104851007461548,
"learning_rate": 4.2570281124497996e-05,
"loss": 2.3442,
"step": 589
},
{
"epoch": 2.3694779116465865,
"grad_norm": 2.97161602973938,
"learning_rate": 4.230254350736278e-05,
"loss": 2.5706,
"step": 590
},
{
"epoch": 2.3734939759036147,
"grad_norm": 2.6856470108032227,
"learning_rate": 4.203480589022758e-05,
"loss": 2.0781,
"step": 591
},
{
"epoch": 2.3775100401606424,
"grad_norm": 2.9654481410980225,
"learning_rate": 4.176706827309237e-05,
"loss": 2.2495,
"step": 592
},
{
"epoch": 2.3815261044176705,
"grad_norm": 2.861020088195801,
"learning_rate": 4.149933065595716e-05,
"loss": 1.9942,
"step": 593
},
{
"epoch": 2.3855421686746987,
"grad_norm": 3.413158893585205,
"learning_rate": 4.123159303882196e-05,
"loss": 2.6585,
"step": 594
},
{
"epoch": 2.389558232931727,
"grad_norm": 3.1313233375549316,
"learning_rate": 4.0963855421686746e-05,
"loss": 2.9493,
"step": 595
},
{
"epoch": 2.393574297188755,
"grad_norm": 3.325638771057129,
"learning_rate": 4.069611780455154e-05,
"loss": 2.7101,
"step": 596
},
{
"epoch": 2.397590361445783,
"grad_norm": 2.991661787033081,
"learning_rate": 4.042838018741633e-05,
"loss": 2.5683,
"step": 597
},
{
"epoch": 2.4016064257028114,
"grad_norm": 3.0619921684265137,
"learning_rate": 4.0160642570281125e-05,
"loss": 2.5722,
"step": 598
},
{
"epoch": 2.4056224899598395,
"grad_norm": 2.730375289916992,
"learning_rate": 3.989290495314592e-05,
"loss": 2.2107,
"step": 599
},
{
"epoch": 2.4096385542168672,
"grad_norm": 2.5859103202819824,
"learning_rate": 3.962516733601071e-05,
"loss": 2.0576,
"step": 600
},
{
"epoch": 2.4136546184738954,
"grad_norm": 2.8956499099731445,
"learning_rate": 3.93574297188755e-05,
"loss": 2.1889,
"step": 601
},
{
"epoch": 2.4176706827309236,
"grad_norm": 2.575547933578491,
"learning_rate": 3.9089692101740296e-05,
"loss": 1.9322,
"step": 602
},
{
"epoch": 2.4216867469879517,
"grad_norm": 3.3304378986358643,
"learning_rate": 3.882195448460509e-05,
"loss": 2.4677,
"step": 603
},
{
"epoch": 2.42570281124498,
"grad_norm": 3.5554420948028564,
"learning_rate": 3.855421686746988e-05,
"loss": 2.6703,
"step": 604
},
{
"epoch": 2.429718875502008,
"grad_norm": 3.415844440460205,
"learning_rate": 3.8286479250334675e-05,
"loss": 2.9157,
"step": 605
},
{
"epoch": 2.433734939759036,
"grad_norm": 3.127218008041382,
"learning_rate": 3.801874163319947e-05,
"loss": 2.416,
"step": 606
},
{
"epoch": 2.4377510040160644,
"grad_norm": 3.796701192855835,
"learning_rate": 3.7751004016064253e-05,
"loss": 2.3505,
"step": 607
},
{
"epoch": 2.4417670682730925,
"grad_norm": 3.6044912338256836,
"learning_rate": 3.748326639892905e-05,
"loss": 2.8561,
"step": 608
},
{
"epoch": 2.4457831325301207,
"grad_norm": 3.2551517486572266,
"learning_rate": 3.7215528781793846e-05,
"loss": 2.5376,
"step": 609
},
{
"epoch": 2.4497991967871484,
"grad_norm": 2.890302896499634,
"learning_rate": 3.694779116465863e-05,
"loss": 2.2256,
"step": 610
},
{
"epoch": 2.4538152610441766,
"grad_norm": 3.478085517883301,
"learning_rate": 3.668005354752343e-05,
"loss": 2.6602,
"step": 611
},
{
"epoch": 2.4578313253012047,
"grad_norm": 3.682518720626831,
"learning_rate": 3.641231593038822e-05,
"loss": 2.8083,
"step": 612
},
{
"epoch": 2.461847389558233,
"grad_norm": 2.841364860534668,
"learning_rate": 3.614457831325301e-05,
"loss": 2.0827,
"step": 613
},
{
"epoch": 2.465863453815261,
"grad_norm": 2.784315347671509,
"learning_rate": 3.587684069611781e-05,
"loss": 3.9997,
"step": 614
},
{
"epoch": 2.4698795180722892,
"grad_norm": 3.153395652770996,
"learning_rate": 3.5609103078982596e-05,
"loss": 2.3443,
"step": 615
},
{
"epoch": 2.4738955823293174,
"grad_norm": 3.2817304134368896,
"learning_rate": 3.534136546184739e-05,
"loss": 2.6729,
"step": 616
},
{
"epoch": 2.4779116465863456,
"grad_norm": 2.8291358947753906,
"learning_rate": 3.507362784471218e-05,
"loss": 2.1918,
"step": 617
},
{
"epoch": 2.4819277108433733,
"grad_norm": 3.548492670059204,
"learning_rate": 3.4805890227576974e-05,
"loss": 3.5277,
"step": 618
},
{
"epoch": 2.4859437751004014,
"grad_norm": 9.622389793395996,
"learning_rate": 3.4538152610441774e-05,
"loss": 3.3926,
"step": 619
},
{
"epoch": 2.4899598393574296,
"grad_norm": 3.489105224609375,
"learning_rate": 3.427041499330656e-05,
"loss": 2.5828,
"step": 620
},
{
"epoch": 2.4939759036144578,
"grad_norm": 2.7694857120513916,
"learning_rate": 3.400267737617135e-05,
"loss": 1.9917,
"step": 621
},
{
"epoch": 2.497991967871486,
"grad_norm": 3.2993392944335938,
"learning_rate": 3.3734939759036146e-05,
"loss": 2.8177,
"step": 622
},
{
"epoch": 2.502008032128514,
"grad_norm": 2.863051176071167,
"learning_rate": 3.346720214190094e-05,
"loss": 2.0999,
"step": 623
},
{
"epoch": 2.5060240963855422,
"grad_norm": 3.025731086730957,
"learning_rate": 3.319946452476573e-05,
"loss": 2.555,
"step": 624
},
{
"epoch": 2.5100401606425704,
"grad_norm": 3.236588716506958,
"learning_rate": 3.2931726907630524e-05,
"loss": 2.3746,
"step": 625
},
{
"epoch": 2.5140562248995986,
"grad_norm": 3.071715831756592,
"learning_rate": 3.266398929049532e-05,
"loss": 2.1943,
"step": 626
},
{
"epoch": 2.5180722891566267,
"grad_norm": 3.353304147720337,
"learning_rate": 3.23962516733601e-05,
"loss": 3.2267,
"step": 627
},
{
"epoch": 2.522088353413655,
"grad_norm": 2.9166722297668457,
"learning_rate": 3.21285140562249e-05,
"loss": 2.5768,
"step": 628
},
{
"epoch": 2.5261044176706826,
"grad_norm": 2.571737051010132,
"learning_rate": 3.1860776439089695e-05,
"loss": 2.4097,
"step": 629
},
{
"epoch": 2.5301204819277108,
"grad_norm": 3.2051124572753906,
"learning_rate": 3.159303882195448e-05,
"loss": 2.6875,
"step": 630
},
{
"epoch": 2.534136546184739,
"grad_norm": 3.414586067199707,
"learning_rate": 3.132530120481928e-05,
"loss": 2.467,
"step": 631
},
{
"epoch": 2.538152610441767,
"grad_norm": 3.201895236968994,
"learning_rate": 3.105756358768407e-05,
"loss": 2.6332,
"step": 632
},
{
"epoch": 2.5421686746987953,
"grad_norm": 3.2875518798828125,
"learning_rate": 3.078982597054887e-05,
"loss": 3.0367,
"step": 633
},
{
"epoch": 2.5461847389558234,
"grad_norm": 2.6989524364471436,
"learning_rate": 3.052208835341366e-05,
"loss": 2.1665,
"step": 634
},
{
"epoch": 2.550200803212851,
"grad_norm": 2.7747488021850586,
"learning_rate": 3.025435073627845e-05,
"loss": 2.1499,
"step": 635
},
{
"epoch": 2.5542168674698793,
"grad_norm": 3.4082605838775635,
"learning_rate": 2.9986613119143242e-05,
"loss": 2.6462,
"step": 636
},
{
"epoch": 2.5582329317269075,
"grad_norm": 2.713757276535034,
"learning_rate": 2.971887550200803e-05,
"loss": 2.09,
"step": 637
},
{
"epoch": 2.5622489959839356,
"grad_norm": 3.2788338661193848,
"learning_rate": 2.9451137884872827e-05,
"loss": 2.3322,
"step": 638
},
{
"epoch": 2.566265060240964,
"grad_norm": 2.6642184257507324,
"learning_rate": 2.918340026773762e-05,
"loss": 2.1751,
"step": 639
},
{
"epoch": 2.570281124497992,
"grad_norm": 3.069793224334717,
"learning_rate": 2.891566265060241e-05,
"loss": 2.2499,
"step": 640
},
{
"epoch": 2.57429718875502,
"grad_norm": 3.132709503173828,
"learning_rate": 2.8647925033467206e-05,
"loss": 2.585,
"step": 641
},
{
"epoch": 2.5783132530120483,
"grad_norm": 3.27109432220459,
"learning_rate": 2.8380187416331995e-05,
"loss": 2.4458,
"step": 642
},
{
"epoch": 2.5823293172690764,
"grad_norm": 3.5450148582458496,
"learning_rate": 2.8112449799196788e-05,
"loss": 3.8692,
"step": 643
},
{
"epoch": 2.5863453815261046,
"grad_norm": 3.2768943309783936,
"learning_rate": 2.7844712182061584e-05,
"loss": 2.4152,
"step": 644
},
{
"epoch": 2.5903614457831328,
"grad_norm": 3.1916306018829346,
"learning_rate": 2.7576974564926374e-05,
"loss": 2.5376,
"step": 645
},
{
"epoch": 2.5943775100401605,
"grad_norm": 2.7519237995147705,
"learning_rate": 2.7309236947791167e-05,
"loss": 2.1762,
"step": 646
},
{
"epoch": 2.5983935742971886,
"grad_norm": 3.649415969848633,
"learning_rate": 2.7041499330655956e-05,
"loss": 3.0767,
"step": 647
},
{
"epoch": 2.602409638554217,
"grad_norm": 3.1575088500976562,
"learning_rate": 2.6773761713520752e-05,
"loss": 2.5746,
"step": 648
},
{
"epoch": 2.606425702811245,
"grad_norm": 3.1661970615386963,
"learning_rate": 2.6506024096385545e-05,
"loss": 2.8486,
"step": 649
},
{
"epoch": 2.610441767068273,
"grad_norm": 3.374446392059326,
"learning_rate": 2.6238286479250334e-05,
"loss": 3.0536,
"step": 650
},
{
"epoch": 2.6144578313253013,
"grad_norm": 3.2961578369140625,
"learning_rate": 2.597054886211513e-05,
"loss": 2.403,
"step": 651
},
{
"epoch": 2.6184738955823295,
"grad_norm": 3.078670024871826,
"learning_rate": 2.570281124497992e-05,
"loss": 2.0923,
"step": 652
},
{
"epoch": 2.622489959839357,
"grad_norm": 3.625155448913574,
"learning_rate": 2.5435073627844713e-05,
"loss": 3.3948,
"step": 653
},
{
"epoch": 2.6265060240963853,
"grad_norm": 3.2434301376342773,
"learning_rate": 2.516733601070951e-05,
"loss": 3.0131,
"step": 654
},
{
"epoch": 2.6305220883534135,
"grad_norm": 3.321974515914917,
"learning_rate": 2.48995983935743e-05,
"loss": 2.5972,
"step": 655
},
{
"epoch": 2.6345381526104417,
"grad_norm": 2.6846182346343994,
"learning_rate": 2.463186077643909e-05,
"loss": 2.2812,
"step": 656
},
{
"epoch": 2.63855421686747,
"grad_norm": 2.814183235168457,
"learning_rate": 2.4364123159303884e-05,
"loss": 2.1195,
"step": 657
},
{
"epoch": 2.642570281124498,
"grad_norm": 2.640397310256958,
"learning_rate": 2.4096385542168677e-05,
"loss": 2.1728,
"step": 658
},
{
"epoch": 2.646586345381526,
"grad_norm": 3.7056844234466553,
"learning_rate": 2.3828647925033466e-05,
"loss": 2.8224,
"step": 659
},
{
"epoch": 2.6506024096385543,
"grad_norm": 2.740823268890381,
"learning_rate": 2.356091030789826e-05,
"loss": 2.3886,
"step": 660
},
{
"epoch": 2.6546184738955825,
"grad_norm": 2.689279079437256,
"learning_rate": 2.3293172690763055e-05,
"loss": 2.3151,
"step": 661
},
{
"epoch": 2.6586345381526106,
"grad_norm": 3.4579248428344727,
"learning_rate": 2.3025435073627845e-05,
"loss": 2.7812,
"step": 662
},
{
"epoch": 2.662650602409639,
"grad_norm": 3.293381690979004,
"learning_rate": 2.2757697456492638e-05,
"loss": 2.9381,
"step": 663
},
{
"epoch": 2.6666666666666665,
"grad_norm": 3.3860654830932617,
"learning_rate": 2.248995983935743e-05,
"loss": 2.4111,
"step": 664
},
{
"epoch": 2.6706827309236947,
"grad_norm": 3.3504996299743652,
"learning_rate": 2.2222222222222223e-05,
"loss": 2.4411,
"step": 665
},
{
"epoch": 2.674698795180723,
"grad_norm": 3.2323498725891113,
"learning_rate": 2.1954484605087016e-05,
"loss": 2.6294,
"step": 666
},
{
"epoch": 2.678714859437751,
"grad_norm": 2.935426950454712,
"learning_rate": 2.168674698795181e-05,
"loss": 2.5489,
"step": 667
},
{
"epoch": 2.682730923694779,
"grad_norm": 3.483436346054077,
"learning_rate": 2.1419009370816602e-05,
"loss": 2.7512,
"step": 668
},
{
"epoch": 2.6867469879518073,
"grad_norm": 3.4001944065093994,
"learning_rate": 2.115127175368139e-05,
"loss": 2.4015,
"step": 669
},
{
"epoch": 2.6907630522088355,
"grad_norm": 3.6413683891296387,
"learning_rate": 2.0883534136546184e-05,
"loss": 3.5122,
"step": 670
},
{
"epoch": 2.694779116465863,
"grad_norm": 2.5411088466644287,
"learning_rate": 2.061579651941098e-05,
"loss": 2.0925,
"step": 671
},
{
"epoch": 2.6987951807228914,
"grad_norm": 3.1367125511169434,
"learning_rate": 2.034805890227577e-05,
"loss": 2.5457,
"step": 672
},
{
"epoch": 2.7028112449799195,
"grad_norm": 3.300114393234253,
"learning_rate": 2.0080321285140562e-05,
"loss": 3.0402,
"step": 673
},
{
"epoch": 2.7068273092369477,
"grad_norm": 2.744513750076294,
"learning_rate": 1.9812583668005355e-05,
"loss": 2.2273,
"step": 674
},
{
"epoch": 2.710843373493976,
"grad_norm": 3.0049889087677,
"learning_rate": 1.9544846050870148e-05,
"loss": 2.4656,
"step": 675
},
{
"epoch": 2.714859437751004,
"grad_norm": 2.9064860343933105,
"learning_rate": 1.927710843373494e-05,
"loss": 2.3855,
"step": 676
},
{
"epoch": 2.718875502008032,
"grad_norm": 3.317073106765747,
"learning_rate": 1.9009370816599734e-05,
"loss": 2.7036,
"step": 677
},
{
"epoch": 2.7228915662650603,
"grad_norm": 3.580209732055664,
"learning_rate": 1.8741633199464527e-05,
"loss": 2.4416,
"step": 678
},
{
"epoch": 2.7269076305220885,
"grad_norm": 3.0195388793945312,
"learning_rate": 1.8473895582329316e-05,
"loss": 2.0284,
"step": 679
},
{
"epoch": 2.7309236947791167,
"grad_norm": 3.5155584812164307,
"learning_rate": 1.820615796519411e-05,
"loss": 3.6898,
"step": 680
},
{
"epoch": 2.734939759036145,
"grad_norm": 3.3643851280212402,
"learning_rate": 1.7938420348058905e-05,
"loss": 2.7534,
"step": 681
},
{
"epoch": 2.7389558232931726,
"grad_norm": 3.949350595474243,
"learning_rate": 1.7670682730923694e-05,
"loss": 3.6933,
"step": 682
},
{
"epoch": 2.7429718875502007,
"grad_norm": 2.7811617851257324,
"learning_rate": 1.7402945113788487e-05,
"loss": 2.0857,
"step": 683
},
{
"epoch": 2.746987951807229,
"grad_norm": 3.3071796894073486,
"learning_rate": 1.713520749665328e-05,
"loss": 2.9454,
"step": 684
},
{
"epoch": 2.751004016064257,
"grad_norm": 3.181541919708252,
"learning_rate": 1.6867469879518073e-05,
"loss": 2.4977,
"step": 685
},
{
"epoch": 2.755020080321285,
"grad_norm": 2.8570432662963867,
"learning_rate": 1.6599732262382866e-05,
"loss": 2.2448,
"step": 686
},
{
"epoch": 2.7590361445783134,
"grad_norm": 2.8519392013549805,
"learning_rate": 1.633199464524766e-05,
"loss": 2.0659,
"step": 687
},
{
"epoch": 2.7630522088353415,
"grad_norm": 3.0057828426361084,
"learning_rate": 1.606425702811245e-05,
"loss": 2.711,
"step": 688
},
{
"epoch": 2.7670682730923692,
"grad_norm": 3.7644693851470947,
"learning_rate": 1.579651941097724e-05,
"loss": 2.7368,
"step": 689
},
{
"epoch": 2.7710843373493974,
"grad_norm": 3.339076519012451,
"learning_rate": 1.5528781793842034e-05,
"loss": 2.4372,
"step": 690
},
{
"epoch": 2.7751004016064256,
"grad_norm": 3.3303468227386475,
"learning_rate": 1.526104417670683e-05,
"loss": 2.1496,
"step": 691
},
{
"epoch": 2.7791164658634537,
"grad_norm": 3.007516384124756,
"learning_rate": 1.4993306559571621e-05,
"loss": 2.0637,
"step": 692
},
{
"epoch": 2.783132530120482,
"grad_norm": 3.2054901123046875,
"learning_rate": 1.4725568942436414e-05,
"loss": 2.6325,
"step": 693
},
{
"epoch": 2.78714859437751,
"grad_norm": 3.089660882949829,
"learning_rate": 1.4457831325301205e-05,
"loss": 2.6186,
"step": 694
},
{
"epoch": 2.791164658634538,
"grad_norm": 3.6075477600097656,
"learning_rate": 1.4190093708165998e-05,
"loss": 3.04,
"step": 695
},
{
"epoch": 2.7951807228915664,
"grad_norm": 2.9559810161590576,
"learning_rate": 1.3922356091030792e-05,
"loss": 2.1752,
"step": 696
},
{
"epoch": 2.7991967871485945,
"grad_norm": 3.062072992324829,
"learning_rate": 1.3654618473895583e-05,
"loss": 2.0509,
"step": 697
},
{
"epoch": 2.8032128514056227,
"grad_norm": 4.112563610076904,
"learning_rate": 1.3386880856760376e-05,
"loss": 2.937,
"step": 698
},
{
"epoch": 2.807228915662651,
"grad_norm": 3.2194480895996094,
"learning_rate": 1.3119143239625167e-05,
"loss": 2.2974,
"step": 699
},
{
"epoch": 2.8112449799196786,
"grad_norm": 3.2111270427703857,
"learning_rate": 1.285140562248996e-05,
"loss": 2.3903,
"step": 700
},
{
"epoch": 2.8152610441767068,
"grad_norm": 3.1619982719421387,
"learning_rate": 1.2583668005354755e-05,
"loss": 2.154,
"step": 701
},
{
"epoch": 2.819277108433735,
"grad_norm": 3.0533196926116943,
"learning_rate": 1.2315930388219546e-05,
"loss": 2.8862,
"step": 702
},
{
"epoch": 2.823293172690763,
"grad_norm": 2.838397264480591,
"learning_rate": 1.2048192771084338e-05,
"loss": 2.1974,
"step": 703
},
{
"epoch": 2.8273092369477912,
"grad_norm": 2.960359573364258,
"learning_rate": 1.178045515394913e-05,
"loss": 2.2714,
"step": 704
},
{
"epoch": 2.8313253012048194,
"grad_norm": 3.3387844562530518,
"learning_rate": 1.1512717536813922e-05,
"loss": 2.5617,
"step": 705
},
{
"epoch": 2.835341365461847,
"grad_norm": 3.802029609680176,
"learning_rate": 1.1244979919678715e-05,
"loss": 2.6791,
"step": 706
},
{
"epoch": 2.8393574297188753,
"grad_norm": 3.0797119140625,
"learning_rate": 1.0977242302543508e-05,
"loss": 2.008,
"step": 707
},
{
"epoch": 2.8433734939759034,
"grad_norm": 3.6929612159729004,
"learning_rate": 1.0709504685408301e-05,
"loss": 3.0253,
"step": 708
},
{
"epoch": 2.8473895582329316,
"grad_norm": 3.409666061401367,
"learning_rate": 1.0441767068273092e-05,
"loss": 2.488,
"step": 709
},
{
"epoch": 2.8514056224899598,
"grad_norm": 3.4419896602630615,
"learning_rate": 1.0174029451137885e-05,
"loss": 2.5107,
"step": 710
},
{
"epoch": 2.855421686746988,
"grad_norm": 2.9970462322235107,
"learning_rate": 9.906291834002678e-06,
"loss": 2.4561,
"step": 711
},
{
"epoch": 2.859437751004016,
"grad_norm": 2.9567370414733887,
"learning_rate": 9.63855421686747e-06,
"loss": 2.0972,
"step": 712
},
{
"epoch": 2.8634538152610443,
"grad_norm": 3.134462356567383,
"learning_rate": 9.370816599732263e-06,
"loss": 2.4256,
"step": 713
},
{
"epoch": 2.8674698795180724,
"grad_norm": 3.376096487045288,
"learning_rate": 9.103078982597054e-06,
"loss": 2.221,
"step": 714
},
{
"epoch": 2.8714859437751006,
"grad_norm": 3.569254159927368,
"learning_rate": 8.835341365461847e-06,
"loss": 2.379,
"step": 715
},
{
"epoch": 2.8755020080321287,
"grad_norm": 3.4028611183166504,
"learning_rate": 8.56760374832664e-06,
"loss": 2.3297,
"step": 716
},
{
"epoch": 2.8795180722891565,
"grad_norm": 3.772540807723999,
"learning_rate": 8.299866131191433e-06,
"loss": 2.9839,
"step": 717
},
{
"epoch": 2.8835341365461846,
"grad_norm": 3.2679340839385986,
"learning_rate": 8.032128514056226e-06,
"loss": 2.3875,
"step": 718
},
{
"epoch": 2.887550200803213,
"grad_norm": 3.6074769496917725,
"learning_rate": 7.764390896921017e-06,
"loss": 2.9021,
"step": 719
},
{
"epoch": 2.891566265060241,
"grad_norm": 3.7479116916656494,
"learning_rate": 7.4966532797858104e-06,
"loss": 2.5803,
"step": 720
},
{
"epoch": 2.895582329317269,
"grad_norm": 3.051452875137329,
"learning_rate": 7.228915662650602e-06,
"loss": 2.9504,
"step": 721
},
{
"epoch": 2.8995983935742973,
"grad_norm": 3.341724157333374,
"learning_rate": 6.961178045515396e-06,
"loss": 2.8643,
"step": 722
},
{
"epoch": 2.9036144578313254,
"grad_norm": 2.8065922260284424,
"learning_rate": 6.693440428380188e-06,
"loss": 2.6456,
"step": 723
},
{
"epoch": 2.907630522088353,
"grad_norm": 3.295828342437744,
"learning_rate": 6.42570281124498e-06,
"loss": 3.2691,
"step": 724
},
{
"epoch": 2.9116465863453813,
"grad_norm": 3.15494966506958,
"learning_rate": 6.157965194109773e-06,
"loss": 2.3256,
"step": 725
},
{
"epoch": 2.9156626506024095,
"grad_norm": 3.146188259124756,
"learning_rate": 5.890227576974565e-06,
"loss": 2.5247,
"step": 726
},
{
"epoch": 2.9196787148594376,
"grad_norm": 3.042181968688965,
"learning_rate": 5.622489959839358e-06,
"loss": 2.3458,
"step": 727
},
{
"epoch": 2.923694779116466,
"grad_norm": 2.8072509765625,
"learning_rate": 5.3547523427041504e-06,
"loss": 2.2129,
"step": 728
},
{
"epoch": 2.927710843373494,
"grad_norm": 3.1902520656585693,
"learning_rate": 5.087014725568942e-06,
"loss": 2.1905,
"step": 729
},
{
"epoch": 2.931726907630522,
"grad_norm": 3.706218719482422,
"learning_rate": 4.819277108433735e-06,
"loss": 2.8587,
"step": 730
},
{
"epoch": 2.9357429718875503,
"grad_norm": 3.516908645629883,
"learning_rate": 4.551539491298527e-06,
"loss": 3.0003,
"step": 731
},
{
"epoch": 2.9397590361445785,
"grad_norm": 3.9051806926727295,
"learning_rate": 4.28380187416332e-06,
"loss": 2.6986,
"step": 732
},
{
"epoch": 2.9437751004016066,
"grad_norm": 2.434493064880371,
"learning_rate": 4.016064257028113e-06,
"loss": 2.0143,
"step": 733
},
{
"epoch": 2.9477911646586348,
"grad_norm": 3.514988899230957,
"learning_rate": 3.7483266398929052e-06,
"loss": 2.5539,
"step": 734
},
{
"epoch": 2.9518072289156625,
"grad_norm": 3.145475387573242,
"learning_rate": 3.480589022757698e-06,
"loss": 2.3991,
"step": 735
},
{
"epoch": 2.9558232931726907,
"grad_norm": 3.0328280925750732,
"learning_rate": 3.21285140562249e-06,
"loss": 2.4384,
"step": 736
},
{
"epoch": 2.959839357429719,
"grad_norm": 3.584406614303589,
"learning_rate": 2.9451137884872824e-06,
"loss": 2.219,
"step": 737
},
{
"epoch": 2.963855421686747,
"grad_norm": 2.8902695178985596,
"learning_rate": 2.6773761713520752e-06,
"loss": 2.0701,
"step": 738
},
{
"epoch": 2.967871485943775,
"grad_norm": 2.714848518371582,
"learning_rate": 2.4096385542168676e-06,
"loss": 2.3578,
"step": 739
},
{
"epoch": 2.9718875502008033,
"grad_norm": 3.4589223861694336,
"learning_rate": 2.14190093708166e-06,
"loss": 2.4076,
"step": 740
},
{
"epoch": 2.9759036144578315,
"grad_norm": 2.8250577449798584,
"learning_rate": 1.8741633199464526e-06,
"loss": 2.2688,
"step": 741
},
{
"epoch": 2.979919678714859,
"grad_norm": 3.090301752090454,
"learning_rate": 1.606425702811245e-06,
"loss": 2.0527,
"step": 742
},
{
"epoch": 2.9839357429718874,
"grad_norm": 3.82488751411438,
"learning_rate": 1.3386880856760376e-06,
"loss": 2.9784,
"step": 743
},
{
"epoch": 2.9879518072289155,
"grad_norm": 3.046949863433838,
"learning_rate": 1.07095046854083e-06,
"loss": 2.987,
"step": 744
},
{
"epoch": 2.9919678714859437,
"grad_norm": 3.08667254447937,
"learning_rate": 8.032128514056225e-07,
"loss": 2.3121,
"step": 745
},
{
"epoch": 2.995983935742972,
"grad_norm": 3.114004611968994,
"learning_rate": 5.35475234270415e-07,
"loss": 2.4549,
"step": 746
},
{
"epoch": 3.0,
"grad_norm": 3.1294381618499756,
"learning_rate": 2.677376171352075e-07,
"loss": 2.2527,
"step": 747
},
{
"epoch": 3.0,
"eval_loss": 0.8732815980911255,
"eval_runtime": 201.6297,
"eval_samples_per_second": 2.475,
"eval_steps_per_second": 1.24,
"step": 747
}
],
"logging_steps": 1,
"max_steps": 747,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.0605631120002253e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}