{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024554941682013503,
"grad_norm": 4.53125,
"learning_rate": 1.2500000000000002e-07,
"loss": 1.3745,
"step": 1
},
{
"epoch": 0.004910988336402701,
"grad_norm": 4.4375,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.5178,
"step": 2
},
{
"epoch": 0.007366482504604052,
"grad_norm": 4.46875,
"learning_rate": 3.7500000000000006e-07,
"loss": 1.5165,
"step": 3
},
{
"epoch": 0.009821976672805401,
"grad_norm": 4.90625,
"learning_rate": 5.000000000000001e-07,
"loss": 1.5066,
"step": 4
},
{
"epoch": 0.012277470841006752,
"grad_norm": 4.28125,
"learning_rate": 6.25e-07,
"loss": 1.4756,
"step": 5
},
{
"epoch": 0.014732965009208104,
"grad_norm": 6.4375,
"learning_rate": 7.500000000000001e-07,
"loss": 1.6926,
"step": 6
},
{
"epoch": 0.017188459177409455,
"grad_norm": 4.4375,
"learning_rate": 8.750000000000001e-07,
"loss": 1.4304,
"step": 7
},
{
"epoch": 0.019643953345610803,
"grad_norm": 4.40625,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.5449,
"step": 8
},
{
"epoch": 0.022099447513812154,
"grad_norm": 4.28125,
"learning_rate": 1.125e-06,
"loss": 1.4239,
"step": 9
},
{
"epoch": 0.024554941682013505,
"grad_norm": 4.59375,
"learning_rate": 1.25e-06,
"loss": 1.5942,
"step": 10
},
{
"epoch": 0.027010435850214856,
"grad_norm": 4.5,
"learning_rate": 1.375e-06,
"loss": 1.3251,
"step": 11
},
{
"epoch": 0.029465930018416207,
"grad_norm": 4.3125,
"learning_rate": 1.5000000000000002e-06,
"loss": 1.398,
"step": 12
},
{
"epoch": 0.03192142418661756,
"grad_norm": 4.4375,
"learning_rate": 1.6250000000000001e-06,
"loss": 1.4457,
"step": 13
},
{
"epoch": 0.03437691835481891,
"grad_norm": 4.8125,
"learning_rate": 1.7500000000000002e-06,
"loss": 1.7422,
"step": 14
},
{
"epoch": 0.03683241252302026,
"grad_norm": 4.5625,
"learning_rate": 1.8750000000000003e-06,
"loss": 1.403,
"step": 15
},
{
"epoch": 0.039287906691221605,
"grad_norm": 4.59375,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.5114,
"step": 16
},
{
"epoch": 0.041743400859422956,
"grad_norm": 4.3125,
"learning_rate": 2.125e-06,
"loss": 1.4917,
"step": 17
},
{
"epoch": 0.04419889502762431,
"grad_norm": 4.6875,
"learning_rate": 2.25e-06,
"loss": 1.6311,
"step": 18
},
{
"epoch": 0.04665438919582566,
"grad_norm": 4.53125,
"learning_rate": 2.375e-06,
"loss": 1.5155,
"step": 19
},
{
"epoch": 0.04910988336402701,
"grad_norm": 4.15625,
"learning_rate": 2.5e-06,
"loss": 1.3574,
"step": 20
},
{
"epoch": 0.05156537753222836,
"grad_norm": 4.03125,
"learning_rate": 2.6250000000000003e-06,
"loss": 1.356,
"step": 21
},
{
"epoch": 0.05402087170042971,
"grad_norm": 4.125,
"learning_rate": 2.75e-06,
"loss": 1.3505,
"step": 22
},
{
"epoch": 0.056476365868631064,
"grad_norm": 4.5625,
"learning_rate": 2.875e-06,
"loss": 1.4953,
"step": 23
},
{
"epoch": 0.058931860036832415,
"grad_norm": 3.78125,
"learning_rate": 3.0000000000000005e-06,
"loss": 1.3245,
"step": 24
},
{
"epoch": 0.061387354205033766,
"grad_norm": 4.625,
"learning_rate": 3.125e-06,
"loss": 1.7409,
"step": 25
},
{
"epoch": 0.06384284837323512,
"grad_norm": 4.0625,
"learning_rate": 3.2500000000000002e-06,
"loss": 1.4082,
"step": 26
},
{
"epoch": 0.06629834254143646,
"grad_norm": 3.90625,
"learning_rate": 3.375e-06,
"loss": 1.4205,
"step": 27
},
{
"epoch": 0.06875383670963782,
"grad_norm": 3.8125,
"learning_rate": 3.5000000000000004e-06,
"loss": 1.4518,
"step": 28
},
{
"epoch": 0.07120933087783916,
"grad_norm": 3.8125,
"learning_rate": 3.625e-06,
"loss": 1.5376,
"step": 29
},
{
"epoch": 0.07366482504604052,
"grad_norm": 3.828125,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.3659,
"step": 30
},
{
"epoch": 0.07612031921424187,
"grad_norm": 4.71875,
"learning_rate": 3.875e-06,
"loss": 1.5217,
"step": 31
},
{
"epoch": 0.07857581338244321,
"grad_norm": 4.75,
"learning_rate": 4.000000000000001e-06,
"loss": 1.6483,
"step": 32
},
{
"epoch": 0.08103130755064457,
"grad_norm": 3.96875,
"learning_rate": 4.125e-06,
"loss": 1.4399,
"step": 33
},
{
"epoch": 0.08348680171884591,
"grad_norm": 3.5625,
"learning_rate": 4.25e-06,
"loss": 1.4659,
"step": 34
},
{
"epoch": 0.08594229588704727,
"grad_norm": 3.78125,
"learning_rate": 4.3750000000000005e-06,
"loss": 1.3418,
"step": 35
},
{
"epoch": 0.08839779005524862,
"grad_norm": 3.6875,
"learning_rate": 4.5e-06,
"loss": 1.5274,
"step": 36
},
{
"epoch": 0.09085328422344997,
"grad_norm": 3.546875,
"learning_rate": 4.625000000000001e-06,
"loss": 1.4488,
"step": 37
},
{
"epoch": 0.09330877839165132,
"grad_norm": 2.953125,
"learning_rate": 4.75e-06,
"loss": 1.3321,
"step": 38
},
{
"epoch": 0.09576427255985268,
"grad_norm": 3.265625,
"learning_rate": 4.875000000000001e-06,
"loss": 1.6221,
"step": 39
},
{
"epoch": 0.09821976672805402,
"grad_norm": 3.1875,
"learning_rate": 5e-06,
"loss": 1.5383,
"step": 40
},
{
"epoch": 0.10067526089625538,
"grad_norm": 3.171875,
"learning_rate": 4.99941792782305e-06,
"loss": 1.4365,
"step": 41
},
{
"epoch": 0.10313075506445672,
"grad_norm": 2.640625,
"learning_rate": 4.998834498834499e-06,
"loss": 1.403,
"step": 42
},
{
"epoch": 0.10558624923265807,
"grad_norm": 2.734375,
"learning_rate": 4.998249708284714e-06,
"loss": 1.3352,
"step": 43
},
{
"epoch": 0.10804174340085942,
"grad_norm": 2.46875,
"learning_rate": 4.99766355140187e-06,
"loss": 1.4499,
"step": 44
},
{
"epoch": 0.11049723756906077,
"grad_norm": 2.296875,
"learning_rate": 4.997076023391814e-06,
"loss": 1.2393,
"step": 45
},
{
"epoch": 0.11295273173726213,
"grad_norm": 2.421875,
"learning_rate": 4.99648711943794e-06,
"loss": 1.318,
"step": 46
},
{
"epoch": 0.11540822590546347,
"grad_norm": 2.640625,
"learning_rate": 4.995896834701055e-06,
"loss": 1.302,
"step": 47
},
{
"epoch": 0.11786372007366483,
"grad_norm": 2.09375,
"learning_rate": 4.995305164319249e-06,
"loss": 1.3988,
"step": 48
},
{
"epoch": 0.12031921424186617,
"grad_norm": 2.125,
"learning_rate": 4.994712103407756e-06,
"loss": 1.3306,
"step": 49
},
{
"epoch": 0.12277470841006753,
"grad_norm": 2.109375,
"learning_rate": 4.994117647058823e-06,
"loss": 1.4521,
"step": 50
},
{
"epoch": 0.1252302025782689,
"grad_norm": 2.046875,
"learning_rate": 4.993521790341579e-06,
"loss": 1.2632,
"step": 51
},
{
"epoch": 0.12768569674647023,
"grad_norm": 1.8125,
"learning_rate": 4.992924528301888e-06,
"loss": 1.3189,
"step": 52
},
{
"epoch": 0.13014119091467158,
"grad_norm": 1.9296875,
"learning_rate": 4.99232585596222e-06,
"loss": 1.2612,
"step": 53
},
{
"epoch": 0.13259668508287292,
"grad_norm": 1.921875,
"learning_rate": 4.991725768321514e-06,
"loss": 1.4976,
"step": 54
},
{
"epoch": 0.13505217925107427,
"grad_norm": 1.8359375,
"learning_rate": 4.9911242603550295e-06,
"loss": 1.3946,
"step": 55
},
{
"epoch": 0.13750767341927564,
"grad_norm": 1.9375,
"learning_rate": 4.990521327014218e-06,
"loss": 1.517,
"step": 56
},
{
"epoch": 0.13996316758747698,
"grad_norm": 1.71875,
"learning_rate": 4.989916963226572e-06,
"loss": 1.3417,
"step": 57
},
{
"epoch": 0.14241866175567833,
"grad_norm": 1.671875,
"learning_rate": 4.989311163895487e-06,
"loss": 1.5581,
"step": 58
},
{
"epoch": 0.14487415592387967,
"grad_norm": 2.25,
"learning_rate": 4.988703923900119e-06,
"loss": 1.4299,
"step": 59
},
{
"epoch": 0.14732965009208104,
"grad_norm": 1.6484375,
"learning_rate": 4.988095238095239e-06,
"loss": 1.3143,
"step": 60
},
{
"epoch": 0.1497851442602824,
"grad_norm": 1.453125,
"learning_rate": 4.987485101311085e-06,
"loss": 1.3981,
"step": 61
},
{
"epoch": 0.15224063842848373,
"grad_norm": 1.4140625,
"learning_rate": 4.986873508353222e-06,
"loss": 1.2811,
"step": 62
},
{
"epoch": 0.15469613259668508,
"grad_norm": 1.421875,
"learning_rate": 4.98626045400239e-06,
"loss": 1.3865,
"step": 63
},
{
"epoch": 0.15715162676488642,
"grad_norm": 1.4453125,
"learning_rate": 4.9856459330143545e-06,
"loss": 1.3941,
"step": 64
},
{
"epoch": 0.1596071209330878,
"grad_norm": 1.3125,
"learning_rate": 4.985029940119761e-06,
"loss": 1.4471,
"step": 65
},
{
"epoch": 0.16206261510128914,
"grad_norm": 1.3203125,
"learning_rate": 4.984412470023982e-06,
"loss": 1.1329,
"step": 66
},
{
"epoch": 0.16451810926949048,
"grad_norm": 1.375,
"learning_rate": 4.9837935174069635e-06,
"loss": 1.3922,
"step": 67
},
{
"epoch": 0.16697360343769183,
"grad_norm": 1.1875,
"learning_rate": 4.983173076923077e-06,
"loss": 1.2263,
"step": 68
},
{
"epoch": 0.1694290976058932,
"grad_norm": 1.140625,
"learning_rate": 4.982551143200963e-06,
"loss": 1.2131,
"step": 69
},
{
"epoch": 0.17188459177409454,
"grad_norm": 1.234375,
"learning_rate": 4.981927710843374e-06,
"loss": 1.368,
"step": 70
},
{
"epoch": 0.17434008594229589,
"grad_norm": 1.265625,
"learning_rate": 4.981302774427021e-06,
"loss": 1.3914,
"step": 71
},
{
"epoch": 0.17679558011049723,
"grad_norm": 1.265625,
"learning_rate": 4.980676328502416e-06,
"loss": 1.294,
"step": 72
},
{
"epoch": 0.17925107427869857,
"grad_norm": 1.1640625,
"learning_rate": 4.980048367593713e-06,
"loss": 1.2289,
"step": 73
},
{
"epoch": 0.18170656844689995,
"grad_norm": 1.0859375,
"learning_rate": 4.979418886198547e-06,
"loss": 1.26,
"step": 74
},
{
"epoch": 0.1841620626151013,
"grad_norm": 1.1171875,
"learning_rate": 4.9787878787878795e-06,
"loss": 1.4256,
"step": 75
},
{
"epoch": 0.18661755678330263,
"grad_norm": 1.140625,
"learning_rate": 4.9781553398058256e-06,
"loss": 1.3011,
"step": 76
},
{
"epoch": 0.18907305095150398,
"grad_norm": 1.140625,
"learning_rate": 4.9775212636695016e-06,
"loss": 1.3333,
"step": 77
},
{
"epoch": 0.19152854511970535,
"grad_norm": 1.1328125,
"learning_rate": 4.976885644768857e-06,
"loss": 1.3712,
"step": 78
},
{
"epoch": 0.1939840392879067,
"grad_norm": 1.0390625,
"learning_rate": 4.9762484774665045e-06,
"loss": 1.2534,
"step": 79
},
{
"epoch": 0.19643953345610804,
"grad_norm": 1.1171875,
"learning_rate": 4.975609756097562e-06,
"loss": 1.4554,
"step": 80
},
{
"epoch": 0.19889502762430938,
"grad_norm": 1.0703125,
"learning_rate": 4.974969474969476e-06,
"loss": 1.2135,
"step": 81
},
{
"epoch": 0.20135052179251076,
"grad_norm": 0.9765625,
"learning_rate": 4.974327628361858e-06,
"loss": 1.206,
"step": 82
},
{
"epoch": 0.2038060159607121,
"grad_norm": 0.96875,
"learning_rate": 4.973684210526317e-06,
"loss": 1.2847,
"step": 83
},
{
"epoch": 0.20626151012891344,
"grad_norm": 1.0234375,
"learning_rate": 4.973039215686275e-06,
"loss": 1.2353,
"step": 84
},
{
"epoch": 0.2087170042971148,
"grad_norm": 0.92578125,
"learning_rate": 4.97239263803681e-06,
"loss": 1.2502,
"step": 85
},
{
"epoch": 0.21117249846531613,
"grad_norm": 1.1171875,
"learning_rate": 4.971744471744472e-06,
"loss": 1.3201,
"step": 86
},
{
"epoch": 0.2136279926335175,
"grad_norm": 1.1328125,
"learning_rate": 4.97109471094711e-06,
"loss": 1.2823,
"step": 87
},
{
"epoch": 0.21608348680171885,
"grad_norm": 1.0234375,
"learning_rate": 4.970443349753695e-06,
"loss": 1.2085,
"step": 88
},
{
"epoch": 0.2185389809699202,
"grad_norm": 0.97265625,
"learning_rate": 4.969790382244143e-06,
"loss": 1.2188,
"step": 89
},
{
"epoch": 0.22099447513812154,
"grad_norm": 0.94921875,
"learning_rate": 4.9691358024691365e-06,
"loss": 1.1417,
"step": 90
},
{
"epoch": 0.2234499693063229,
"grad_norm": 0.921875,
"learning_rate": 4.968479604449938e-06,
"loss": 1.1576,
"step": 91
},
{
"epoch": 0.22590546347452425,
"grad_norm": 0.9375,
"learning_rate": 4.967821782178218e-06,
"loss": 1.4177,
"step": 92
},
{
"epoch": 0.2283609576427256,
"grad_norm": 1.015625,
"learning_rate": 4.967162329615861e-06,
"loss": 1.3878,
"step": 93
},
{
"epoch": 0.23081645181092694,
"grad_norm": 0.93359375,
"learning_rate": 4.966501240694789e-06,
"loss": 1.3376,
"step": 94
},
{
"epoch": 0.2332719459791283,
"grad_norm": 0.87109375,
"learning_rate": 4.965838509316771e-06,
"loss": 1.2629,
"step": 95
},
{
"epoch": 0.23572744014732966,
"grad_norm": 0.96875,
"learning_rate": 4.965174129353235e-06,
"loss": 1.5579,
"step": 96
},
{
"epoch": 0.238182934315531,
"grad_norm": 0.921875,
"learning_rate": 4.964508094645081e-06,
"loss": 1.4673,
"step": 97
},
{
"epoch": 0.24063842848373235,
"grad_norm": 0.91796875,
"learning_rate": 4.963840399002494e-06,
"loss": 1.1368,
"step": 98
},
{
"epoch": 0.2430939226519337,
"grad_norm": 0.97265625,
"learning_rate": 4.9631710362047445e-06,
"loss": 1.2686,
"step": 99
},
{
"epoch": 0.24554941682013506,
"grad_norm": 0.9453125,
"learning_rate": 4.9625e-06,
"loss": 1.3491,
"step": 100
},
{
"epoch": 0.2480049109883364,
"grad_norm": 0.921875,
"learning_rate": 4.961827284105132e-06,
"loss": 1.3463,
"step": 101
},
{
"epoch": 0.2504604051565378,
"grad_norm": 0.90234375,
"learning_rate": 4.961152882205514e-06,
"loss": 1.3989,
"step": 102
},
{
"epoch": 0.2529158993247391,
"grad_norm": 0.84375,
"learning_rate": 4.960476787954831e-06,
"loss": 1.3649,
"step": 103
},
{
"epoch": 0.25537139349294047,
"grad_norm": 0.859375,
"learning_rate": 4.959798994974875e-06,
"loss": 1.3352,
"step": 104
},
{
"epoch": 0.2578268876611418,
"grad_norm": 0.90625,
"learning_rate": 4.959119496855347e-06,
"loss": 1.2536,
"step": 105
},
{
"epoch": 0.26028238182934316,
"grad_norm": 0.984375,
"learning_rate": 4.958438287153652e-06,
"loss": 1.2375,
"step": 106
},
{
"epoch": 0.2627378759975445,
"grad_norm": 0.83984375,
"learning_rate": 4.9577553593947035e-06,
"loss": 1.3203,
"step": 107
},
{
"epoch": 0.26519337016574585,
"grad_norm": 1.078125,
"learning_rate": 4.957070707070708e-06,
"loss": 1.2825,
"step": 108
},
{
"epoch": 0.2676488643339472,
"grad_norm": 0.859375,
"learning_rate": 4.9563843236409605e-06,
"loss": 1.4075,
"step": 109
},
{
"epoch": 0.27010435850214853,
"grad_norm": 0.85546875,
"learning_rate": 4.955696202531646e-06,
"loss": 1.2861,
"step": 110
},
{
"epoch": 0.27255985267034993,
"grad_norm": 0.82421875,
"learning_rate": 4.955006337135615e-06,
"loss": 1.3195,
"step": 111
},
{
"epoch": 0.2750153468385513,
"grad_norm": 0.8828125,
"learning_rate": 4.954314720812184e-06,
"loss": 1.4529,
"step": 112
},
{
"epoch": 0.2774708410067526,
"grad_norm": 0.9453125,
"learning_rate": 4.9536213468869126e-06,
"loss": 1.448,
"step": 113
},
{
"epoch": 0.27992633517495397,
"grad_norm": 0.83203125,
"learning_rate": 4.9529262086513995e-06,
"loss": 1.281,
"step": 114
},
{
"epoch": 0.2823818293431553,
"grad_norm": 0.7734375,
"learning_rate": 4.952229299363058e-06,
"loss": 1.2564,
"step": 115
},
{
"epoch": 0.28483732351135665,
"grad_norm": 0.8828125,
"learning_rate": 4.951530612244899e-06,
"loss": 1.2746,
"step": 116
},
{
"epoch": 0.287292817679558,
"grad_norm": 0.90234375,
"learning_rate": 4.950830140485312e-06,
"loss": 1.2465,
"step": 117
},
{
"epoch": 0.28974831184775934,
"grad_norm": 1.109375,
"learning_rate": 4.950127877237852e-06,
"loss": 1.2232,
"step": 118
},
{
"epoch": 0.2922038060159607,
"grad_norm": 0.8515625,
"learning_rate": 4.949423815621e-06,
"loss": 1.2338,
"step": 119
},
{
"epoch": 0.2946593001841621,
"grad_norm": 0.890625,
"learning_rate": 4.9487179487179486e-06,
"loss": 1.3921,
"step": 120
},
{
"epoch": 0.29711479435236343,
"grad_norm": 0.80078125,
"learning_rate": 4.94801026957638e-06,
"loss": 1.2641,
"step": 121
},
{
"epoch": 0.2995702885205648,
"grad_norm": 0.79296875,
"learning_rate": 4.9473007712082265e-06,
"loss": 1.2663,
"step": 122
},
{
"epoch": 0.3020257826887661,
"grad_norm": 0.8359375,
"learning_rate": 4.946589446589447e-06,
"loss": 1.365,
"step": 123
},
{
"epoch": 0.30448127685696746,
"grad_norm": 0.86328125,
"learning_rate": 4.945876288659794e-06,
"loss": 1.2591,
"step": 124
},
{
"epoch": 0.3069367710251688,
"grad_norm": 0.8046875,
"learning_rate": 4.945161290322581e-06,
"loss": 1.2908,
"step": 125
},
{
"epoch": 0.30939226519337015,
"grad_norm": 0.8125,
"learning_rate": 4.944444444444445e-06,
"loss": 1.0606,
"step": 126
},
{
"epoch": 0.3118477593615715,
"grad_norm": 0.7890625,
"learning_rate": 4.943725743855111e-06,
"loss": 1.2429,
"step": 127
},
{
"epoch": 0.31430325352977284,
"grad_norm": 0.93359375,
"learning_rate": 4.9430051813471505e-06,
"loss": 1.1361,
"step": 128
},
{
"epoch": 0.31675874769797424,
"grad_norm": 0.8046875,
"learning_rate": 4.942282749675746e-06,
"loss": 1.3881,
"step": 129
},
{
"epoch": 0.3192142418661756,
"grad_norm": 0.77734375,
"learning_rate": 4.941558441558442e-06,
"loss": 1.4425,
"step": 130
},
{
"epoch": 0.32166973603437693,
"grad_norm": 0.84375,
"learning_rate": 4.940832249674903e-06,
"loss": 1.4025,
"step": 131
},
{
"epoch": 0.3241252302025783,
"grad_norm": 0.8359375,
"learning_rate": 4.940104166666667e-06,
"loss": 1.3005,
"step": 132
},
{
"epoch": 0.3265807243707796,
"grad_norm": 0.82421875,
"learning_rate": 4.939374185136898e-06,
"loss": 1.2197,
"step": 133
},
{
"epoch": 0.32903621853898096,
"grad_norm": 0.921875,
"learning_rate": 4.938642297650132e-06,
"loss": 1.2671,
"step": 134
},
{
"epoch": 0.3314917127071823,
"grad_norm": 0.80078125,
"learning_rate": 4.937908496732027e-06,
"loss": 1.4586,
"step": 135
},
{
"epoch": 0.33394720687538365,
"grad_norm": 0.77734375,
"learning_rate": 4.9371727748691105e-06,
"loss": 1.411,
"step": 136
},
{
"epoch": 0.336402701043585,
"grad_norm": 0.8359375,
"learning_rate": 4.936435124508519e-06,
"loss": 1.2816,
"step": 137
},
{
"epoch": 0.3388581952117864,
"grad_norm": 1.0625,
"learning_rate": 4.935695538057743e-06,
"loss": 1.387,
"step": 138
},
{
"epoch": 0.34131368937998774,
"grad_norm": 0.8125,
"learning_rate": 4.9349540078843624e-06,
"loss": 1.4591,
"step": 139
},
{
"epoch": 0.3437691835481891,
"grad_norm": 0.76953125,
"learning_rate": 4.93421052631579e-06,
"loss": 1.161,
"step": 140
},
{
"epoch": 0.3462246777163904,
"grad_norm": 0.78515625,
"learning_rate": 4.933465085639e-06,
"loss": 1.2872,
"step": 141
},
{
"epoch": 0.34868017188459177,
"grad_norm": 0.73046875,
"learning_rate": 4.932717678100264e-06,
"loss": 1.3039,
"step": 142
},
{
"epoch": 0.3511356660527931,
"grad_norm": 0.7734375,
"learning_rate": 4.931968295904888e-06,
"loss": 1.4027,
"step": 143
},
{
"epoch": 0.35359116022099446,
"grad_norm": 0.73828125,
"learning_rate": 4.931216931216932e-06,
"loss": 1.2858,
"step": 144
},
{
"epoch": 0.3560466543891958,
"grad_norm": 0.76171875,
"learning_rate": 4.93046357615894e-06,
"loss": 1.2619,
"step": 145
},
{
"epoch": 0.35850214855739715,
"grad_norm": 0.78515625,
"learning_rate": 4.929708222811671e-06,
"loss": 1.2746,
"step": 146
},
{
"epoch": 0.36095764272559855,
"grad_norm": 0.8125,
"learning_rate": 4.928950863213812e-06,
"loss": 1.3008,
"step": 147
},
{
"epoch": 0.3634131368937999,
"grad_norm": 0.74609375,
"learning_rate": 4.928191489361703e-06,
"loss": 1.3898,
"step": 148
},
{
"epoch": 0.36586863106200124,
"grad_norm": 0.734375,
"learning_rate": 4.927430093209056e-06,
"loss": 1.3964,
"step": 149
},
{
"epoch": 0.3683241252302026,
"grad_norm": 0.7734375,
"learning_rate": 4.926666666666667e-06,
"loss": 1.2544,
"step": 150
},
{
"epoch": 0.3707796193984039,
"grad_norm": 0.7734375,
"learning_rate": 4.925901201602136e-06,
"loss": 1.3839,
"step": 151
},
{
"epoch": 0.37323511356660527,
"grad_norm": 0.77734375,
"learning_rate": 4.9251336898395725e-06,
"loss": 1.3611,
"step": 152
},
{
"epoch": 0.3756906077348066,
"grad_norm": 0.765625,
"learning_rate": 4.924364123159303e-06,
"loss": 1.3025,
"step": 153
},
{
"epoch": 0.37814610190300796,
"grad_norm": 0.69140625,
"learning_rate": 4.923592493297587e-06,
"loss": 1.2579,
"step": 154
},
{
"epoch": 0.38060159607120936,
"grad_norm": 0.69921875,
"learning_rate": 4.922818791946309e-06,
"loss": 1.3892,
"step": 155
},
{
"epoch": 0.3830570902394107,
"grad_norm": 0.78515625,
"learning_rate": 4.922043010752689e-06,
"loss": 1.2966,
"step": 156
},
{
"epoch": 0.38551258440761205,
"grad_norm": 0.671875,
"learning_rate": 4.921265141318978e-06,
"loss": 1.2942,
"step": 157
},
{
"epoch": 0.3879680785758134,
"grad_norm": 0.7265625,
"learning_rate": 4.920485175202157e-06,
"loss": 1.192,
"step": 158
},
{
"epoch": 0.39042357274401474,
"grad_norm": 0.71875,
"learning_rate": 4.9197031039136305e-06,
"loss": 1.129,
"step": 159
},
{
"epoch": 0.3928790669122161,
"grad_norm": 0.75390625,
"learning_rate": 4.918918918918919e-06,
"loss": 1.2928,
"step": 160
},
{
"epoch": 0.3953345610804174,
"grad_norm": 0.70703125,
"learning_rate": 4.918132611637348e-06,
"loss": 1.239,
"step": 161
},
{
"epoch": 0.39779005524861877,
"grad_norm": 0.7265625,
"learning_rate": 4.917344173441735e-06,
"loss": 1.1922,
"step": 162
},
{
"epoch": 0.4002455494168201,
"grad_norm": 0.72265625,
"learning_rate": 4.916553595658074e-06,
"loss": 1.19,
"step": 163
},
{
"epoch": 0.4027010435850215,
"grad_norm": 0.74609375,
"learning_rate": 4.915760869565218e-06,
"loss": 1.2543,
"step": 164
},
{
"epoch": 0.40515653775322286,
"grad_norm": 0.9140625,
"learning_rate": 4.914965986394558e-06,
"loss": 1.2686,
"step": 165
},
{
"epoch": 0.4076120319214242,
"grad_norm": 0.69921875,
"learning_rate": 4.9141689373297006e-06,
"loss": 1.2605,
"step": 166
},
{
"epoch": 0.41006752608962554,
"grad_norm": 0.76171875,
"learning_rate": 4.9133697135061394e-06,
"loss": 1.1579,
"step": 167
},
{
"epoch": 0.4125230202578269,
"grad_norm": 0.7578125,
"learning_rate": 4.9125683060109295e-06,
"loss": 1.3781,
"step": 168
},
{
"epoch": 0.41497851442602823,
"grad_norm": 0.7734375,
"learning_rate": 4.911764705882354e-06,
"loss": 1.4131,
"step": 169
},
{
"epoch": 0.4174340085942296,
"grad_norm": 0.92578125,
"learning_rate": 4.910958904109589e-06,
"loss": 1.3198,
"step": 170
},
{
"epoch": 0.4198895027624309,
"grad_norm": 0.8125,
"learning_rate": 4.9101508916323735e-06,
"loss": 1.4827,
"step": 171
},
{
"epoch": 0.42234499693063227,
"grad_norm": 0.71875,
"learning_rate": 4.90934065934066e-06,
"loss": 1.2649,
"step": 172
},
{
"epoch": 0.42480049109883367,
"grad_norm": 0.78125,
"learning_rate": 4.9085281980742785e-06,
"loss": 1.2458,
"step": 173
},
{
"epoch": 0.427255985267035,
"grad_norm": 0.67578125,
"learning_rate": 4.90771349862259e-06,
"loss": 1.0635,
"step": 174
},
{
"epoch": 0.42971147943523635,
"grad_norm": 0.72265625,
"learning_rate": 4.906896551724138e-06,
"loss": 1.3535,
"step": 175
},
{
"epoch": 0.4321669736034377,
"grad_norm": 0.671875,
"learning_rate": 4.906077348066298e-06,
"loss": 1.1425,
"step": 176
},
{
"epoch": 0.43462246777163904,
"grad_norm": 0.73828125,
"learning_rate": 4.905255878284925e-06,
"loss": 1.4151,
"step": 177
},
{
"epoch": 0.4370779619398404,
"grad_norm": 0.7578125,
"learning_rate": 4.90443213296399e-06,
"loss": 1.4322,
"step": 178
},
{
"epoch": 0.43953345610804173,
"grad_norm": 0.671875,
"learning_rate": 4.903606102635229e-06,
"loss": 1.2474,
"step": 179
},
{
"epoch": 0.4419889502762431,
"grad_norm": 0.73046875,
"learning_rate": 4.902777777777778e-06,
"loss": 1.2191,
"step": 180
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.73828125,
"learning_rate": 4.901947148817803e-06,
"loss": 1.32,
"step": 181
},
{
"epoch": 0.4468999386126458,
"grad_norm": 0.73828125,
"learning_rate": 4.901114206128134e-06,
"loss": 1.4193,
"step": 182
},
{
"epoch": 0.44935543278084716,
"grad_norm": 0.734375,
"learning_rate": 4.900278940027894e-06,
"loss": 1.363,
"step": 183
},
{
"epoch": 0.4518109269490485,
"grad_norm": 0.671875,
"learning_rate": 4.899441340782124e-06,
"loss": 1.2171,
"step": 184
},
{
"epoch": 0.45426642111724985,
"grad_norm": 0.69140625,
"learning_rate": 4.8986013986013995e-06,
"loss": 1.3648,
"step": 185
},
{
"epoch": 0.4567219152854512,
"grad_norm": 0.703125,
"learning_rate": 4.8977591036414575e-06,
"loss": 1.3496,
"step": 186
},
{
"epoch": 0.45917740945365254,
"grad_norm": 0.6796875,
"learning_rate": 4.896914446002806e-06,
"loss": 1.1913,
"step": 187
},
{
"epoch": 0.4616329036218539,
"grad_norm": 0.75,
"learning_rate": 4.896067415730337e-06,
"loss": 1.443,
"step": 188
},
{
"epoch": 0.46408839779005523,
"grad_norm": 0.72265625,
"learning_rate": 4.89521800281294e-06,
"loss": 1.3634,
"step": 189
},
{
"epoch": 0.4665438919582566,
"grad_norm": 0.7578125,
"learning_rate": 4.894366197183099e-06,
"loss": 1.3341,
"step": 190
},
{
"epoch": 0.468999386126458,
"grad_norm": 0.6796875,
"learning_rate": 4.893511988716502e-06,
"loss": 1.2351,
"step": 191
},
{
"epoch": 0.4714548802946593,
"grad_norm": 0.70703125,
"learning_rate": 4.892655367231639e-06,
"loss": 1.1749,
"step": 192
},
{
"epoch": 0.47391037446286066,
"grad_norm": 0.703125,
"learning_rate": 4.891796322489392e-06,
"loss": 1.3234,
"step": 193
},
{
"epoch": 0.476365868631062,
"grad_norm": 0.66796875,
"learning_rate": 4.890934844192635e-06,
"loss": 1.2423,
"step": 194
},
{
"epoch": 0.47882136279926335,
"grad_norm": 0.66796875,
"learning_rate": 4.890070921985816e-06,
"loss": 1.2254,
"step": 195
},
{
"epoch": 0.4812768569674647,
"grad_norm": 0.71875,
"learning_rate": 4.889204545454545e-06,
"loss": 1.1784,
"step": 196
},
{
"epoch": 0.48373235113566604,
"grad_norm": 0.69140625,
"learning_rate": 4.888335704125178e-06,
"loss": 1.2662,
"step": 197
},
{
"epoch": 0.4861878453038674,
"grad_norm": 0.73046875,
"learning_rate": 4.887464387464388e-06,
"loss": 1.298,
"step": 198
},
{
"epoch": 0.4886433394720687,
"grad_norm": 0.703125,
"learning_rate": 4.8865905848787455e-06,
"loss": 1.3095,
"step": 199
},
{
"epoch": 0.4910988336402701,
"grad_norm": 0.71484375,
"learning_rate": 4.8857142857142865e-06,
"loss": 1.5246,
"step": 200
},
{
"epoch": 0.49355432780847147,
"grad_norm": 0.69140625,
"learning_rate": 4.884835479256081e-06,
"loss": 1.3319,
"step": 201
},
{
"epoch": 0.4960098219766728,
"grad_norm": 0.77734375,
"learning_rate": 4.883954154727794e-06,
"loss": 1.4236,
"step": 202
},
{
"epoch": 0.49846531614487416,
"grad_norm": 0.6875,
"learning_rate": 4.8830703012912485e-06,
"loss": 1.1756,
"step": 203
},
{
"epoch": 0.5009208103130756,
"grad_norm": 0.73828125,
"learning_rate": 4.882183908045977e-06,
"loss": 1.3086,
"step": 204
},
{
"epoch": 0.5033763044812769,
"grad_norm": 0.6875,
"learning_rate": 4.881294964028777e-06,
"loss": 1.3479,
"step": 205
},
{
"epoch": 0.5058317986494782,
"grad_norm": 0.7265625,
"learning_rate": 4.880403458213257e-06,
"loss": 1.3573,
"step": 206
},
{
"epoch": 0.5082872928176796,
"grad_norm": 0.671875,
"learning_rate": 4.87950937950938e-06,
"loss": 1.2731,
"step": 207
},
{
"epoch": 0.5107427869858809,
"grad_norm": 0.6953125,
"learning_rate": 4.878612716763007e-06,
"loss": 1.3774,
"step": 208
},
{
"epoch": 0.5131982811540823,
"grad_norm": 0.65234375,
"learning_rate": 4.877713458755427e-06,
"loss": 1.1831,
"step": 209
},
{
"epoch": 0.5156537753222836,
"grad_norm": 0.64453125,
"learning_rate": 4.876811594202899e-06,
"loss": 1.2903,
"step": 210
},
{
"epoch": 0.518109269490485,
"grad_norm": 0.703125,
"learning_rate": 4.875907111756169e-06,
"loss": 1.2286,
"step": 211
},
{
"epoch": 0.5205647636586863,
"grad_norm": 0.71484375,
"learning_rate": 4.875e-06,
"loss": 1.3615,
"step": 212
},
{
"epoch": 0.5230202578268877,
"grad_norm": 0.640625,
"learning_rate": 4.874090247452693e-06,
"loss": 1.2775,
"step": 213
},
{
"epoch": 0.525475751995089,
"grad_norm": 0.71484375,
"learning_rate": 4.873177842565599e-06,
"loss": 1.2464,
"step": 214
},
{
"epoch": 0.5279312461632903,
"grad_norm": 0.703125,
"learning_rate": 4.872262773722629e-06,
"loss": 1.4034,
"step": 215
},
{
"epoch": 0.5303867403314917,
"grad_norm": 0.7109375,
"learning_rate": 4.871345029239766e-06,
"loss": 1.295,
"step": 216
},
{
"epoch": 0.532842234499693,
"grad_norm": 0.68359375,
"learning_rate": 4.870424597364568e-06,
"loss": 1.2426,
"step": 217
},
{
"epoch": 0.5352977286678944,
"grad_norm": 0.6328125,
"learning_rate": 4.86950146627566e-06,
"loss": 1.2622,
"step": 218
},
{
"epoch": 0.5377532228360957,
"grad_norm": 0.671875,
"learning_rate": 4.868575624082232e-06,
"loss": 1.3219,
"step": 219
},
{
"epoch": 0.5402087170042971,
"grad_norm": 0.71875,
"learning_rate": 4.8676470588235295e-06,
"loss": 1.5289,
"step": 220
},
{
"epoch": 0.5426642111724984,
"grad_norm": 0.70703125,
"learning_rate": 4.8667157584683365e-06,
"loss": 1.2633,
"step": 221
},
{
"epoch": 0.5451197053406999,
"grad_norm": 0.70703125,
"learning_rate": 4.865781710914455e-06,
"loss": 1.2798,
"step": 222
},
{
"epoch": 0.5475751995089012,
"grad_norm": 0.6640625,
"learning_rate": 4.864844903988184e-06,
"loss": 1.2477,
"step": 223
},
{
"epoch": 0.5500306936771026,
"grad_norm": 0.62890625,
"learning_rate": 4.8639053254437875e-06,
"loss": 1.1598,
"step": 224
},
{
"epoch": 0.5524861878453039,
"grad_norm": 0.69921875,
"learning_rate": 4.862962962962963e-06,
"loss": 1.2752,
"step": 225
},
{
"epoch": 0.5549416820135052,
"grad_norm": 1.1640625,
"learning_rate": 4.862017804154303e-06,
"loss": 1.3412,
"step": 226
},
{
"epoch": 0.5573971761817066,
"grad_norm": 0.65625,
"learning_rate": 4.861069836552749e-06,
"loss": 1.2334,
"step": 227
},
{
"epoch": 0.5598526703499079,
"grad_norm": 0.91015625,
"learning_rate": 4.860119047619048e-06,
"loss": 1.3265,
"step": 228
},
{
"epoch": 0.5623081645181093,
"grad_norm": 0.828125,
"learning_rate": 4.859165424739195e-06,
"loss": 1.3923,
"step": 229
},
{
"epoch": 0.5647636586863106,
"grad_norm": 0.70703125,
"learning_rate": 4.858208955223881e-06,
"loss": 1.3351,
"step": 230
},
{
"epoch": 0.567219152854512,
"grad_norm": 0.62890625,
"learning_rate": 4.857249626307923e-06,
"loss": 1.187,
"step": 231
},
{
"epoch": 0.5696746470227133,
"grad_norm": 0.65625,
"learning_rate": 4.856287425149701e-06,
"loss": 1.2872,
"step": 232
},
{
"epoch": 0.5721301411909147,
"grad_norm": 0.68359375,
"learning_rate": 4.855322338830585e-06,
"loss": 1.3065,
"step": 233
},
{
"epoch": 0.574585635359116,
"grad_norm": 0.72265625,
"learning_rate": 4.854354354354354e-06,
"loss": 1.1766,
"step": 234
},
{
"epoch": 0.5770411295273173,
"grad_norm": 0.8125,
"learning_rate": 4.853383458646616e-06,
"loss": 1.2015,
"step": 235
},
{
"epoch": 0.5794966236955187,
"grad_norm": 0.69140625,
"learning_rate": 4.852409638554218e-06,
"loss": 1.3997,
"step": 236
},
{
"epoch": 0.58195211786372,
"grad_norm": 0.76953125,
"learning_rate": 4.851432880844646e-06,
"loss": 1.4776,
"step": 237
},
{
"epoch": 0.5844076120319214,
"grad_norm": 0.6953125,
"learning_rate": 4.850453172205439e-06,
"loss": 1.4555,
"step": 238
},
{
"epoch": 0.5868631062001227,
"grad_norm": 0.6328125,
"learning_rate": 4.849470499243571e-06,
"loss": 1.1934,
"step": 239
},
{
"epoch": 0.5893186003683242,
"grad_norm": 0.67578125,
"learning_rate": 4.848484848484849e-06,
"loss": 1.2544,
"step": 240
},
{
"epoch": 0.5917740945365255,
"grad_norm": 0.66015625,
"learning_rate": 4.847496206373293e-06,
"loss": 1.2578,
"step": 241
},
{
"epoch": 0.5942295887047269,
"grad_norm": 0.6796875,
"learning_rate": 4.846504559270517e-06,
"loss": 1.3459,
"step": 242
},
{
"epoch": 0.5966850828729282,
"grad_norm": 0.8046875,
"learning_rate": 4.8455098934551e-06,
"loss": 1.2658,
"step": 243
},
{
"epoch": 0.5991405770411296,
"grad_norm": 0.64453125,
"learning_rate": 4.844512195121952e-06,
"loss": 1.2501,
"step": 244
},
{
"epoch": 0.6015960712093309,
"grad_norm": 0.64453125,
"learning_rate": 4.84351145038168e-06,
"loss": 1.3957,
"step": 245
},
{
"epoch": 0.6040515653775322,
"grad_norm": 0.80078125,
"learning_rate": 4.8425076452599395e-06,
"loss": 1.4144,
"step": 246
},
{
"epoch": 0.6065070595457336,
"grad_norm": 0.671875,
"learning_rate": 4.841500765696784e-06,
"loss": 1.3007,
"step": 247
},
{
"epoch": 0.6089625537139349,
"grad_norm": 0.69921875,
"learning_rate": 4.840490797546013e-06,
"loss": 1.171,
"step": 248
},
{
"epoch": 0.6114180478821363,
"grad_norm": 0.70703125,
"learning_rate": 4.839477726574502e-06,
"loss": 1.4907,
"step": 249
},
{
"epoch": 0.6138735420503376,
"grad_norm": 0.62890625,
"learning_rate": 4.8384615384615385e-06,
"loss": 1.2608,
"step": 250
},
{
"epoch": 0.616329036218539,
"grad_norm": 0.6484375,
"learning_rate": 4.837442218798152e-06,
"loss": 1.2279,
"step": 251
},
{
"epoch": 0.6187845303867403,
"grad_norm": 0.640625,
"learning_rate": 4.83641975308642e-06,
"loss": 1.1505,
"step": 252
},
{
"epoch": 0.6212400245549416,
"grad_norm": 0.6640625,
"learning_rate": 4.835394126738795e-06,
"loss": 1.5144,
"step": 253
},
{
"epoch": 0.623695518723143,
"grad_norm": 0.67578125,
"learning_rate": 4.8343653250773995e-06,
"loss": 1.3644,
"step": 254
},
{
"epoch": 0.6261510128913443,
"grad_norm": 0.63671875,
"learning_rate": 4.833333333333333e-06,
"loss": 1.2564,
"step": 255
},
{
"epoch": 0.6286065070595457,
"grad_norm": 0.62109375,
"learning_rate": 4.832298136645963e-06,
"loss": 1.2714,
"step": 256
},
{
"epoch": 0.6310620012277471,
"grad_norm": 0.63671875,
"learning_rate": 4.831259720062209e-06,
"loss": 1.2751,
"step": 257
},
{
"epoch": 0.6335174953959485,
"grad_norm": 0.6640625,
"learning_rate": 4.830218068535826e-06,
"loss": 1.3451,
"step": 258
},
{
"epoch": 0.6359729895641498,
"grad_norm": 0.68359375,
"learning_rate": 4.8291731669266775e-06,
"loss": 1.2426,
"step": 259
},
{
"epoch": 0.6384284837323512,
"grad_norm": 0.65234375,
"learning_rate": 4.8281250000000005e-06,
"loss": 1.2962,
"step": 260
},
{
"epoch": 0.6408839779005525,
"grad_norm": 0.86328125,
"learning_rate": 4.827073552425666e-06,
"loss": 1.2006,
"step": 261
},
{
"epoch": 0.6433394720687539,
"grad_norm": 0.6640625,
"learning_rate": 4.826018808777429e-06,
"loss": 1.345,
"step": 262
},
{
"epoch": 0.6457949662369552,
"grad_norm": 0.72265625,
"learning_rate": 4.824960753532182e-06,
"loss": 1.2243,
"step": 263
},
{
"epoch": 0.6482504604051565,
"grad_norm": 0.6484375,
"learning_rate": 4.823899371069182e-06,
"loss": 1.447,
"step": 264
},
{
"epoch": 0.6507059545733579,
"grad_norm": 0.65625,
"learning_rate": 4.822834645669292e-06,
"loss": 1.2785,
"step": 265
},
{
"epoch": 0.6531614487415592,
"grad_norm": 0.625,
"learning_rate": 4.821766561514196e-06,
"loss": 1.1441,
"step": 266
},
{
"epoch": 0.6556169429097606,
"grad_norm": 0.6328125,
"learning_rate": 4.8206951026856246e-06,
"loss": 1.248,
"step": 267
},
{
"epoch": 0.6580724370779619,
"grad_norm": 0.671875,
"learning_rate": 4.819620253164557e-06,
"loss": 1.4042,
"step": 268
},
{
"epoch": 0.6605279312461633,
"grad_norm": 0.6484375,
"learning_rate": 4.818541996830428e-06,
"loss": 1.1198,
"step": 269
},
{
"epoch": 0.6629834254143646,
"grad_norm": 0.68359375,
"learning_rate": 4.8174603174603175e-06,
"loss": 1.3123,
"step": 270
},
{
"epoch": 0.665438919582566,
"grad_norm": 0.66796875,
"learning_rate": 4.81637519872814e-06,
"loss": 1.2525,
"step": 271
},
{
"epoch": 0.6678944137507673,
"grad_norm": 0.61328125,
"learning_rate": 4.815286624203822e-06,
"loss": 1.2531,
"step": 272
},
{
"epoch": 0.6703499079189686,
"grad_norm": 0.6328125,
"learning_rate": 4.814194577352473e-06,
"loss": 1.2536,
"step": 273
},
{
"epoch": 0.67280540208717,
"grad_norm": 0.6328125,
"learning_rate": 4.813099041533547e-06,
"loss": 1.3437,
"step": 274
},
{
"epoch": 0.6752608962553714,
"grad_norm": 0.765625,
"learning_rate": 4.812e-06,
"loss": 1.2383,
"step": 275
},
{
"epoch": 0.6777163904235728,
"grad_norm": 0.61328125,
"learning_rate": 4.8108974358974366e-06,
"loss": 1.1858,
"step": 276
},
{
"epoch": 0.6801718845917741,
"grad_norm": 0.6328125,
"learning_rate": 4.809791332263242e-06,
"loss": 1.3243,
"step": 277
},
{
"epoch": 0.6826273787599755,
"grad_norm": 0.671875,
"learning_rate": 4.808681672025724e-06,
"loss": 1.1714,
"step": 278
},
{
"epoch": 0.6850828729281768,
"grad_norm": 0.65234375,
"learning_rate": 4.807568438003221e-06,
"loss": 1.484,
"step": 279
},
{
"epoch": 0.6875383670963782,
"grad_norm": 0.65625,
"learning_rate": 4.806451612903227e-06,
"loss": 1.2078,
"step": 280
},
{
"epoch": 0.6899938612645795,
"grad_norm": 0.6328125,
"learning_rate": 4.805331179321487e-06,
"loss": 1.1594,
"step": 281
},
{
"epoch": 0.6924493554327809,
"grad_norm": 0.765625,
"learning_rate": 4.8042071197411e-06,
"loss": 1.3314,
"step": 282
},
{
"epoch": 0.6949048496009822,
"grad_norm": 0.65625,
"learning_rate": 4.803079416531605e-06,
"loss": 1.4602,
"step": 283
},
{
"epoch": 0.6973603437691835,
"grad_norm": 0.6484375,
"learning_rate": 4.801948051948052e-06,
"loss": 1.1564,
"step": 284
},
{
"epoch": 0.6998158379373849,
"grad_norm": 1.0234375,
"learning_rate": 4.800813008130081e-06,
"loss": 1.1491,
"step": 285
},
{
"epoch": 0.7022713321055862,
"grad_norm": 0.66796875,
"learning_rate": 4.799674267100978e-06,
"loss": 1.3547,
"step": 286
},
{
"epoch": 0.7047268262737876,
"grad_norm": 0.609375,
"learning_rate": 4.798531810766721e-06,
"loss": 1.2468,
"step": 287
},
{
"epoch": 0.7071823204419889,
"grad_norm": 0.64453125,
"learning_rate": 4.7973856209150335e-06,
"loss": 1.373,
"step": 288
},
{
"epoch": 0.7096378146101903,
"grad_norm": 0.63671875,
"learning_rate": 4.796235679214403e-06,
"loss": 1.1777,
"step": 289
},
{
"epoch": 0.7120933087783916,
"grad_norm": 0.63671875,
"learning_rate": 4.7950819672131156e-06,
"loss": 1.1952,
"step": 290
},
{
"epoch": 0.714548802946593,
"grad_norm": 0.66015625,
"learning_rate": 4.79392446633826e-06,
"loss": 1.1449,
"step": 291
},
{
"epoch": 0.7170042971147943,
"grad_norm": 0.6171875,
"learning_rate": 4.792763157894737e-06,
"loss": 1.4823,
"step": 292
},
{
"epoch": 0.7194597912829958,
"grad_norm": 0.64453125,
"learning_rate": 4.791598023064251e-06,
"loss": 1.3252,
"step": 293
},
{
"epoch": 0.7219152854511971,
"grad_norm": 0.62109375,
"learning_rate": 4.790429042904291e-06,
"loss": 1.1599,
"step": 294
},
{
"epoch": 0.7243707796193984,
"grad_norm": 0.66015625,
"learning_rate": 4.789256198347108e-06,
"loss": 1.3266,
"step": 295
},
{
"epoch": 0.7268262737875998,
"grad_norm": 0.6484375,
"learning_rate": 4.788079470198676e-06,
"loss": 1.2402,
"step": 296
},
{
"epoch": 0.7292817679558011,
"grad_norm": 0.63671875,
"learning_rate": 4.786898839137645e-06,
"loss": 1.2549,
"step": 297
},
{
"epoch": 0.7317372621240025,
"grad_norm": 0.640625,
"learning_rate": 4.785714285714287e-06,
"loss": 1.3053,
"step": 298
},
{
"epoch": 0.7341927562922038,
"grad_norm": 0.64453125,
"learning_rate": 4.784525790349418e-06,
"loss": 1.2875,
"step": 299
},
{
"epoch": 0.7366482504604052,
"grad_norm": 0.66796875,
"learning_rate": 4.783333333333334e-06,
"loss": 1.3651,
"step": 300
},
{
"epoch": 0.7391037446286065,
"grad_norm": 0.6171875,
"learning_rate": 4.782136894824708e-06,
"loss": 1.2945,
"step": 301
},
{
"epoch": 0.7415592387968079,
"grad_norm": 0.6640625,
"learning_rate": 4.780936454849499e-06,
"loss": 1.2634,
"step": 302
},
{
"epoch": 0.7440147329650092,
"grad_norm": 0.640625,
"learning_rate": 4.779731993299833e-06,
"loss": 1.2937,
"step": 303
},
{
"epoch": 0.7464702271332105,
"grad_norm": 0.625,
"learning_rate": 4.7785234899328866e-06,
"loss": 1.2601,
"step": 304
},
{
"epoch": 0.7489257213014119,
"grad_norm": 0.8359375,
"learning_rate": 4.777310924369749e-06,
"loss": 1.2282,
"step": 305
},
{
"epoch": 0.7513812154696132,
"grad_norm": 0.61328125,
"learning_rate": 4.776094276094276e-06,
"loss": 1.3313,
"step": 306
},
{
"epoch": 0.7538367096378146,
"grad_norm": 0.609375,
"learning_rate": 4.774873524451939e-06,
"loss": 1.1972,
"step": 307
},
{
"epoch": 0.7562922038060159,
"grad_norm": 0.64453125,
"learning_rate": 4.773648648648649e-06,
"loss": 1.1661,
"step": 308
},
{
"epoch": 0.7587476979742173,
"grad_norm": 0.6015625,
"learning_rate": 4.772419627749577e-06,
"loss": 1.1749,
"step": 309
},
{
"epoch": 0.7612031921424187,
"grad_norm": 0.6328125,
"learning_rate": 4.771186440677967e-06,
"loss": 1.3634,
"step": 310
},
{
"epoch": 0.7636586863106201,
"grad_norm": 0.62109375,
"learning_rate": 4.769949066213922e-06,
"loss": 1.2586,
"step": 311
},
{
"epoch": 0.7661141804788214,
"grad_norm": 0.58984375,
"learning_rate": 4.768707482993198e-06,
"loss": 1.0948,
"step": 312
},
{
"epoch": 0.7685696746470227,
"grad_norm": 0.72265625,
"learning_rate": 4.767461669505963e-06,
"loss": 1.2056,
"step": 313
},
{
"epoch": 0.7710251688152241,
"grad_norm": 0.7421875,
"learning_rate": 4.7662116040955635e-06,
"loss": 1.2017,
"step": 314
},
{
"epoch": 0.7734806629834254,
"grad_norm": 0.60546875,
"learning_rate": 4.764957264957265e-06,
"loss": 1.2027,
"step": 315
},
{
"epoch": 0.7759361571516268,
"grad_norm": 0.62109375,
"learning_rate": 4.7636986301369865e-06,
"loss": 1.3685,
"step": 316
},
{
"epoch": 0.7783916513198281,
"grad_norm": 0.66796875,
"learning_rate": 4.762435677530018e-06,
"loss": 1.2291,
"step": 317
},
{
"epoch": 0.7808471454880295,
"grad_norm": 0.77734375,
"learning_rate": 4.761168384879725e-06,
"loss": 1.335,
"step": 318
},
{
"epoch": 0.7833026396562308,
"grad_norm": 0.59375,
"learning_rate": 4.759896729776248e-06,
"loss": 1.1458,
"step": 319
},
{
"epoch": 0.7857581338244322,
"grad_norm": 0.6328125,
"learning_rate": 4.758620689655173e-06,
"loss": 1.3041,
"step": 320
},
{
"epoch": 0.7882136279926335,
"grad_norm": 0.85546875,
"learning_rate": 4.757340241796201e-06,
"loss": 1.2781,
"step": 321
},
{
"epoch": 0.7906691221608348,
"grad_norm": 0.6640625,
"learning_rate": 4.7560553633218e-06,
"loss": 1.425,
"step": 322
},
{
"epoch": 0.7931246163290362,
"grad_norm": 0.671875,
"learning_rate": 4.754766031195841e-06,
"loss": 1.2964,
"step": 323
},
{
"epoch": 0.7955801104972375,
"grad_norm": 0.61328125,
"learning_rate": 4.753472222222224e-06,
"loss": 1.2405,
"step": 324
},
{
"epoch": 0.7980356046654389,
"grad_norm": 0.609375,
"learning_rate": 4.752173913043479e-06,
"loss": 1.2106,
"step": 325
},
{
"epoch": 0.8004910988336402,
"grad_norm": 0.609375,
"learning_rate": 4.750871080139373e-06,
"loss": 1.3867,
"step": 326
},
{
"epoch": 0.8029465930018416,
"grad_norm": 0.6171875,
"learning_rate": 4.74956369982548e-06,
"loss": 1.2059,
"step": 327
},
{
"epoch": 0.805402087170043,
"grad_norm": 0.83984375,
"learning_rate": 4.748251748251749e-06,
"loss": 1.3675,
"step": 328
},
{
"epoch": 0.8078575813382444,
"grad_norm": 0.62109375,
"learning_rate": 4.746935201401051e-06,
"loss": 1.2453,
"step": 329
},
{
"epoch": 0.8103130755064457,
"grad_norm": 0.65625,
"learning_rate": 4.74561403508772e-06,
"loss": 1.2978,
"step": 330
},
{
"epoch": 0.8127685696746471,
"grad_norm": 0.6171875,
"learning_rate": 4.744288224956063e-06,
"loss": 1.1665,
"step": 331
},
{
"epoch": 0.8152240638428484,
"grad_norm": 0.6015625,
"learning_rate": 4.742957746478874e-06,
"loss": 1.3244,
"step": 332
},
{
"epoch": 0.8176795580110497,
"grad_norm": 0.6171875,
"learning_rate": 4.7416225749559084e-06,
"loss": 1.4481,
"step": 333
},
{
"epoch": 0.8201350521792511,
"grad_norm": 0.6484375,
"learning_rate": 4.740282685512368e-06,
"loss": 1.2474,
"step": 334
},
{
"epoch": 0.8225905463474524,
"grad_norm": 0.60546875,
"learning_rate": 4.738938053097346e-06,
"loss": 1.2216,
"step": 335
},
{
"epoch": 0.8250460405156538,
"grad_norm": 0.62890625,
"learning_rate": 4.73758865248227e-06,
"loss": 1.3191,
"step": 336
},
{
"epoch": 0.8275015346838551,
"grad_norm": 0.609375,
"learning_rate": 4.736234458259325e-06,
"loss": 1.1618,
"step": 337
},
{
"epoch": 0.8299570288520565,
"grad_norm": 0.6015625,
"learning_rate": 4.734875444839857e-06,
"loss": 1.2559,
"step": 338
},
{
"epoch": 0.8324125230202578,
"grad_norm": 0.6328125,
"learning_rate": 4.733511586452764e-06,
"loss": 1.3254,
"step": 339
},
{
"epoch": 0.8348680171884592,
"grad_norm": 0.57421875,
"learning_rate": 4.732142857142858e-06,
"loss": 1.2175,
"step": 340
},
{
"epoch": 0.8373235113566605,
"grad_norm": 0.6328125,
"learning_rate": 4.730769230769231e-06,
"loss": 1.2715,
"step": 341
},
{
"epoch": 0.8397790055248618,
"grad_norm": 0.61328125,
"learning_rate": 4.729390681003584e-06,
"loss": 1.2113,
"step": 342
},
{
"epoch": 0.8422344996930632,
"grad_norm": 0.609375,
"learning_rate": 4.728007181328546e-06,
"loss": 1.2132,
"step": 343
},
{
"epoch": 0.8446899938612645,
"grad_norm": 0.609375,
"learning_rate": 4.726618705035971e-06,
"loss": 1.2593,
"step": 344
},
{
"epoch": 0.8471454880294659,
"grad_norm": 0.609375,
"learning_rate": 4.725225225225225e-06,
"loss": 1.2435,
"step": 345
},
{
"epoch": 0.8496009821976673,
"grad_norm": 0.58984375,
"learning_rate": 4.723826714801444e-06,
"loss": 1.2349,
"step": 346
},
{
"epoch": 0.8520564763658687,
"grad_norm": 0.609375,
"learning_rate": 4.72242314647378e-06,
"loss": 1.2344,
"step": 347
},
{
"epoch": 0.85451197053407,
"grad_norm": 0.5703125,
"learning_rate": 4.721014492753624e-06,
"loss": 1.1971,
"step": 348
},
{
"epoch": 0.8569674647022714,
"grad_norm": 0.59375,
"learning_rate": 4.7196007259528135e-06,
"loss": 1.2314,
"step": 349
},
{
"epoch": 0.8594229588704727,
"grad_norm": 0.6015625,
"learning_rate": 4.718181818181819e-06,
"loss": 1.261,
"step": 350
},
{
"epoch": 0.861878453038674,
"grad_norm": 0.65625,
"learning_rate": 4.7167577413479055e-06,
"loss": 1.2093,
"step": 351
},
{
"epoch": 0.8643339472068754,
"grad_norm": 0.6015625,
"learning_rate": 4.7153284671532855e-06,
"loss": 1.1604,
"step": 352
},
{
"epoch": 0.8667894413750767,
"grad_norm": 0.76171875,
"learning_rate": 4.713893967093236e-06,
"loss": 1.111,
"step": 353
},
{
"epoch": 0.8692449355432781,
"grad_norm": 0.64453125,
"learning_rate": 4.712454212454213e-06,
"loss": 1.2903,
"step": 354
},
{
"epoch": 0.8717004297114794,
"grad_norm": 0.62890625,
"learning_rate": 4.711009174311927e-06,
"loss": 1.2801,
"step": 355
},
{
"epoch": 0.8741559238796808,
"grad_norm": 0.6484375,
"learning_rate": 4.709558823529412e-06,
"loss": 1.3241,
"step": 356
},
{
"epoch": 0.8766114180478821,
"grad_norm": 0.6171875,
"learning_rate": 4.708103130755065e-06,
"loss": 1.2036,
"step": 357
},
{
"epoch": 0.8790669122160835,
"grad_norm": 0.734375,
"learning_rate": 4.706642066420664e-06,
"loss": 1.2354,
"step": 358
},
{
"epoch": 0.8815224063842848,
"grad_norm": 0.6328125,
"learning_rate": 4.7051756007393715e-06,
"loss": 1.2554,
"step": 359
},
{
"epoch": 0.8839779005524862,
"grad_norm": 0.60546875,
"learning_rate": 4.703703703703704e-06,
"loss": 1.243,
"step": 360
},
{
"epoch": 0.8864333947206875,
"grad_norm": 0.75390625,
"learning_rate": 4.7022263450834885e-06,
"loss": 1.1985,
"step": 361
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.59765625,
"learning_rate": 4.700743494423793e-06,
"loss": 1.2429,
"step": 362
},
{
"epoch": 0.8913443830570903,
"grad_norm": 0.6171875,
"learning_rate": 4.699255121042831e-06,
"loss": 1.2472,
"step": 363
},
{
"epoch": 0.8937998772252916,
"grad_norm": 0.59765625,
"learning_rate": 4.697761194029851e-06,
"loss": 1.2172,
"step": 364
},
{
"epoch": 0.896255371393493,
"grad_norm": 0.609375,
"learning_rate": 4.696261682242991e-06,
"loss": 1.4051,
"step": 365
},
{
"epoch": 0.8987108655616943,
"grad_norm": 0.59765625,
"learning_rate": 4.6947565543071164e-06,
"loss": 1.2184,
"step": 366
},
{
"epoch": 0.9011663597298957,
"grad_norm": 0.609375,
"learning_rate": 4.693245778611632e-06,
"loss": 1.222,
"step": 367
},
{
"epoch": 0.903621853898097,
"grad_norm": 0.63671875,
"learning_rate": 4.691729323308271e-06,
"loss": 1.4323,
"step": 368
},
{
"epoch": 0.9060773480662984,
"grad_norm": 0.6328125,
"learning_rate": 4.690207156308852e-06,
"loss": 1.2835,
"step": 369
},
{
"epoch": 0.9085328422344997,
"grad_norm": 0.6015625,
"learning_rate": 4.68867924528302e-06,
"loss": 1.327,
"step": 370
},
{
"epoch": 0.910988336402701,
"grad_norm": 0.6015625,
"learning_rate": 4.6871455576559546e-06,
"loss": 1.1185,
"step": 371
},
{
"epoch": 0.9134438305709024,
"grad_norm": 0.59765625,
"learning_rate": 4.6856060606060614e-06,
"loss": 1.3439,
"step": 372
},
{
"epoch": 0.9158993247391037,
"grad_norm": 0.59375,
"learning_rate": 4.6840607210626185e-06,
"loss": 1.2999,
"step": 373
},
{
"epoch": 0.9183548189073051,
"grad_norm": 0.65625,
"learning_rate": 4.682509505703422e-06,
"loss": 1.5383,
"step": 374
},
{
"epoch": 0.9208103130755064,
"grad_norm": 0.66015625,
"learning_rate": 4.680952380952381e-06,
"loss": 1.4458,
"step": 375
},
{
"epoch": 0.9232658072437078,
"grad_norm": 0.57421875,
"learning_rate": 4.6793893129771e-06,
"loss": 1.3359,
"step": 376
},
{
"epoch": 0.9257213014119091,
"grad_norm": 0.625,
"learning_rate": 4.677820267686425e-06,
"loss": 1.1187,
"step": 377
},
{
"epoch": 0.9281767955801105,
"grad_norm": 0.60546875,
"learning_rate": 4.67624521072797e-06,
"loss": 1.2834,
"step": 378
},
{
"epoch": 0.9306322897483118,
"grad_norm": 0.63671875,
"learning_rate": 4.674664107485605e-06,
"loss": 1.2328,
"step": 379
},
{
"epoch": 0.9330877839165131,
"grad_norm": 0.81640625,
"learning_rate": 4.673076923076924e-06,
"loss": 1.2323,
"step": 380
},
{
"epoch": 0.9355432780847146,
"grad_norm": 0.640625,
"learning_rate": 4.671483622350675e-06,
"loss": 1.1229,
"step": 381
},
{
"epoch": 0.937998772252916,
"grad_norm": 0.64453125,
"learning_rate": 4.66988416988417e-06,
"loss": 1.366,
"step": 382
},
{
"epoch": 0.9404542664211173,
"grad_norm": 0.58984375,
"learning_rate": 4.668278529980658e-06,
"loss": 1.2033,
"step": 383
},
{
"epoch": 0.9429097605893186,
"grad_norm": 0.6015625,
"learning_rate": 4.666666666666668e-06,
"loss": 1.2062,
"step": 384
},
{
"epoch": 0.94536525475752,
"grad_norm": 0.6171875,
"learning_rate": 4.66504854368932e-06,
"loss": 1.3482,
"step": 385
},
{
"epoch": 0.9478207489257213,
"grad_norm": 0.625,
"learning_rate": 4.6634241245136196e-06,
"loss": 1.3237,
"step": 386
},
{
"epoch": 0.9502762430939227,
"grad_norm": 0.59765625,
"learning_rate": 4.661793372319688e-06,
"loss": 1.1933,
"step": 387
},
{
"epoch": 0.952731737262124,
"grad_norm": 0.671875,
"learning_rate": 4.66015625e-06,
"loss": 1.2976,
"step": 388
},
{
"epoch": 0.9551872314303254,
"grad_norm": 0.58203125,
"learning_rate": 4.658512720156556e-06,
"loss": 1.1236,
"step": 389
},
{
"epoch": 0.9576427255985267,
"grad_norm": 0.609375,
"learning_rate": 4.65686274509804e-06,
"loss": 1.3531,
"step": 390
},
{
"epoch": 0.960098219766728,
"grad_norm": 0.6171875,
"learning_rate": 4.6552062868369355e-06,
"loss": 1.3564,
"step": 391
},
{
"epoch": 0.9625537139349294,
"grad_norm": 0.640625,
"learning_rate": 4.653543307086615e-06,
"loss": 1.289,
"step": 392
},
{
"epoch": 0.9650092081031307,
"grad_norm": 0.609375,
"learning_rate": 4.651873767258383e-06,
"loss": 1.4734,
"step": 393
},
{
"epoch": 0.9674647022713321,
"grad_norm": 0.578125,
"learning_rate": 4.650197628458498e-06,
"loss": 1.2798,
"step": 394
},
{
"epoch": 0.9699201964395334,
"grad_norm": 0.640625,
"learning_rate": 4.648514851485149e-06,
"loss": 1.2872,
"step": 395
},
{
"epoch": 0.9723756906077348,
"grad_norm": 0.61328125,
"learning_rate": 4.646825396825397e-06,
"loss": 1.315,
"step": 396
},
{
"epoch": 0.9748311847759361,
"grad_norm": 0.62890625,
"learning_rate": 4.645129224652087e-06,
"loss": 1.2918,
"step": 397
},
{
"epoch": 0.9772866789441375,
"grad_norm": 0.59765625,
"learning_rate": 4.643426294820717e-06,
"loss": 1.4759,
"step": 398
},
{
"epoch": 0.9797421731123389,
"grad_norm": 0.59375,
"learning_rate": 4.6417165668662675e-06,
"loss": 1.3303,
"step": 399
},
{
"epoch": 0.9821976672805403,
"grad_norm": 0.57421875,
"learning_rate": 4.6400000000000005e-06,
"loss": 1.1404,
"step": 400
},
{
"epoch": 0.9846531614487416,
"grad_norm": 0.625,
"learning_rate": 4.638276553106213e-06,
"loss": 1.2628,
"step": 401
},
{
"epoch": 0.9871086556169429,
"grad_norm": 0.64453125,
"learning_rate": 4.636546184738957e-06,
"loss": 1.336,
"step": 402
},
{
"epoch": 0.9895641497851443,
"grad_norm": 0.64453125,
"learning_rate": 4.634808853118712e-06,
"loss": 1.31,
"step": 403
},
{
"epoch": 0.9920196439533456,
"grad_norm": 0.90625,
"learning_rate": 4.633064516129032e-06,
"loss": 1.3131,
"step": 404
},
{
"epoch": 0.994475138121547,
"grad_norm": 0.640625,
"learning_rate": 4.6313131313131315e-06,
"loss": 1.5459,
"step": 405
},
{
"epoch": 0.9969306322897483,
"grad_norm": 0.609375,
"learning_rate": 4.629554655870445e-06,
"loss": 1.4008,
"step": 406
},
{
"epoch": 0.9993861264579497,
"grad_norm": 0.59765625,
"learning_rate": 4.627789046653145e-06,
"loss": 1.2619,
"step": 407
},
{
"epoch": 1.0,
"grad_norm": 1.2265625,
"learning_rate": 4.626016260162602e-06,
"loss": 1.6531,
"step": 408
}
],
"logging_steps": 1,
"max_steps": 814,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 204,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.3184366687420416e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}