diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8001 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5687, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008791981712678037, + "grad_norm": 10.053899765014648, + "learning_rate": 5.847953216374269e-07, + "loss": 0.1487, + "step": 5 + }, + { + "epoch": 0.0017583963425356075, + "grad_norm": 1371.18408203125, + "learning_rate": 1.1695906432748538e-06, + "loss": 0.154, + "step": 10 + }, + { + "epoch": 0.0026375945138034113, + "grad_norm": 4331.09814453125, + "learning_rate": 1.7543859649122807e-06, + "loss": 0.1485, + "step": 15 + }, + { + "epoch": 0.003516792685071215, + "grad_norm": 50.946231842041016, + "learning_rate": 2.3391812865497075e-06, + "loss": 0.082, + "step": 20 + }, + { + "epoch": 0.004395990856339019, + "grad_norm": 130.27598571777344, + "learning_rate": 2.9239766081871347e-06, + "loss": 0.1029, + "step": 25 + }, + { + "epoch": 0.005275189027606823, + "grad_norm": 312.9079895019531, + "learning_rate": 3.5087719298245615e-06, + "loss": 0.0983, + "step": 30 + }, + { + "epoch": 0.006154387198874627, + "grad_norm": 828.6588134765625, + "learning_rate": 4.093567251461989e-06, + "loss": 0.116, + "step": 35 + }, + { + "epoch": 0.00703358537014243, + "grad_norm": 95.5357666015625, + "learning_rate": 4.678362573099415e-06, + "loss": 0.1025, + "step": 40 + }, + { + "epoch": 0.007912783541410234, + "grad_norm": 46.62782669067383, + "learning_rate": 5.263157894736842e-06, + "loss": 0.0929, + "step": 45 + }, + { + "epoch": 0.008791981712678037, + "grad_norm": 24.908008575439453, + "learning_rate": 5.847953216374269e-06, + "loss": 0.0911, + "step": 50 + }, + { + "epoch": 0.009671179883945842, + "grad_norm": 42.72391891479492, + "learning_rate": 6.432748538011696e-06, + "loss": 0.1051, + "step": 55 + }, + { + "epoch": 0.010550378055213645, + "grad_norm": 1.5553064346313477, + "learning_rate": 7.017543859649123e-06, + "loss": 0.0939, + "step": 60 + }, + { + "epoch": 0.011429576226481448, + "grad_norm": 840.0651245117188, + "learning_rate": 7.60233918128655e-06, + "loss": 0.0739, + "step": 65 + }, + { + "epoch": 0.012308774397749253, + "grad_norm": 4780.78466796875, + "learning_rate": 8.187134502923977e-06, + "loss": 0.1114, + "step": 70 + }, + { + "epoch": 0.013187972569017057, + "grad_norm": 36.51594161987305, + "learning_rate": 8.771929824561405e-06, + "loss": 0.1356, + "step": 75 + }, + { + "epoch": 0.01406717074028486, + "grad_norm": 6.5139970779418945, + "learning_rate": 9.35672514619883e-06, + "loss": 0.0977, + "step": 80 + }, + { + "epoch": 0.014946368911552665, + "grad_norm": 63.903018951416016, + "learning_rate": 9.941520467836257e-06, + "loss": 0.1117, + "step": 85 + }, + { + "epoch": 0.015825567082820468, + "grad_norm": 18.261695861816406, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.0912, + "step": 90 + }, + { + "epoch": 0.01670476525408827, + "grad_norm": 246.80682373046875, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.1014, + "step": 95 + }, + { + "epoch": 0.017583963425356074, + "grad_norm": 432.3685607910156, + "learning_rate": 1.1695906432748539e-05, + "loss": 0.0915, + "step": 100 + }, + { + "epoch": 0.018463161596623878, + "grad_norm": 1106.1822509765625, + "learning_rate": 1.2280701754385966e-05, + "loss": 0.0993, + "step": 105 + }, + { + "epoch": 0.019342359767891684, + "grad_norm": 39.593421936035156, + "learning_rate": 1.2865497076023392e-05, + "loss": 0.157, + "step": 110 + }, + { + "epoch": 0.020221557939159487, + "grad_norm": 3.5536696910858154, + "learning_rate": 1.345029239766082e-05, + "loss": 0.1274, + "step": 115 + }, + { + "epoch": 0.02110075611042729, + "grad_norm": 1.2601370811462402, + "learning_rate": 1.4035087719298246e-05, + "loss": 0.0806, + "step": 120 + }, + { + "epoch": 0.021979954281695094, + "grad_norm": 1.1521034240722656, + "learning_rate": 1.4619883040935675e-05, + "loss": 0.0785, + "step": 125 + }, + { + "epoch": 0.022859152452962897, + "grad_norm": 0.7003596425056458, + "learning_rate": 1.52046783625731e-05, + "loss": 0.0819, + "step": 130 + }, + { + "epoch": 0.0237383506242307, + "grad_norm": 0.29950231313705444, + "learning_rate": 1.578947368421053e-05, + "loss": 0.0825, + "step": 135 + }, + { + "epoch": 0.024617548795498507, + "grad_norm": 1.1281251907348633, + "learning_rate": 1.6374269005847955e-05, + "loss": 0.105, + "step": 140 + }, + { + "epoch": 0.02549674696676631, + "grad_norm": 0.8757471442222595, + "learning_rate": 1.695906432748538e-05, + "loss": 0.1029, + "step": 145 + }, + { + "epoch": 0.026375945138034113, + "grad_norm": 1.9525190591812134, + "learning_rate": 1.754385964912281e-05, + "loss": 0.0708, + "step": 150 + }, + { + "epoch": 0.027255143309301916, + "grad_norm": 1.9676127433776855, + "learning_rate": 1.8128654970760235e-05, + "loss": 0.0863, + "step": 155 + }, + { + "epoch": 0.02813434148056972, + "grad_norm": 0.7372169494628906, + "learning_rate": 1.871345029239766e-05, + "loss": 0.0668, + "step": 160 + }, + { + "epoch": 0.029013539651837523, + "grad_norm": 0.6957089900970459, + "learning_rate": 1.929824561403509e-05, + "loss": 0.0688, + "step": 165 + }, + { + "epoch": 0.02989273782310533, + "grad_norm": 0.7962855696678162, + "learning_rate": 1.9883040935672515e-05, + "loss": 0.0619, + "step": 170 + }, + { + "epoch": 0.030771935994373133, + "grad_norm": 0.6682185530662537, + "learning_rate": 1.999997404978087e-05, + "loss": 0.0698, + "step": 175 + }, + { + "epoch": 0.031651134165640936, + "grad_norm": 0.7355603575706482, + "learning_rate": 1.999986862724647e-05, + "loss": 0.078, + "step": 180 + }, + { + "epoch": 0.03253033233690874, + "grad_norm": 0.4755054712295532, + "learning_rate": 1.9999682111362368e-05, + "loss": 0.0732, + "step": 185 + }, + { + "epoch": 0.03340953050817654, + "grad_norm": 0.4909241497516632, + "learning_rate": 1.9999414503641103e-05, + "loss": 0.0631, + "step": 190 + }, + { + "epoch": 0.034288728679444345, + "grad_norm": 1.0899882316589355, + "learning_rate": 1.9999065806252828e-05, + "loss": 0.0692, + "step": 195 + }, + { + "epoch": 0.03516792685071215, + "grad_norm": 0.263811320066452, + "learning_rate": 1.999863602202528e-05, + "loss": 0.0696, + "step": 200 + }, + { + "epoch": 0.03604712502197995, + "grad_norm": 1.9172215461730957, + "learning_rate": 1.999812515444377e-05, + "loss": 0.079, + "step": 205 + }, + { + "epoch": 0.036926323193247755, + "grad_norm": 1.79094660282135, + "learning_rate": 1.9997533207651147e-05, + "loss": 0.0627, + "step": 210 + }, + { + "epoch": 0.037805521364515565, + "grad_norm": 0.3693501651287079, + "learning_rate": 1.999686018644777e-05, + "loss": 0.0778, + "step": 215 + }, + { + "epoch": 0.03868471953578337, + "grad_norm": 0.18116237223148346, + "learning_rate": 1.999610609629147e-05, + "loss": 0.0643, + "step": 220 + }, + { + "epoch": 0.03956391770705117, + "grad_norm": 0.5909445881843567, + "learning_rate": 1.999527094329749e-05, + "loss": 0.0689, + "step": 225 + }, + { + "epoch": 0.040443115878318975, + "grad_norm": 0.4016267955303192, + "learning_rate": 1.9994354734238456e-05, + "loss": 0.0589, + "step": 230 + }, + { + "epoch": 0.04132231404958678, + "grad_norm": 0.8470014929771423, + "learning_rate": 1.9993357476544314e-05, + "loss": 0.0714, + "step": 235 + }, + { + "epoch": 0.04220151222085458, + "grad_norm": 1.2889784574508667, + "learning_rate": 1.9992279178302266e-05, + "loss": 0.0759, + "step": 240 + }, + { + "epoch": 0.043080710392122384, + "grad_norm": 1.695059061050415, + "learning_rate": 1.9991119848256708e-05, + "loss": 0.0582, + "step": 245 + }, + { + "epoch": 0.04395990856339019, + "grad_norm": 0.7226565480232239, + "learning_rate": 1.998987949580916e-05, + "loss": 0.0802, + "step": 250 + }, + { + "epoch": 0.04483910673465799, + "grad_norm": 0.513992965221405, + "learning_rate": 1.9988558131018188e-05, + "loss": 0.0747, + "step": 255 + }, + { + "epoch": 0.045718304905925794, + "grad_norm": 0.8010172247886658, + "learning_rate": 1.998715576459932e-05, + "loss": 0.0779, + "step": 260 + }, + { + "epoch": 0.0465975030771936, + "grad_norm": 0.6723889112472534, + "learning_rate": 1.9985672407924966e-05, + "loss": 0.0778, + "step": 265 + }, + { + "epoch": 0.0474767012484614, + "grad_norm": 0.5232120752334595, + "learning_rate": 1.998410807302432e-05, + "loss": 0.0606, + "step": 270 + }, + { + "epoch": 0.048355899419729204, + "grad_norm": 1.1310707330703735, + "learning_rate": 1.9982462772583267e-05, + "loss": 0.0786, + "step": 275 + }, + { + "epoch": 0.049235097590997014, + "grad_norm": 0.42932379245758057, + "learning_rate": 1.998073651994427e-05, + "loss": 0.0674, + "step": 280 + }, + { + "epoch": 0.05011429576226482, + "grad_norm": 0.30086904764175415, + "learning_rate": 1.997892932910628e-05, + "loss": 0.0662, + "step": 285 + }, + { + "epoch": 0.05099349393353262, + "grad_norm": 0.3778522312641144, + "learning_rate": 1.9977041214724594e-05, + "loss": 0.077, + "step": 290 + }, + { + "epoch": 0.05187269210480042, + "grad_norm": 1.5126148462295532, + "learning_rate": 1.997507219211078e-05, + "loss": 0.073, + "step": 295 + }, + { + "epoch": 0.052751890276068227, + "grad_norm": 0.4894915223121643, + "learning_rate": 1.99730222772325e-05, + "loss": 0.0705, + "step": 300 + }, + { + "epoch": 0.05363108844733603, + "grad_norm": 0.6623161435127258, + "learning_rate": 1.9970891486713423e-05, + "loss": 0.0583, + "step": 305 + }, + { + "epoch": 0.05451028661860383, + "grad_norm": 0.6160693764686584, + "learning_rate": 1.9968679837833075e-05, + "loss": 0.061, + "step": 310 + }, + { + "epoch": 0.055389484789871636, + "grad_norm": 1.3512332439422607, + "learning_rate": 1.9966387348526682e-05, + "loss": 0.0609, + "step": 315 + }, + { + "epoch": 0.05626868296113944, + "grad_norm": 0.7443258166313171, + "learning_rate": 1.9964014037385065e-05, + "loss": 0.0605, + "step": 320 + }, + { + "epoch": 0.05714788113240724, + "grad_norm": 0.47612714767456055, + "learning_rate": 1.996155992365444e-05, + "loss": 0.0631, + "step": 325 + }, + { + "epoch": 0.058027079303675046, + "grad_norm": 1.439274787902832, + "learning_rate": 1.9959025027236305e-05, + "loss": 0.0687, + "step": 330 + }, + { + "epoch": 0.05890627747494285, + "grad_norm": 0.7117618322372437, + "learning_rate": 1.9956409368687257e-05, + "loss": 0.0714, + "step": 335 + }, + { + "epoch": 0.05978547564621066, + "grad_norm": 0.6142310500144958, + "learning_rate": 1.995371296921882e-05, + "loss": 0.0672, + "step": 340 + }, + { + "epoch": 0.06066467381747846, + "grad_norm": 1.082131028175354, + "learning_rate": 1.9950935850697288e-05, + "loss": 0.0879, + "step": 345 + }, + { + "epoch": 0.061543871988746265, + "grad_norm": 0.35354727506637573, + "learning_rate": 1.9948078035643546e-05, + "loss": 0.0799, + "step": 350 + }, + { + "epoch": 0.06242307016001407, + "grad_norm": 0.2982726991176605, + "learning_rate": 1.9945139547232872e-05, + "loss": 0.0764, + "step": 355 + }, + { + "epoch": 0.06330226833128187, + "grad_norm": 1.1916660070419312, + "learning_rate": 1.9942120409294768e-05, + "loss": 0.0742, + "step": 360 + }, + { + "epoch": 0.06418146650254968, + "grad_norm": 1.0965343713760376, + "learning_rate": 1.9939020646312764e-05, + "loss": 0.0634, + "step": 365 + }, + { + "epoch": 0.06506066467381748, + "grad_norm": 0.46244287490844727, + "learning_rate": 1.9935840283424196e-05, + "loss": 0.0711, + "step": 370 + }, + { + "epoch": 0.06593986284508528, + "grad_norm": 0.1318541318178177, + "learning_rate": 1.993257934642004e-05, + "loss": 0.0591, + "step": 375 + }, + { + "epoch": 0.06681906101635308, + "grad_norm": 0.5300299525260925, + "learning_rate": 1.9929237861744663e-05, + "loss": 0.0712, + "step": 380 + }, + { + "epoch": 0.06769825918762089, + "grad_norm": 1.014757752418518, + "learning_rate": 1.9925815856495646e-05, + "loss": 0.0612, + "step": 385 + }, + { + "epoch": 0.06857745735888869, + "grad_norm": 0.24749091267585754, + "learning_rate": 1.992231335842354e-05, + "loss": 0.077, + "step": 390 + }, + { + "epoch": 0.0694566555301565, + "grad_norm": 0.5739014148712158, + "learning_rate": 1.9918730395931648e-05, + "loss": 0.0618, + "step": 395 + }, + { + "epoch": 0.0703358537014243, + "grad_norm": 0.23715724050998688, + "learning_rate": 1.9915066998075797e-05, + "loss": 0.0563, + "step": 400 + }, + { + "epoch": 0.0712150518726921, + "grad_norm": 0.5633426904678345, + "learning_rate": 1.9911323194564095e-05, + "loss": 0.054, + "step": 405 + }, + { + "epoch": 0.0720942500439599, + "grad_norm": 0.4382643401622772, + "learning_rate": 1.9907499015756696e-05, + "loss": 0.0561, + "step": 410 + }, + { + "epoch": 0.07297344821522771, + "grad_norm": 0.4218790829181671, + "learning_rate": 1.9903594492665557e-05, + "loss": 0.0466, + "step": 415 + }, + { + "epoch": 0.07385264638649551, + "grad_norm": 0.8700308203697205, + "learning_rate": 1.9899609656954183e-05, + "loss": 0.0652, + "step": 420 + }, + { + "epoch": 0.07473184455776331, + "grad_norm": 0.1704479455947876, + "learning_rate": 1.9895544540937358e-05, + "loss": 0.0494, + "step": 425 + }, + { + "epoch": 0.07561104272903113, + "grad_norm": 0.6057877540588379, + "learning_rate": 1.989139917758091e-05, + "loss": 0.0494, + "step": 430 + }, + { + "epoch": 0.07649024090029893, + "grad_norm": 1.0760382413864136, + "learning_rate": 1.9887173600501414e-05, + "loss": 0.0767, + "step": 435 + }, + { + "epoch": 0.07736943907156674, + "grad_norm": 0.42263808846473694, + "learning_rate": 1.988286784396594e-05, + "loss": 0.0666, + "step": 440 + }, + { + "epoch": 0.07824863724283454, + "grad_norm": 0.13608968257904053, + "learning_rate": 1.987848194289178e-05, + "loss": 0.0663, + "step": 445 + }, + { + "epoch": 0.07912783541410234, + "grad_norm": 0.20840178430080414, + "learning_rate": 1.987401593284613e-05, + "loss": 0.0814, + "step": 450 + }, + { + "epoch": 0.08000703358537015, + "grad_norm": 1.5564632415771484, + "learning_rate": 1.9869469850045845e-05, + "loss": 0.0733, + "step": 455 + }, + { + "epoch": 0.08088623175663795, + "grad_norm": 0.3628084063529968, + "learning_rate": 1.9864843731357108e-05, + "loss": 0.0668, + "step": 460 + }, + { + "epoch": 0.08176542992790575, + "grad_norm": 6.541281223297119, + "learning_rate": 1.986013761429517e-05, + "loss": 0.0575, + "step": 465 + }, + { + "epoch": 0.08264462809917356, + "grad_norm": 0.3698543310165405, + "learning_rate": 1.9855351537024004e-05, + "loss": 0.0686, + "step": 470 + }, + { + "epoch": 0.08352382627044136, + "grad_norm": 0.21145962178707123, + "learning_rate": 1.9850485538356026e-05, + "loss": 0.0693, + "step": 475 + }, + { + "epoch": 0.08440302444170916, + "grad_norm": 0.718197226524353, + "learning_rate": 1.9845539657751768e-05, + "loss": 0.0577, + "step": 480 + }, + { + "epoch": 0.08528222261297697, + "grad_norm": 1.4340827465057373, + "learning_rate": 1.9840513935319557e-05, + "loss": 0.056, + "step": 485 + }, + { + "epoch": 0.08616142078424477, + "grad_norm": 2.368858814239502, + "learning_rate": 1.98354084118152e-05, + "loss": 0.0674, + "step": 490 + }, + { + "epoch": 0.08704061895551257, + "grad_norm": 0.6914955973625183, + "learning_rate": 1.9830223128641636e-05, + "loss": 0.0646, + "step": 495 + }, + { + "epoch": 0.08791981712678038, + "grad_norm": 0.5653345584869385, + "learning_rate": 1.9824958127848618e-05, + "loss": 0.0868, + "step": 500 + }, + { + "epoch": 0.08879901529804818, + "grad_norm": 0.6143190860748291, + "learning_rate": 1.9819613452132365e-05, + "loss": 0.0524, + "step": 505 + }, + { + "epoch": 0.08967821346931598, + "grad_norm": 0.9025689363479614, + "learning_rate": 1.9814189144835205e-05, + "loss": 0.0646, + "step": 510 + }, + { + "epoch": 0.09055741164058378, + "grad_norm": 1.0996524095535278, + "learning_rate": 1.9808685249945245e-05, + "loss": 0.0686, + "step": 515 + }, + { + "epoch": 0.09143660981185159, + "grad_norm": 1.0614774227142334, + "learning_rate": 1.9803101812096e-05, + "loss": 0.0636, + "step": 520 + }, + { + "epoch": 0.09231580798311939, + "grad_norm": 0.67917799949646, + "learning_rate": 1.9797438876566027e-05, + "loss": 0.0623, + "step": 525 + }, + { + "epoch": 0.0931950061543872, + "grad_norm": 0.29005610942840576, + "learning_rate": 1.9791696489278578e-05, + "loss": 0.059, + "step": 530 + }, + { + "epoch": 0.094074204325655, + "grad_norm": 0.6829861402511597, + "learning_rate": 1.97858746968012e-05, + "loss": 0.0797, + "step": 535 + }, + { + "epoch": 0.0949534024969228, + "grad_norm": 0.22500386834144592, + "learning_rate": 1.9779973546345385e-05, + "loss": 0.0673, + "step": 540 + }, + { + "epoch": 0.0958326006681906, + "grad_norm": 1.5297006368637085, + "learning_rate": 1.9773993085766163e-05, + "loss": 0.062, + "step": 545 + }, + { + "epoch": 0.09671179883945841, + "grad_norm": 0.35818400979042053, + "learning_rate": 1.976793336356173e-05, + "loss": 0.0627, + "step": 550 + }, + { + "epoch": 0.09759099701072622, + "grad_norm": 1.0418643951416016, + "learning_rate": 1.976179442887305e-05, + "loss": 0.0756, + "step": 555 + }, + { + "epoch": 0.09847019518199403, + "grad_norm": 1.5865000486373901, + "learning_rate": 1.9755576331483453e-05, + "loss": 0.0577, + "step": 560 + }, + { + "epoch": 0.09934939335326183, + "grad_norm": 0.43606239557266235, + "learning_rate": 1.9749279121818235e-05, + "loss": 0.0642, + "step": 565 + }, + { + "epoch": 0.10022859152452963, + "grad_norm": 0.1087045669555664, + "learning_rate": 1.9742902850944257e-05, + "loss": 0.0667, + "step": 570 + }, + { + "epoch": 0.10110778969579744, + "grad_norm": 0.4932880103588104, + "learning_rate": 1.9736447570569503e-05, + "loss": 0.0643, + "step": 575 + }, + { + "epoch": 0.10198698786706524, + "grad_norm": 0.28585073351860046, + "learning_rate": 1.97299133330427e-05, + "loss": 0.0619, + "step": 580 + }, + { + "epoch": 0.10286618603833304, + "grad_norm": 0.1778407096862793, + "learning_rate": 1.9723300191352866e-05, + "loss": 0.0482, + "step": 585 + }, + { + "epoch": 0.10374538420960085, + "grad_norm": 0.35073766112327576, + "learning_rate": 1.971660819912888e-05, + "loss": 0.075, + "step": 590 + }, + { + "epoch": 0.10462458238086865, + "grad_norm": 0.19325245916843414, + "learning_rate": 1.9709837410639062e-05, + "loss": 0.0629, + "step": 595 + }, + { + "epoch": 0.10550378055213645, + "grad_norm": 0.5602083802223206, + "learning_rate": 1.9702987880790733e-05, + "loss": 0.0537, + "step": 600 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 0.6220573782920837, + "learning_rate": 1.969605966512975e-05, + "loss": 0.0735, + "step": 605 + }, + { + "epoch": 0.10726217689467206, + "grad_norm": 0.7392856478691101, + "learning_rate": 1.968905281984007e-05, + "loss": 0.0567, + "step": 610 + }, + { + "epoch": 0.10814137506593986, + "grad_norm": 0.4744727909564972, + "learning_rate": 1.9681967401743297e-05, + "loss": 0.0668, + "step": 615 + }, + { + "epoch": 0.10902057323720767, + "grad_norm": 1.1823076009750366, + "learning_rate": 1.9674803468298216e-05, + "loss": 0.0613, + "step": 620 + }, + { + "epoch": 0.10989977140847547, + "grad_norm": 0.7359253764152527, + "learning_rate": 1.9667561077600325e-05, + "loss": 0.0633, + "step": 625 + }, + { + "epoch": 0.11077896957974327, + "grad_norm": 0.49054092168807983, + "learning_rate": 1.966024028838137e-05, + "loss": 0.0571, + "step": 630 + }, + { + "epoch": 0.11165816775101108, + "grad_norm": 0.3266174793243408, + "learning_rate": 1.965284116000886e-05, + "loss": 0.0717, + "step": 635 + }, + { + "epoch": 0.11253736592227888, + "grad_norm": 0.2428748905658722, + "learning_rate": 1.9645363752485594e-05, + "loss": 0.0805, + "step": 640 + }, + { + "epoch": 0.11341656409354668, + "grad_norm": 0.27535462379455566, + "learning_rate": 1.963780812644917e-05, + "loss": 0.0632, + "step": 645 + }, + { + "epoch": 0.11429576226481448, + "grad_norm": 1.5982582569122314, + "learning_rate": 1.9630174343171498e-05, + "loss": 0.0574, + "step": 650 + }, + { + "epoch": 0.11517496043608229, + "grad_norm": 0.09966005384922028, + "learning_rate": 1.9622462464558296e-05, + "loss": 0.0505, + "step": 655 + }, + { + "epoch": 0.11605415860735009, + "grad_norm": 0.3889922797679901, + "learning_rate": 1.9614672553148592e-05, + "loss": 0.0598, + "step": 660 + }, + { + "epoch": 0.1169333567786179, + "grad_norm": 1.1012970209121704, + "learning_rate": 1.9606804672114217e-05, + "loss": 0.0618, + "step": 665 + }, + { + "epoch": 0.1178125549498857, + "grad_norm": 0.4842506945133209, + "learning_rate": 1.959885888525929e-05, + "loss": 0.065, + "step": 670 + }, + { + "epoch": 0.11869175312115351, + "grad_norm": 0.5499223470687866, + "learning_rate": 1.9590835257019715e-05, + "loss": 0.0575, + "step": 675 + }, + { + "epoch": 0.11957095129242132, + "grad_norm": 0.9636365175247192, + "learning_rate": 1.9582733852462623e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.12045014946368912, + "grad_norm": 0.5249933004379272, + "learning_rate": 1.9574554737285885e-05, + "loss": 0.0594, + "step": 685 + }, + { + "epoch": 0.12132934763495692, + "grad_norm": 0.8125589489936829, + "learning_rate": 1.956629797781756e-05, + "loss": 0.0652, + "step": 690 + }, + { + "epoch": 0.12220854580622473, + "grad_norm": 0.3194701373577118, + "learning_rate": 1.955796364101535e-05, + "loss": 0.0634, + "step": 695 + }, + { + "epoch": 0.12308774397749253, + "grad_norm": 0.3122730851173401, + "learning_rate": 1.954955179446608e-05, + "loss": 0.0577, + "step": 700 + }, + { + "epoch": 0.12396694214876033, + "grad_norm": 0.546394407749176, + "learning_rate": 1.9541062506385116e-05, + "loss": 0.0635, + "step": 705 + }, + { + "epoch": 0.12484614032002814, + "grad_norm": 0.6376326680183411, + "learning_rate": 1.9532495845615854e-05, + "loss": 0.0702, + "step": 710 + }, + { + "epoch": 0.12572533849129594, + "grad_norm": 0.5695558786392212, + "learning_rate": 1.9523851881629124e-05, + "loss": 0.0613, + "step": 715 + }, + { + "epoch": 0.12660453666256374, + "grad_norm": 0.5965823531150818, + "learning_rate": 1.9515130684522647e-05, + "loss": 0.0652, + "step": 720 + }, + { + "epoch": 0.12748373483383155, + "grad_norm": 0.4935171604156494, + "learning_rate": 1.950633232502046e-05, + "loss": 0.063, + "step": 725 + }, + { + "epoch": 0.12836293300509935, + "grad_norm": 0.5116500854492188, + "learning_rate": 1.9497456874472346e-05, + "loss": 0.0552, + "step": 730 + }, + { + "epoch": 0.12924213117636715, + "grad_norm": 0.5821178555488586, + "learning_rate": 1.9488504404853247e-05, + "loss": 0.0591, + "step": 735 + }, + { + "epoch": 0.13012132934763496, + "grad_norm": 0.31190237402915955, + "learning_rate": 1.94794749887627e-05, + "loss": 0.0755, + "step": 740 + }, + { + "epoch": 0.13100052751890276, + "grad_norm": 0.6998320817947388, + "learning_rate": 1.947036869942422e-05, + "loss": 0.086, + "step": 745 + }, + { + "epoch": 0.13187972569017056, + "grad_norm": 0.5217974185943604, + "learning_rate": 1.9461185610684736e-05, + "loss": 0.0602, + "step": 750 + }, + { + "epoch": 0.13275892386143837, + "grad_norm": 1.050721287727356, + "learning_rate": 1.9451925797013955e-05, + "loss": 0.0698, + "step": 755 + }, + { + "epoch": 0.13363812203270617, + "grad_norm": 0.2586376965045929, + "learning_rate": 1.9442589333503806e-05, + "loss": 0.0622, + "step": 760 + }, + { + "epoch": 0.13451732020397397, + "grad_norm": 0.5251173377037048, + "learning_rate": 1.9433176295867792e-05, + "loss": 0.0567, + "step": 765 + }, + { + "epoch": 0.13539651837524178, + "grad_norm": 0.4066588878631592, + "learning_rate": 1.9423686760440386e-05, + "loss": 0.0548, + "step": 770 + }, + { + "epoch": 0.13627571654650958, + "grad_norm": 0.5858006477355957, + "learning_rate": 1.9414120804176427e-05, + "loss": 0.0607, + "step": 775 + }, + { + "epoch": 0.13715491471777738, + "grad_norm": 0.9163162112236023, + "learning_rate": 1.9404478504650473e-05, + "loss": 0.0575, + "step": 780 + }, + { + "epoch": 0.13803411288904519, + "grad_norm": 0.274795264005661, + "learning_rate": 1.939475994005619e-05, + "loss": 0.07, + "step": 785 + }, + { + "epoch": 0.138913311060313, + "grad_norm": 0.1485268473625183, + "learning_rate": 1.938496518920571e-05, + "loss": 0.067, + "step": 790 + }, + { + "epoch": 0.1397925092315808, + "grad_norm": 0.57244473695755, + "learning_rate": 1.937509433152899e-05, + "loss": 0.0742, + "step": 795 + }, + { + "epoch": 0.1406717074028486, + "grad_norm": 0.9547910690307617, + "learning_rate": 1.9365147447073172e-05, + "loss": 0.0655, + "step": 800 + }, + { + "epoch": 0.1415509055741164, + "grad_norm": 0.8948219418525696, + "learning_rate": 1.9355124616501936e-05, + "loss": 0.0714, + "step": 805 + }, + { + "epoch": 0.1424301037453842, + "grad_norm": 0.7073503136634827, + "learning_rate": 1.934502592109484e-05, + "loss": 0.0646, + "step": 810 + }, + { + "epoch": 0.143309301916652, + "grad_norm": 0.38196781277656555, + "learning_rate": 1.9334851442746665e-05, + "loss": 0.0606, + "step": 815 + }, + { + "epoch": 0.1441885000879198, + "grad_norm": 0.22767876088619232, + "learning_rate": 1.9324601263966746e-05, + "loss": 0.0586, + "step": 820 + }, + { + "epoch": 0.1450676982591876, + "grad_norm": 0.44821423292160034, + "learning_rate": 1.9314275467878304e-05, + "loss": 0.0671, + "step": 825 + }, + { + "epoch": 0.14594689643045541, + "grad_norm": 0.32358282804489136, + "learning_rate": 1.9303874138217788e-05, + "loss": 0.0535, + "step": 830 + }, + { + "epoch": 0.14682609460172322, + "grad_norm": 0.39932888746261597, + "learning_rate": 1.9293397359334167e-05, + "loss": 0.0553, + "step": 835 + }, + { + "epoch": 0.14770529277299102, + "grad_norm": 0.160264790058136, + "learning_rate": 1.9282845216188267e-05, + "loss": 0.0583, + "step": 840 + }, + { + "epoch": 0.14858449094425882, + "grad_norm": 0.5190912485122681, + "learning_rate": 1.9272217794352073e-05, + "loss": 0.0716, + "step": 845 + }, + { + "epoch": 0.14946368911552663, + "grad_norm": 0.5016174912452698, + "learning_rate": 1.9261515180008047e-05, + "loss": 0.0668, + "step": 850 + }, + { + "epoch": 0.15034288728679443, + "grad_norm": 0.12489809095859528, + "learning_rate": 1.9250737459948404e-05, + "loss": 0.0619, + "step": 855 + }, + { + "epoch": 0.15122208545806226, + "grad_norm": 1.153669834136963, + "learning_rate": 1.923988472157445e-05, + "loss": 0.0779, + "step": 860 + }, + { + "epoch": 0.15210128362933006, + "grad_norm": 0.2374788522720337, + "learning_rate": 1.9228957052895816e-05, + "loss": 0.0677, + "step": 865 + }, + { + "epoch": 0.15298048180059787, + "grad_norm": 1.060134768486023, + "learning_rate": 1.92179545425298e-05, + "loss": 0.0632, + "step": 870 + }, + { + "epoch": 0.15385967997186567, + "grad_norm": 0.3676360845565796, + "learning_rate": 1.9206877279700614e-05, + "loss": 0.0614, + "step": 875 + }, + { + "epoch": 0.15473887814313347, + "grad_norm": 0.3198089003562927, + "learning_rate": 1.9195725354238677e-05, + "loss": 0.0718, + "step": 880 + }, + { + "epoch": 0.15561807631440128, + "grad_norm": 0.2891201674938202, + "learning_rate": 1.918449885657987e-05, + "loss": 0.0586, + "step": 885 + }, + { + "epoch": 0.15649727448566908, + "grad_norm": 0.4054102897644043, + "learning_rate": 1.9173197877764824e-05, + "loss": 0.0523, + "step": 890 + }, + { + "epoch": 0.15737647265693688, + "grad_norm": 0.1266939640045166, + "learning_rate": 1.916182250943816e-05, + "loss": 0.0546, + "step": 895 + }, + { + "epoch": 0.1582556708282047, + "grad_norm": 0.7244488000869751, + "learning_rate": 1.915037284384777e-05, + "loss": 0.0634, + "step": 900 + }, + { + "epoch": 0.1591348689994725, + "grad_norm": 0.8754041790962219, + "learning_rate": 1.913884897384404e-05, + "loss": 0.0712, + "step": 905 + }, + { + "epoch": 0.1600140671707403, + "grad_norm": 0.7527337670326233, + "learning_rate": 1.9127250992879128e-05, + "loss": 0.0685, + "step": 910 + }, + { + "epoch": 0.1608932653420081, + "grad_norm": 0.8655832409858704, + "learning_rate": 1.9115578995006175e-05, + "loss": 0.0709, + "step": 915 + }, + { + "epoch": 0.1617724635132759, + "grad_norm": 0.5657609105110168, + "learning_rate": 1.9103833074878565e-05, + "loss": 0.0606, + "step": 920 + }, + { + "epoch": 0.1626516616845437, + "grad_norm": 0.5217536091804504, + "learning_rate": 1.909201332774916e-05, + "loss": 0.0577, + "step": 925 + }, + { + "epoch": 0.1635308598558115, + "grad_norm": 0.5291991233825684, + "learning_rate": 1.908011984946949e-05, + "loss": 0.0574, + "step": 930 + }, + { + "epoch": 0.1644100580270793, + "grad_norm": 0.16585160791873932, + "learning_rate": 1.9068152736489036e-05, + "loss": 0.0588, + "step": 935 + }, + { + "epoch": 0.1652892561983471, + "grad_norm": 0.5434625744819641, + "learning_rate": 1.9056112085854397e-05, + "loss": 0.0645, + "step": 940 + }, + { + "epoch": 0.16616845436961492, + "grad_norm": 0.685371458530426, + "learning_rate": 1.9043997995208525e-05, + "loss": 0.0452, + "step": 945 + }, + { + "epoch": 0.16704765254088272, + "grad_norm": 0.3393997251987457, + "learning_rate": 1.9031810562789927e-05, + "loss": 0.0569, + "step": 950 + }, + { + "epoch": 0.16792685071215052, + "grad_norm": 0.281892329454422, + "learning_rate": 1.901954988743188e-05, + "loss": 0.0686, + "step": 955 + }, + { + "epoch": 0.16880604888341832, + "grad_norm": 0.5703971982002258, + "learning_rate": 1.9007216068561605e-05, + "loss": 0.0667, + "step": 960 + }, + { + "epoch": 0.16968524705468613, + "grad_norm": 0.6741696000099182, + "learning_rate": 1.899480920619949e-05, + "loss": 0.0551, + "step": 965 + }, + { + "epoch": 0.17056444522595393, + "grad_norm": 1.3032324314117432, + "learning_rate": 1.8982329400958254e-05, + "loss": 0.066, + "step": 970 + }, + { + "epoch": 0.17144364339722173, + "grad_norm": 0.8134323954582214, + "learning_rate": 1.8969776754042157e-05, + "loss": 0.0704, + "step": 975 + }, + { + "epoch": 0.17232284156848954, + "grad_norm": 0.6495192646980286, + "learning_rate": 1.895715136724615e-05, + "loss": 0.0687, + "step": 980 + }, + { + "epoch": 0.17320203973975734, + "grad_norm": 0.49367162585258484, + "learning_rate": 1.8944453342955064e-05, + "loss": 0.0555, + "step": 985 + }, + { + "epoch": 0.17408123791102514, + "grad_norm": 0.2508549392223358, + "learning_rate": 1.8931682784142792e-05, + "loss": 0.0694, + "step": 990 + }, + { + "epoch": 0.17496043608229295, + "grad_norm": 0.3268815875053406, + "learning_rate": 1.891883979437143e-05, + "loss": 0.058, + "step": 995 + }, + { + "epoch": 0.17583963425356075, + "grad_norm": 0.6226515173912048, + "learning_rate": 1.8905924477790452e-05, + "loss": 0.0661, + "step": 1000 + }, + { + "epoch": 0.17671883242482855, + "grad_norm": 0.3348465859889984, + "learning_rate": 1.8892936939135863e-05, + "loss": 0.0651, + "step": 1005 + }, + { + "epoch": 0.17759803059609636, + "grad_norm": 0.7326810956001282, + "learning_rate": 1.887987728372935e-05, + "loss": 0.0695, + "step": 1010 + }, + { + "epoch": 0.17847722876736416, + "grad_norm": 0.6014009714126587, + "learning_rate": 1.8866745617477423e-05, + "loss": 0.063, + "step": 1015 + }, + { + "epoch": 0.17935642693863196, + "grad_norm": 1.2527378797531128, + "learning_rate": 1.8853542046870558e-05, + "loss": 0.0631, + "step": 1020 + }, + { + "epoch": 0.18023562510989977, + "grad_norm": 0.3176214396953583, + "learning_rate": 1.8840266678982343e-05, + "loss": 0.0562, + "step": 1025 + }, + { + "epoch": 0.18111482328116757, + "grad_norm": 0.26997071504592896, + "learning_rate": 1.8826919621468595e-05, + "loss": 0.0618, + "step": 1030 + }, + { + "epoch": 0.18199402145243537, + "grad_norm": 0.2553798258304596, + "learning_rate": 1.8813500982566498e-05, + "loss": 0.0622, + "step": 1035 + }, + { + "epoch": 0.18287321962370318, + "grad_norm": 0.9949320554733276, + "learning_rate": 1.8800010871093718e-05, + "loss": 0.0748, + "step": 1040 + }, + { + "epoch": 0.18375241779497098, + "grad_norm": 0.5384786128997803, + "learning_rate": 1.8786449396447528e-05, + "loss": 0.0757, + "step": 1045 + }, + { + "epoch": 0.18463161596623878, + "grad_norm": 0.14809344708919525, + "learning_rate": 1.8772816668603907e-05, + "loss": 0.0675, + "step": 1050 + }, + { + "epoch": 0.18551081413750659, + "grad_norm": 0.764203667640686, + "learning_rate": 1.8759112798116673e-05, + "loss": 0.0615, + "step": 1055 + }, + { + "epoch": 0.1863900123087744, + "grad_norm": 0.18247248232364655, + "learning_rate": 1.874533789611655e-05, + "loss": 0.061, + "step": 1060 + }, + { + "epoch": 0.1872692104800422, + "grad_norm": 0.9988198280334473, + "learning_rate": 1.873149207431031e-05, + "loss": 0.0591, + "step": 1065 + }, + { + "epoch": 0.18814840865131, + "grad_norm": 0.07756359130144119, + "learning_rate": 1.871757544497983e-05, + "loss": 0.0641, + "step": 1070 + }, + { + "epoch": 0.1890276068225778, + "grad_norm": 0.7131006121635437, + "learning_rate": 1.870358812098121e-05, + "loss": 0.0581, + "step": 1075 + }, + { + "epoch": 0.1899068049938456, + "grad_norm": 0.3485928177833557, + "learning_rate": 1.868953021574382e-05, + "loss": 0.0645, + "step": 1080 + }, + { + "epoch": 0.1907860031651134, + "grad_norm": 0.16775915026664734, + "learning_rate": 1.8675401843269438e-05, + "loss": 0.0644, + "step": 1085 + }, + { + "epoch": 0.1916652013363812, + "grad_norm": 0.3290361762046814, + "learning_rate": 1.866120311813126e-05, + "loss": 0.0619, + "step": 1090 + }, + { + "epoch": 0.192544399507649, + "grad_norm": 1.0206267833709717, + "learning_rate": 1.8646934155473025e-05, + "loss": 0.0854, + "step": 1095 + }, + { + "epoch": 0.19342359767891681, + "grad_norm": 0.6392635703086853, + "learning_rate": 1.8632595071008044e-05, + "loss": 0.0647, + "step": 1100 + }, + { + "epoch": 0.19430279585018465, + "grad_norm": 0.4575440287590027, + "learning_rate": 1.8618185981018292e-05, + "loss": 0.065, + "step": 1105 + }, + { + "epoch": 0.19518199402145245, + "grad_norm": 0.5402662754058838, + "learning_rate": 1.8603707002353436e-05, + "loss": 0.053, + "step": 1110 + }, + { + "epoch": 0.19606119219272025, + "grad_norm": 0.16452832520008087, + "learning_rate": 1.858915825242991e-05, + "loss": 0.0577, + "step": 1115 + }, + { + "epoch": 0.19694039036398805, + "grad_norm": 0.7707622647285461, + "learning_rate": 1.857453984922995e-05, + "loss": 0.0572, + "step": 1120 + }, + { + "epoch": 0.19781958853525586, + "grad_norm": 0.2900203466415405, + "learning_rate": 1.8559851911300638e-05, + "loss": 0.0534, + "step": 1125 + }, + { + "epoch": 0.19869878670652366, + "grad_norm": 0.2933928370475769, + "learning_rate": 1.854509455775295e-05, + "loss": 0.0534, + "step": 1130 + }, + { + "epoch": 0.19957798487779146, + "grad_norm": 1.2258199453353882, + "learning_rate": 1.8530267908260782e-05, + "loss": 0.0645, + "step": 1135 + }, + { + "epoch": 0.20045718304905927, + "grad_norm": 0.3073072135448456, + "learning_rate": 1.8515372083059982e-05, + "loss": 0.0672, + "step": 1140 + }, + { + "epoch": 0.20133638122032707, + "grad_norm": 0.23768655955791473, + "learning_rate": 1.850040720294737e-05, + "loss": 0.0573, + "step": 1145 + }, + { + "epoch": 0.20221557939159487, + "grad_norm": 0.6997068524360657, + "learning_rate": 1.8485373389279768e-05, + "loss": 0.0564, + "step": 1150 + }, + { + "epoch": 0.20309477756286268, + "grad_norm": 0.4729757308959961, + "learning_rate": 1.8470270763973004e-05, + "loss": 0.0588, + "step": 1155 + }, + { + "epoch": 0.20397397573413048, + "grad_norm": 0.19242537021636963, + "learning_rate": 1.845509944950094e-05, + "loss": 0.0532, + "step": 1160 + }, + { + "epoch": 0.20485317390539828, + "grad_norm": 0.1492680460214615, + "learning_rate": 1.8439859568894464e-05, + "loss": 0.0658, + "step": 1165 + }, + { + "epoch": 0.2057323720766661, + "grad_norm": 0.6383575201034546, + "learning_rate": 1.8424551245740493e-05, + "loss": 0.0563, + "step": 1170 + }, + { + "epoch": 0.2066115702479339, + "grad_norm": 0.9722626805305481, + "learning_rate": 1.8409174604180977e-05, + "loss": 0.0603, + "step": 1175 + }, + { + "epoch": 0.2074907684192017, + "grad_norm": 0.5511413812637329, + "learning_rate": 1.8393729768911894e-05, + "loss": 0.0534, + "step": 1180 + }, + { + "epoch": 0.2083699665904695, + "grad_norm": 0.3645865321159363, + "learning_rate": 1.837821686518223e-05, + "loss": 0.0601, + "step": 1185 + }, + { + "epoch": 0.2092491647617373, + "grad_norm": 0.43972018361091614, + "learning_rate": 1.8362636018792975e-05, + "loss": 0.049, + "step": 1190 + }, + { + "epoch": 0.2101283629330051, + "grad_norm": 0.34283024072647095, + "learning_rate": 1.8346987356096087e-05, + "loss": 0.0596, + "step": 1195 + }, + { + "epoch": 0.2110075611042729, + "grad_norm": 0.3272128701210022, + "learning_rate": 1.833127100399348e-05, + "loss": 0.0604, + "step": 1200 + }, + { + "epoch": 0.2118867592755407, + "grad_norm": 0.48746854066848755, + "learning_rate": 1.8315487089935995e-05, + "loss": 0.0505, + "step": 1205 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 0.21915239095687866, + "learning_rate": 1.8299635741922365e-05, + "loss": 0.0574, + "step": 1210 + }, + { + "epoch": 0.21364515561807632, + "grad_norm": 0.3507218360900879, + "learning_rate": 1.8283717088498157e-05, + "loss": 0.0651, + "step": 1215 + }, + { + "epoch": 0.21452435378934412, + "grad_norm": 0.712352454662323, + "learning_rate": 1.8267731258754765e-05, + "loss": 0.0564, + "step": 1220 + }, + { + "epoch": 0.21540355196061192, + "grad_norm": 0.3927139937877655, + "learning_rate": 1.8251678382328345e-05, + "loss": 0.0474, + "step": 1225 + }, + { + "epoch": 0.21628275013187973, + "grad_norm": 1.3225253820419312, + "learning_rate": 1.8235558589398756e-05, + "loss": 0.0826, + "step": 1230 + }, + { + "epoch": 0.21716194830314753, + "grad_norm": 1.0917742252349854, + "learning_rate": 1.8219372010688516e-05, + "loss": 0.0614, + "step": 1235 + }, + { + "epoch": 0.21804114647441533, + "grad_norm": 1.2095632553100586, + "learning_rate": 1.8203118777461735e-05, + "loss": 0.0569, + "step": 1240 + }, + { + "epoch": 0.21892034464568313, + "grad_norm": 0.1377822309732437, + "learning_rate": 1.8186799021523064e-05, + "loss": 0.063, + "step": 1245 + }, + { + "epoch": 0.21979954281695094, + "grad_norm": 1.2656950950622559, + "learning_rate": 1.81704128752166e-05, + "loss": 0.0664, + "step": 1250 + }, + { + "epoch": 0.22067874098821874, + "grad_norm": 0.21433009207248688, + "learning_rate": 1.815396047142485e-05, + "loss": 0.0636, + "step": 1255 + }, + { + "epoch": 0.22155793915948654, + "grad_norm": 0.34033942222595215, + "learning_rate": 1.8137441943567607e-05, + "loss": 0.0535, + "step": 1260 + }, + { + "epoch": 0.22243713733075435, + "grad_norm": 0.3627121150493622, + "learning_rate": 1.8120857425600914e-05, + "loss": 0.0596, + "step": 1265 + }, + { + "epoch": 0.22331633550202215, + "grad_norm": 0.6685848832130432, + "learning_rate": 1.8104207052015952e-05, + "loss": 0.0696, + "step": 1270 + }, + { + "epoch": 0.22419553367328995, + "grad_norm": 0.2353779375553131, + "learning_rate": 1.8087490957837947e-05, + "loss": 0.0536, + "step": 1275 + }, + { + "epoch": 0.22507473184455776, + "grad_norm": 0.403475821018219, + "learning_rate": 1.807070927862509e-05, + "loss": 0.0636, + "step": 1280 + }, + { + "epoch": 0.22595393001582556, + "grad_norm": 0.6222421526908875, + "learning_rate": 1.8053862150467417e-05, + "loss": 0.0558, + "step": 1285 + }, + { + "epoch": 0.22683312818709336, + "grad_norm": 0.3176823854446411, + "learning_rate": 1.803694970998574e-05, + "loss": 0.0566, + "step": 1290 + }, + { + "epoch": 0.22771232635836117, + "grad_norm": 0.7340067625045776, + "learning_rate": 1.8019972094330502e-05, + "loss": 0.0487, + "step": 1295 + }, + { + "epoch": 0.22859152452962897, + "grad_norm": 1.254428505897522, + "learning_rate": 1.8002929441180684e-05, + "loss": 0.0511, + "step": 1300 + }, + { + "epoch": 0.22947072270089677, + "grad_norm": 0.20499931275844574, + "learning_rate": 1.7985821888742687e-05, + "loss": 0.0648, + "step": 1305 + }, + { + "epoch": 0.23034992087216458, + "grad_norm": 0.4078265428543091, + "learning_rate": 1.7968649575749202e-05, + "loss": 0.047, + "step": 1310 + }, + { + "epoch": 0.23122911904343238, + "grad_norm": 0.15697845816612244, + "learning_rate": 1.79514126414581e-05, + "loss": 0.0488, + "step": 1315 + }, + { + "epoch": 0.23210831721470018, + "grad_norm": 0.6871252059936523, + "learning_rate": 1.7934111225651293e-05, + "loss": 0.0585, + "step": 1320 + }, + { + "epoch": 0.23298751538596799, + "grad_norm": 0.13073213398456573, + "learning_rate": 1.7916745468633593e-05, + "loss": 0.0576, + "step": 1325 + }, + { + "epoch": 0.2338667135572358, + "grad_norm": 0.150588259100914, + "learning_rate": 1.7899315511231598e-05, + "loss": 0.0572, + "step": 1330 + }, + { + "epoch": 0.2347459117285036, + "grad_norm": 0.40196287631988525, + "learning_rate": 1.7881821494792527e-05, + "loss": 0.0573, + "step": 1335 + }, + { + "epoch": 0.2356251098997714, + "grad_norm": 0.7040359973907471, + "learning_rate": 1.7864263561183085e-05, + "loss": 0.0653, + "step": 1340 + }, + { + "epoch": 0.2365043080710392, + "grad_norm": 1.1014829874038696, + "learning_rate": 1.78466418527883e-05, + "loss": 0.0509, + "step": 1345 + }, + { + "epoch": 0.23738350624230703, + "grad_norm": 0.2666812539100647, + "learning_rate": 1.782895651251039e-05, + "loss": 0.0585, + "step": 1350 + }, + { + "epoch": 0.23826270441357483, + "grad_norm": 0.9666887521743774, + "learning_rate": 1.781120768376759e-05, + "loss": 0.0588, + "step": 1355 + }, + { + "epoch": 0.23914190258484264, + "grad_norm": 0.7215674519538879, + "learning_rate": 1.7793395510492986e-05, + "loss": 0.0597, + "step": 1360 + }, + { + "epoch": 0.24002110075611044, + "grad_norm": 0.5669434070587158, + "learning_rate": 1.7775520137133354e-05, + "loss": 0.0516, + "step": 1365 + }, + { + "epoch": 0.24090029892737824, + "grad_norm": 0.38593825697898865, + "learning_rate": 1.775758170864799e-05, + "loss": 0.0485, + "step": 1370 + }, + { + "epoch": 0.24177949709864605, + "grad_norm": 0.5211871266365051, + "learning_rate": 1.7739580370507533e-05, + "loss": 0.0619, + "step": 1375 + }, + { + "epoch": 0.24265869526991385, + "grad_norm": 0.3265356719493866, + "learning_rate": 1.7721516268692776e-05, + "loss": 0.0593, + "step": 1380 + }, + { + "epoch": 0.24353789344118165, + "grad_norm": 0.5374659895896912, + "learning_rate": 1.77033895496935e-05, + "loss": 0.0666, + "step": 1385 + }, + { + "epoch": 0.24441709161244946, + "grad_norm": 0.26406246423721313, + "learning_rate": 1.768520036050727e-05, + "loss": 0.0493, + "step": 1390 + }, + { + "epoch": 0.24529628978371726, + "grad_norm": 0.38316601514816284, + "learning_rate": 1.7666948848638257e-05, + "loss": 0.0503, + "step": 1395 + }, + { + "epoch": 0.24617548795498506, + "grad_norm": 0.3633623719215393, + "learning_rate": 1.7648635162096022e-05, + "loss": 0.0569, + "step": 1400 + }, + { + "epoch": 0.24705468612625286, + "grad_norm": 0.941490650177002, + "learning_rate": 1.763025944939434e-05, + "loss": 0.058, + "step": 1405 + }, + { + "epoch": 0.24793388429752067, + "grad_norm": 0.21693024039268494, + "learning_rate": 1.7611821859549977e-05, + "loss": 0.0539, + "step": 1410 + }, + { + "epoch": 0.24881308246878847, + "grad_norm": 0.7575194835662842, + "learning_rate": 1.7593322542081486e-05, + "loss": 0.0705, + "step": 1415 + }, + { + "epoch": 0.24969228064005627, + "grad_norm": 0.3184313178062439, + "learning_rate": 1.7574761647008004e-05, + "loss": 0.0655, + "step": 1420 + }, + { + "epoch": 0.2505714788113241, + "grad_norm": 0.9622363448143005, + "learning_rate": 1.7556139324848024e-05, + "loss": 0.0653, + "step": 1425 + }, + { + "epoch": 0.2514506769825919, + "grad_norm": 0.3079875111579895, + "learning_rate": 1.753745572661817e-05, + "loss": 0.0497, + "step": 1430 + }, + { + "epoch": 0.2523298751538597, + "grad_norm": 0.1410188525915146, + "learning_rate": 1.7518711003832003e-05, + "loss": 0.0715, + "step": 1435 + }, + { + "epoch": 0.2532090733251275, + "grad_norm": 0.5498221516609192, + "learning_rate": 1.749990530849875e-05, + "loss": 0.0705, + "step": 1440 + }, + { + "epoch": 0.2540882714963953, + "grad_norm": 0.27683818340301514, + "learning_rate": 1.748103879312209e-05, + "loss": 0.06, + "step": 1445 + }, + { + "epoch": 0.2549674696676631, + "grad_norm": 1.3148400783538818, + "learning_rate": 1.7462111610698934e-05, + "loss": 0.0629, + "step": 1450 + }, + { + "epoch": 0.2558466678389309, + "grad_norm": 0.4923277199268341, + "learning_rate": 1.744312391471816e-05, + "loss": 0.0573, + "step": 1455 + }, + { + "epoch": 0.2567258660101987, + "grad_norm": 0.8244169354438782, + "learning_rate": 1.7424075859159376e-05, + "loss": 0.0561, + "step": 1460 + }, + { + "epoch": 0.2576050641814665, + "grad_norm": 0.2395920604467392, + "learning_rate": 1.7404967598491674e-05, + "loss": 0.0643, + "step": 1465 + }, + { + "epoch": 0.2584842623527343, + "grad_norm": 0.3752864897251129, + "learning_rate": 1.7385799287672375e-05, + "loss": 0.0634, + "step": 1470 + }, + { + "epoch": 0.2593634605240021, + "grad_norm": 1.0273178815841675, + "learning_rate": 1.736657108214578e-05, + "loss": 0.0613, + "step": 1475 + }, + { + "epoch": 0.2602426586952699, + "grad_norm": 0.9190396666526794, + "learning_rate": 1.734728313784189e-05, + "loss": 0.0623, + "step": 1480 + }, + { + "epoch": 0.2611218568665377, + "grad_norm": 0.9993478655815125, + "learning_rate": 1.732793561117517e-05, + "loss": 0.0421, + "step": 1485 + }, + { + "epoch": 0.2620010550378055, + "grad_norm": 0.4666178226470947, + "learning_rate": 1.7308528659043243e-05, + "loss": 0.0531, + "step": 1490 + }, + { + "epoch": 0.2628802532090733, + "grad_norm": 0.24554145336151123, + "learning_rate": 1.7289062438825665e-05, + "loss": 0.0514, + "step": 1495 + }, + { + "epoch": 0.2637594513803411, + "grad_norm": 0.29805853962898254, + "learning_rate": 1.7269537108382605e-05, + "loss": 0.0526, + "step": 1500 + }, + { + "epoch": 0.26463864955160893, + "grad_norm": 0.9100229144096375, + "learning_rate": 1.7249952826053582e-05, + "loss": 0.0653, + "step": 1505 + }, + { + "epoch": 0.26551784772287673, + "grad_norm": 0.2765738368034363, + "learning_rate": 1.72303097506562e-05, + "loss": 0.0694, + "step": 1510 + }, + { + "epoch": 0.26639704589414454, + "grad_norm": 0.12102984637022018, + "learning_rate": 1.721060804148482e-05, + "loss": 0.0619, + "step": 1515 + }, + { + "epoch": 0.26727624406541234, + "grad_norm": 0.2673247456550598, + "learning_rate": 1.7190847858309304e-05, + "loss": 0.0536, + "step": 1520 + }, + { + "epoch": 0.26815544223668014, + "grad_norm": 0.6815070509910583, + "learning_rate": 1.71710293613737e-05, + "loss": 0.0521, + "step": 1525 + }, + { + "epoch": 0.26903464040794794, + "grad_norm": 0.8095347881317139, + "learning_rate": 1.7151152711394954e-05, + "loss": 0.0628, + "step": 1530 + }, + { + "epoch": 0.26991383857921575, + "grad_norm": 0.7218222618103027, + "learning_rate": 1.7131218069561594e-05, + "loss": 0.0405, + "step": 1535 + }, + { + "epoch": 0.27079303675048355, + "grad_norm": 0.6927086710929871, + "learning_rate": 1.7111225597532428e-05, + "loss": 0.0647, + "step": 1540 + }, + { + "epoch": 0.27167223492175135, + "grad_norm": 0.8299700617790222, + "learning_rate": 1.7091175457435242e-05, + "loss": 0.0648, + "step": 1545 + }, + { + "epoch": 0.27255143309301916, + "grad_norm": 0.16689668595790863, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.0481, + "step": 1550 + }, + { + "epoch": 0.27343063126428696, + "grad_norm": 0.9474055767059326, + "learning_rate": 1.7050902823884904e-05, + "loss": 0.056, + "step": 1555 + }, + { + "epoch": 0.27430982943555476, + "grad_norm": 0.2540503740310669, + "learning_rate": 1.7030680657020314e-05, + "loss": 0.0642, + "step": 1560 + }, + { + "epoch": 0.27518902760682257, + "grad_norm": 0.24232099950313568, + "learning_rate": 1.701040147526219e-05, + "loss": 0.0531, + "step": 1565 + }, + { + "epoch": 0.27606822577809037, + "grad_norm": 1.1714109182357788, + "learning_rate": 1.6990065443063364e-05, + "loss": 0.0471, + "step": 1570 + }, + { + "epoch": 0.2769474239493582, + "grad_norm": 0.2802835702896118, + "learning_rate": 1.6969672725337706e-05, + "loss": 0.0678, + "step": 1575 + }, + { + "epoch": 0.277826622120626, + "grad_norm": 0.6939117312431335, + "learning_rate": 1.6949223487458764e-05, + "loss": 0.0576, + "step": 1580 + }, + { + "epoch": 0.2787058202918938, + "grad_norm": 0.558993935585022, + "learning_rate": 1.692871789525844e-05, + "loss": 0.0567, + "step": 1585 + }, + { + "epoch": 0.2795850184631616, + "grad_norm": 0.6370697617530823, + "learning_rate": 1.6908156115025626e-05, + "loss": 0.0578, + "step": 1590 + }, + { + "epoch": 0.2804642166344294, + "grad_norm": 0.22744275629520416, + "learning_rate": 1.6887538313504883e-05, + "loss": 0.0594, + "step": 1595 + }, + { + "epoch": 0.2813434148056972, + "grad_norm": 0.9439639449119568, + "learning_rate": 1.686686465789507e-05, + "loss": 0.0473, + "step": 1600 + }, + { + "epoch": 0.282222612976965, + "grad_norm": 0.5774978995323181, + "learning_rate": 1.6846135315847978e-05, + "loss": 0.0457, + "step": 1605 + }, + { + "epoch": 0.2831018111482328, + "grad_norm": 1.2410329580307007, + "learning_rate": 1.6825350455467e-05, + "loss": 0.0602, + "step": 1610 + }, + { + "epoch": 0.2839810093195006, + "grad_norm": 0.5602573752403259, + "learning_rate": 1.6804510245305745e-05, + "loss": 0.0533, + "step": 1615 + }, + { + "epoch": 0.2848602074907684, + "grad_norm": 0.6151893734931946, + "learning_rate": 1.678361485436668e-05, + "loss": 0.0549, + "step": 1620 + }, + { + "epoch": 0.2857394056620362, + "grad_norm": 0.17069809138774872, + "learning_rate": 1.676266445209975e-05, + "loss": 0.0548, + "step": 1625 + }, + { + "epoch": 0.286618603833304, + "grad_norm": 0.2259790599346161, + "learning_rate": 1.674165920840102e-05, + "loss": 0.056, + "step": 1630 + }, + { + "epoch": 0.2874978020045718, + "grad_norm": 0.17054542899131775, + "learning_rate": 1.6720599293611287e-05, + "loss": 0.0637, + "step": 1635 + }, + { + "epoch": 0.2883770001758396, + "grad_norm": 0.38651248812675476, + "learning_rate": 1.6699484878514693e-05, + "loss": 0.0517, + "step": 1640 + }, + { + "epoch": 0.2892561983471074, + "grad_norm": 0.3397147059440613, + "learning_rate": 1.6678316134337362e-05, + "loss": 0.0545, + "step": 1645 + }, + { + "epoch": 0.2901353965183752, + "grad_norm": 0.28409913182258606, + "learning_rate": 1.6657093232745973e-05, + "loss": 0.0562, + "step": 1650 + }, + { + "epoch": 0.291014594689643, + "grad_norm": 0.8544853925704956, + "learning_rate": 1.6635816345846413e-05, + "loss": 0.0641, + "step": 1655 + }, + { + "epoch": 0.29189379286091083, + "grad_norm": 0.35938528180122375, + "learning_rate": 1.661448564618235e-05, + "loss": 0.0589, + "step": 1660 + }, + { + "epoch": 0.29277299103217863, + "grad_norm": 0.3898780345916748, + "learning_rate": 1.6593101306733847e-05, + "loss": 0.059, + "step": 1665 + }, + { + "epoch": 0.29365218920344643, + "grad_norm": 0.34503045678138733, + "learning_rate": 1.6571663500915957e-05, + "loss": 0.0702, + "step": 1670 + }, + { + "epoch": 0.29453138737471424, + "grad_norm": 0.06647461652755737, + "learning_rate": 1.6550172402577304e-05, + "loss": 0.0618, + "step": 1675 + }, + { + "epoch": 0.29541058554598204, + "grad_norm": 0.3430112600326538, + "learning_rate": 1.6528628185998697e-05, + "loss": 0.0587, + "step": 1680 + }, + { + "epoch": 0.29628978371724984, + "grad_norm": 0.773381769657135, + "learning_rate": 1.65070310258917e-05, + "loss": 0.0608, + "step": 1685 + }, + { + "epoch": 0.29716898188851765, + "grad_norm": 0.6421689391136169, + "learning_rate": 1.6485381097397223e-05, + "loss": 0.0558, + "step": 1690 + }, + { + "epoch": 0.29804818005978545, + "grad_norm": 0.6023097634315491, + "learning_rate": 1.646367857608409e-05, + "loss": 0.0597, + "step": 1695 + }, + { + "epoch": 0.29892737823105325, + "grad_norm": 0.47872084379196167, + "learning_rate": 1.6441923637947627e-05, + "loss": 0.0647, + "step": 1700 + }, + { + "epoch": 0.29980657640232106, + "grad_norm": 1.3520686626434326, + "learning_rate": 1.6420116459408237e-05, + "loss": 0.0621, + "step": 1705 + }, + { + "epoch": 0.30068577457358886, + "grad_norm": 0.354427307844162, + "learning_rate": 1.6398257217309956e-05, + "loss": 0.0591, + "step": 1710 + }, + { + "epoch": 0.30156497274485666, + "grad_norm": 0.4515109658241272, + "learning_rate": 1.6376346088919032e-05, + "loss": 0.0444, + "step": 1715 + }, + { + "epoch": 0.3024441709161245, + "grad_norm": 0.8840310573577881, + "learning_rate": 1.6354383251922473e-05, + "loss": 0.069, + "step": 1720 + }, + { + "epoch": 0.3033233690873923, + "grad_norm": 0.13943177461624146, + "learning_rate": 1.633236888442663e-05, + "loss": 0.0621, + "step": 1725 + }, + { + "epoch": 0.30420256725866013, + "grad_norm": 0.2890174984931946, + "learning_rate": 1.631030316495572e-05, + "loss": 0.0676, + "step": 1730 + }, + { + "epoch": 0.30508176542992793, + "grad_norm": 0.18880678713321686, + "learning_rate": 1.6288186272450407e-05, + "loss": 0.0636, + "step": 1735 + }, + { + "epoch": 0.30596096360119573, + "grad_norm": 0.5174923539161682, + "learning_rate": 1.626601838626634e-05, + "loss": 0.0638, + "step": 1740 + }, + { + "epoch": 0.30684016177246354, + "grad_norm": 0.47316744923591614, + "learning_rate": 1.624379968617269e-05, + "loss": 0.0571, + "step": 1745 + }, + { + "epoch": 0.30771935994373134, + "grad_norm": 0.4440658986568451, + "learning_rate": 1.6221530352350713e-05, + "loss": 0.0551, + "step": 1750 + }, + { + "epoch": 0.30859855811499914, + "grad_norm": 0.5249147415161133, + "learning_rate": 1.619921056539226e-05, + "loss": 0.0559, + "step": 1755 + }, + { + "epoch": 0.30947775628626695, + "grad_norm": 0.6782397627830505, + "learning_rate": 1.6176840506298345e-05, + "loss": 0.0695, + "step": 1760 + }, + { + "epoch": 0.31035695445753475, + "grad_norm": 0.6498162150382996, + "learning_rate": 1.615442035647765e-05, + "loss": 0.0547, + "step": 1765 + }, + { + "epoch": 0.31123615262880255, + "grad_norm": 0.458238422870636, + "learning_rate": 1.6131950297745075e-05, + "loss": 0.0608, + "step": 1770 + }, + { + "epoch": 0.31211535080007036, + "grad_norm": 0.15339304506778717, + "learning_rate": 1.6109430512320235e-05, + "loss": 0.0583, + "step": 1775 + }, + { + "epoch": 0.31299454897133816, + "grad_norm": 0.3111644387245178, + "learning_rate": 1.6086861182826024e-05, + "loss": 0.051, + "step": 1780 + }, + { + "epoch": 0.31387374714260596, + "grad_norm": 1.0354893207550049, + "learning_rate": 1.6064242492287095e-05, + "loss": 0.065, + "step": 1785 + }, + { + "epoch": 0.31475294531387377, + "grad_norm": 0.2242657095193863, + "learning_rate": 1.6041574624128392e-05, + "loss": 0.0473, + "step": 1790 + }, + { + "epoch": 0.31563214348514157, + "grad_norm": 0.2808036208152771, + "learning_rate": 1.6018857762173672e-05, + "loss": 0.0537, + "step": 1795 + }, + { + "epoch": 0.3165113416564094, + "grad_norm": 0.3030805289745331, + "learning_rate": 1.5996092090643993e-05, + "loss": 0.0529, + "step": 1800 + }, + { + "epoch": 0.3173905398276772, + "grad_norm": 1.5149699449539185, + "learning_rate": 1.597327779415624e-05, + "loss": 0.0541, + "step": 1805 + }, + { + "epoch": 0.318269737998945, + "grad_norm": 1.2089002132415771, + "learning_rate": 1.595041505772162e-05, + "loss": 0.0748, + "step": 1810 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 0.28580719232559204, + "learning_rate": 1.5927504066744147e-05, + "loss": 0.0569, + "step": 1815 + }, + { + "epoch": 0.3200281343414806, + "grad_norm": 0.7665076851844788, + "learning_rate": 1.590454500701917e-05, + "loss": 0.0648, + "step": 1820 + }, + { + "epoch": 0.3209073325127484, + "grad_norm": 0.771531343460083, + "learning_rate": 1.5881538064731838e-05, + "loss": 0.0624, + "step": 1825 + }, + { + "epoch": 0.3217865306840162, + "grad_norm": 0.11554717272520065, + "learning_rate": 1.58584834264556e-05, + "loss": 0.0474, + "step": 1830 + }, + { + "epoch": 0.322665728855284, + "grad_norm": 0.13242988288402557, + "learning_rate": 1.5835381279150705e-05, + "loss": 0.055, + "step": 1835 + }, + { + "epoch": 0.3235449270265518, + "grad_norm": 1.2687695026397705, + "learning_rate": 1.5812231810162656e-05, + "loss": 0.0595, + "step": 1840 + }, + { + "epoch": 0.3244241251978196, + "grad_norm": 0.23642951250076294, + "learning_rate": 1.5789035207220725e-05, + "loss": 0.0433, + "step": 1845 + }, + { + "epoch": 0.3253033233690874, + "grad_norm": 0.6303196549415588, + "learning_rate": 1.5765791658436406e-05, + "loss": 0.0495, + "step": 1850 + }, + { + "epoch": 0.3261825215403552, + "grad_norm": 0.23932726681232452, + "learning_rate": 1.5742501352301894e-05, + "loss": 0.0558, + "step": 1855 + }, + { + "epoch": 0.327061719711623, + "grad_norm": 0.4368959069252014, + "learning_rate": 1.5719164477688566e-05, + "loss": 0.0666, + "step": 1860 + }, + { + "epoch": 0.3279409178828908, + "grad_norm": 0.2366788238286972, + "learning_rate": 1.5695781223845442e-05, + "loss": 0.0716, + "step": 1865 + }, + { + "epoch": 0.3288201160541586, + "grad_norm": 0.643233060836792, + "learning_rate": 1.5672351780397653e-05, + "loss": 0.0524, + "step": 1870 + }, + { + "epoch": 0.3296993142254264, + "grad_norm": 0.523089587688446, + "learning_rate": 1.5648876337344898e-05, + "loss": 0.0615, + "step": 1875 + }, + { + "epoch": 0.3305785123966942, + "grad_norm": 0.18103045225143433, + "learning_rate": 1.5625355085059907e-05, + "loss": 0.0622, + "step": 1880 + }, + { + "epoch": 0.331457710567962, + "grad_norm": 0.35785582661628723, + "learning_rate": 1.5601788214286905e-05, + "loss": 0.0578, + "step": 1885 + }, + { + "epoch": 0.33233690873922983, + "grad_norm": 0.586683988571167, + "learning_rate": 1.557817591614005e-05, + "loss": 0.059, + "step": 1890 + }, + { + "epoch": 0.33321610691049763, + "grad_norm": 0.43284872174263, + "learning_rate": 1.555451838210189e-05, + "loss": 0.0553, + "step": 1895 + }, + { + "epoch": 0.33409530508176544, + "grad_norm": 0.44119471311569214, + "learning_rate": 1.553081580402182e-05, + "loss": 0.0563, + "step": 1900 + }, + { + "epoch": 0.33497450325303324, + "grad_norm": 0.4126788377761841, + "learning_rate": 1.55070683741145e-05, + "loss": 0.0564, + "step": 1905 + }, + { + "epoch": 0.33585370142430104, + "grad_norm": 0.581628680229187, + "learning_rate": 1.548327628495833e-05, + "loss": 0.0528, + "step": 1910 + }, + { + "epoch": 0.33673289959556885, + "grad_norm": 0.49338245391845703, + "learning_rate": 1.5459439729493864e-05, + "loss": 0.046, + "step": 1915 + }, + { + "epoch": 0.33761209776683665, + "grad_norm": 0.43671730160713196, + "learning_rate": 1.543555890102226e-05, + "loss": 0.062, + "step": 1920 + }, + { + "epoch": 0.33849129593810445, + "grad_norm": 0.6600947976112366, + "learning_rate": 1.5411633993203695e-05, + "loss": 0.0616, + "step": 1925 + }, + { + "epoch": 0.33937049410937226, + "grad_norm": 0.6367527842521667, + "learning_rate": 1.538766520005581e-05, + "loss": 0.0621, + "step": 1930 + }, + { + "epoch": 0.34024969228064006, + "grad_norm": 0.39215588569641113, + "learning_rate": 1.536365271595212e-05, + "loss": 0.0659, + "step": 1935 + }, + { + "epoch": 0.34112889045190786, + "grad_norm": 1.8016176223754883, + "learning_rate": 1.5339596735620485e-05, + "loss": 0.0596, + "step": 1940 + }, + { + "epoch": 0.34200808862317567, + "grad_norm": 0.7933741807937622, + "learning_rate": 1.5315497454141446e-05, + "loss": 0.0602, + "step": 1945 + }, + { + "epoch": 0.34288728679444347, + "grad_norm": 0.2943550944328308, + "learning_rate": 1.529135506694673e-05, + "loss": 0.0514, + "step": 1950 + }, + { + "epoch": 0.34376648496571127, + "grad_norm": 0.20394988358020782, + "learning_rate": 1.526716976981761e-05, + "loss": 0.0613, + "step": 1955 + }, + { + "epoch": 0.3446456831369791, + "grad_norm": 0.3414583206176758, + "learning_rate": 1.5242941758883341e-05, + "loss": 0.0446, + "step": 1960 + }, + { + "epoch": 0.3455248813082469, + "grad_norm": 0.6178276538848877, + "learning_rate": 1.5218671230619558e-05, + "loss": 0.0586, + "step": 1965 + }, + { + "epoch": 0.3464040794795147, + "grad_norm": 0.8685587644577026, + "learning_rate": 1.5194358381846686e-05, + "loss": 0.0577, + "step": 1970 + }, + { + "epoch": 0.3472832776507825, + "grad_norm": 0.6029766201972961, + "learning_rate": 1.5170003409728358e-05, + "loss": 0.053, + "step": 1975 + }, + { + "epoch": 0.3481624758220503, + "grad_norm": 0.33781400322914124, + "learning_rate": 1.5145606511769788e-05, + "loss": 0.0625, + "step": 1980 + }, + { + "epoch": 0.3490416739933181, + "grad_norm": 0.541808009147644, + "learning_rate": 1.5121167885816202e-05, + "loss": 0.0505, + "step": 1985 + }, + { + "epoch": 0.3499208721645859, + "grad_norm": 1.0590039491653442, + "learning_rate": 1.50966877300512e-05, + "loss": 0.0698, + "step": 1990 + }, + { + "epoch": 0.3508000703358537, + "grad_norm": 0.4376682937145233, + "learning_rate": 1.5072166242995177e-05, + "loss": 0.066, + "step": 1995 + }, + { + "epoch": 0.3516792685071215, + "grad_norm": 0.44992053508758545, + "learning_rate": 1.5047603623503695e-05, + "loss": 0.074, + "step": 2000 + }, + { + "epoch": 0.3525584666783893, + "grad_norm": 0.5252031683921814, + "learning_rate": 1.5023000070765886e-05, + "loss": 0.0681, + "step": 2005 + }, + { + "epoch": 0.3534376648496571, + "grad_norm": 0.6418195366859436, + "learning_rate": 1.4998355784302816e-05, + "loss": 0.0655, + "step": 2010 + }, + { + "epoch": 0.3543168630209249, + "grad_norm": 1.0612142086029053, + "learning_rate": 1.4973670963965883e-05, + "loss": 0.0681, + "step": 2015 + }, + { + "epoch": 0.3551960611921927, + "grad_norm": 0.288524866104126, + "learning_rate": 1.49489458099352e-05, + "loss": 0.0472, + "step": 2020 + }, + { + "epoch": 0.3560752593634605, + "grad_norm": 0.44207853078842163, + "learning_rate": 1.4924180522717952e-05, + "loss": 0.0629, + "step": 2025 + }, + { + "epoch": 0.3569544575347283, + "grad_norm": 0.501887857913971, + "learning_rate": 1.4899375303146793e-05, + "loss": 0.0467, + "step": 2030 + }, + { + "epoch": 0.3578336557059961, + "grad_norm": 0.3725419044494629, + "learning_rate": 1.4874530352378193e-05, + "loss": 0.0592, + "step": 2035 + }, + { + "epoch": 0.3587128538772639, + "grad_norm": 0.3303128480911255, + "learning_rate": 1.4849645871890832e-05, + "loss": 0.0453, + "step": 2040 + }, + { + "epoch": 0.35959205204853173, + "grad_norm": 0.12348782271146774, + "learning_rate": 1.4824722063483944e-05, + "loss": 0.0434, + "step": 2045 + }, + { + "epoch": 0.36047125021979953, + "grad_norm": 0.381765753030777, + "learning_rate": 1.4799759129275703e-05, + "loss": 0.0497, + "step": 2050 + }, + { + "epoch": 0.36135044839106734, + "grad_norm": 0.24817384779453278, + "learning_rate": 1.477475727170156e-05, + "loss": 0.0495, + "step": 2055 + }, + { + "epoch": 0.36222964656233514, + "grad_norm": 0.3029944896697998, + "learning_rate": 1.4749716693512612e-05, + "loss": 0.0463, + "step": 2060 + }, + { + "epoch": 0.36310884473360294, + "grad_norm": 0.9858243465423584, + "learning_rate": 1.4724637597773969e-05, + "loss": 0.0769, + "step": 2065 + }, + { + "epoch": 0.36398804290487075, + "grad_norm": 0.38801249861717224, + "learning_rate": 1.469952018786309e-05, + "loss": 0.0472, + "step": 2070 + }, + { + "epoch": 0.36486724107613855, + "grad_norm": 0.15492072701454163, + "learning_rate": 1.467436466746814e-05, + "loss": 0.0574, + "step": 2075 + }, + { + "epoch": 0.36574643924740635, + "grad_norm": 0.8615152835845947, + "learning_rate": 1.464917124058634e-05, + "loss": 0.0651, + "step": 2080 + }, + { + "epoch": 0.36662563741867416, + "grad_norm": 1.6309150457382202, + "learning_rate": 1.4623940111522315e-05, + "loss": 0.0559, + "step": 2085 + }, + { + "epoch": 0.36750483558994196, + "grad_norm": 0.1200883612036705, + "learning_rate": 1.4598671484886423e-05, + "loss": 0.049, + "step": 2090 + }, + { + "epoch": 0.36838403376120976, + "grad_norm": 0.3056308627128601, + "learning_rate": 1.4573365565593121e-05, + "loss": 0.0514, + "step": 2095 + }, + { + "epoch": 0.36926323193247756, + "grad_norm": 0.3354267179965973, + "learning_rate": 1.4548022558859281e-05, + "loss": 0.0528, + "step": 2100 + }, + { + "epoch": 0.37014243010374537, + "grad_norm": 0.7310557961463928, + "learning_rate": 1.4522642670202528e-05, + "loss": 0.0676, + "step": 2105 + }, + { + "epoch": 0.37102162827501317, + "grad_norm": 0.6339288353919983, + "learning_rate": 1.4497226105439586e-05, + "loss": 0.0714, + "step": 2110 + }, + { + "epoch": 0.371900826446281, + "grad_norm": 0.2613738179206848, + "learning_rate": 1.44717730706846e-05, + "loss": 0.0586, + "step": 2115 + }, + { + "epoch": 0.3727800246175488, + "grad_norm": 0.4906211793422699, + "learning_rate": 1.4446283772347475e-05, + "loss": 0.0599, + "step": 2120 + }, + { + "epoch": 0.3736592227888166, + "grad_norm": 0.46116968989372253, + "learning_rate": 1.4420758417132177e-05, + "loss": 0.0537, + "step": 2125 + }, + { + "epoch": 0.3745384209600844, + "grad_norm": 0.5634986758232117, + "learning_rate": 1.4395197212035078e-05, + "loss": 0.0552, + "step": 2130 + }, + { + "epoch": 0.3754176191313522, + "grad_norm": 0.14335110783576965, + "learning_rate": 1.4369600364343286e-05, + "loss": 0.0587, + "step": 2135 + }, + { + "epoch": 0.37629681730262, + "grad_norm": 0.17126217484474182, + "learning_rate": 1.434396808163293e-05, + "loss": 0.066, + "step": 2140 + }, + { + "epoch": 0.3771760154738878, + "grad_norm": 0.5141481161117554, + "learning_rate": 1.4318300571767514e-05, + "loss": 0.058, + "step": 2145 + }, + { + "epoch": 0.3780552136451556, + "grad_norm": 0.19028577208518982, + "learning_rate": 1.4292598042896204e-05, + "loss": 0.0667, + "step": 2150 + }, + { + "epoch": 0.3789344118164234, + "grad_norm": 0.20005124807357788, + "learning_rate": 1.4266860703452156e-05, + "loss": 0.0519, + "step": 2155 + }, + { + "epoch": 0.3798136099876912, + "grad_norm": 0.36462101340293884, + "learning_rate": 1.4241088762150817e-05, + "loss": 0.0583, + "step": 2160 + }, + { + "epoch": 0.380692808158959, + "grad_norm": 0.26748377084732056, + "learning_rate": 1.4215282427988242e-05, + "loss": 0.0609, + "step": 2165 + }, + { + "epoch": 0.3815720063302268, + "grad_norm": 0.28044751286506653, + "learning_rate": 1.4189441910239383e-05, + "loss": 0.053, + "step": 2170 + }, + { + "epoch": 0.3824512045014946, + "grad_norm": 0.5757772326469421, + "learning_rate": 1.4163567418456408e-05, + "loss": 0.0651, + "step": 2175 + }, + { + "epoch": 0.3833304026727624, + "grad_norm": 0.6958622336387634, + "learning_rate": 1.4137659162466999e-05, + "loss": 0.0529, + "step": 2180 + }, + { + "epoch": 0.3842096008440302, + "grad_norm": 0.7717348337173462, + "learning_rate": 1.4111717352372635e-05, + "loss": 0.0498, + "step": 2185 + }, + { + "epoch": 0.385088799015298, + "grad_norm": 0.4615864157676697, + "learning_rate": 1.408574219854692e-05, + "loss": 0.0619, + "step": 2190 + }, + { + "epoch": 0.3859679971865658, + "grad_norm": 0.20736804604530334, + "learning_rate": 1.405973391163383e-05, + "loss": 0.0516, + "step": 2195 + }, + { + "epoch": 0.38684719535783363, + "grad_norm": 0.9234808087348938, + "learning_rate": 1.4033692702546056e-05, + "loss": 0.0553, + "step": 2200 + }, + { + "epoch": 0.38772639352910143, + "grad_norm": 0.9873343110084534, + "learning_rate": 1.4007618782463252e-05, + "loss": 0.0683, + "step": 2205 + }, + { + "epoch": 0.3886055917003693, + "grad_norm": 0.8674870133399963, + "learning_rate": 1.3981512362830359e-05, + "loss": 0.0553, + "step": 2210 + }, + { + "epoch": 0.3894847898716371, + "grad_norm": 0.3268803656101227, + "learning_rate": 1.3955373655355852e-05, + "loss": 0.0461, + "step": 2215 + }, + { + "epoch": 0.3903639880429049, + "grad_norm": 0.3193112909793854, + "learning_rate": 1.392920287201005e-05, + "loss": 0.0674, + "step": 2220 + }, + { + "epoch": 0.3912431862141727, + "grad_norm": 0.3444458842277527, + "learning_rate": 1.3903000225023393e-05, + "loss": 0.0471, + "step": 2225 + }, + { + "epoch": 0.3921223843854405, + "grad_norm": 0.6412308812141418, + "learning_rate": 1.3876765926884712e-05, + "loss": 0.0537, + "step": 2230 + }, + { + "epoch": 0.3930015825567083, + "grad_norm": 0.34802794456481934, + "learning_rate": 1.3850500190339515e-05, + "loss": 0.0627, + "step": 2235 + }, + { + "epoch": 0.3938807807279761, + "grad_norm": 0.3646249771118164, + "learning_rate": 1.3824203228388254e-05, + "loss": 0.0513, + "step": 2240 + }, + { + "epoch": 0.3947599788992439, + "grad_norm": 1.2918972969055176, + "learning_rate": 1.3797875254284605e-05, + "loss": 0.0782, + "step": 2245 + }, + { + "epoch": 0.3956391770705117, + "grad_norm": 0.2758769094944, + "learning_rate": 1.3771516481533733e-05, + "loss": 0.0479, + "step": 2250 + }, + { + "epoch": 0.3965183752417795, + "grad_norm": 0.1739499419927597, + "learning_rate": 1.3745127123890565e-05, + "loss": 0.0523, + "step": 2255 + }, + { + "epoch": 0.3973975734130473, + "grad_norm": 0.30399224162101746, + "learning_rate": 1.3718707395358053e-05, + "loss": 0.0604, + "step": 2260 + }, + { + "epoch": 0.3982767715843151, + "grad_norm": 0.3504631519317627, + "learning_rate": 1.3692257510185439e-05, + "loss": 0.0738, + "step": 2265 + }, + { + "epoch": 0.39915596975558293, + "grad_norm": 0.3818477988243103, + "learning_rate": 1.3665777682866521e-05, + "loss": 0.0572, + "step": 2270 + }, + { + "epoch": 0.40003516792685073, + "grad_norm": 0.8020049333572388, + "learning_rate": 1.3639268128137908e-05, + "loss": 0.0616, + "step": 2275 + }, + { + "epoch": 0.40091436609811854, + "grad_norm": 0.5802652835845947, + "learning_rate": 1.3612729060977287e-05, + "loss": 0.0647, + "step": 2280 + }, + { + "epoch": 0.40179356426938634, + "grad_norm": 0.1555125117301941, + "learning_rate": 1.3586160696601667e-05, + "loss": 0.0656, + "step": 2285 + }, + { + "epoch": 0.40267276244065414, + "grad_norm": 0.3963813781738281, + "learning_rate": 1.3559563250465645e-05, + "loss": 0.0555, + "step": 2290 + }, + { + "epoch": 0.40355196061192194, + "grad_norm": 0.31865325570106506, + "learning_rate": 1.3532936938259658e-05, + "loss": 0.0571, + "step": 2295 + }, + { + "epoch": 0.40443115878318975, + "grad_norm": 0.32378071546554565, + "learning_rate": 1.3506281975908224e-05, + "loss": 0.065, + "step": 2300 + }, + { + "epoch": 0.40531035695445755, + "grad_norm": 0.12490954995155334, + "learning_rate": 1.3479598579568205e-05, + "loss": 0.0529, + "step": 2305 + }, + { + "epoch": 0.40618955512572535, + "grad_norm": 0.3125900626182556, + "learning_rate": 1.3452886965627036e-05, + "loss": 0.0408, + "step": 2310 + }, + { + "epoch": 0.40706875329699316, + "grad_norm": 0.8287737369537354, + "learning_rate": 1.3426147350700995e-05, + "loss": 0.062, + "step": 2315 + }, + { + "epoch": 0.40794795146826096, + "grad_norm": 0.39330750703811646, + "learning_rate": 1.339937995163342e-05, + "loss": 0.0493, + "step": 2320 + }, + { + "epoch": 0.40882714963952876, + "grad_norm": 0.2195868045091629, + "learning_rate": 1.3372584985492972e-05, + "loss": 0.0545, + "step": 2325 + }, + { + "epoch": 0.40970634781079657, + "grad_norm": 0.15690335631370544, + "learning_rate": 1.3345762669571855e-05, + "loss": 0.0564, + "step": 2330 + }, + { + "epoch": 0.41058554598206437, + "grad_norm": 0.15210093557834625, + "learning_rate": 1.3318913221384078e-05, + "loss": 0.0501, + "step": 2335 + }, + { + "epoch": 0.4114647441533322, + "grad_norm": 0.08087150007486343, + "learning_rate": 1.3292036858663671e-05, + "loss": 0.0494, + "step": 2340 + }, + { + "epoch": 0.4123439423246, + "grad_norm": 0.17343612015247345, + "learning_rate": 1.3265133799362919e-05, + "loss": 0.0568, + "step": 2345 + }, + { + "epoch": 0.4132231404958678, + "grad_norm": 0.6120114922523499, + "learning_rate": 1.3238204261650613e-05, + "loss": 0.0819, + "step": 2350 + }, + { + "epoch": 0.4141023386671356, + "grad_norm": 0.5394456386566162, + "learning_rate": 1.3211248463910263e-05, + "loss": 0.0574, + "step": 2355 + }, + { + "epoch": 0.4149815368384034, + "grad_norm": 0.14764389395713806, + "learning_rate": 1.3184266624738333e-05, + "loss": 0.0588, + "step": 2360 + }, + { + "epoch": 0.4158607350096712, + "grad_norm": 0.664940595626831, + "learning_rate": 1.3157258962942468e-05, + "loss": 0.0499, + "step": 2365 + }, + { + "epoch": 0.416739933180939, + "grad_norm": 0.43065398931503296, + "learning_rate": 1.3130225697539725e-05, + "loss": 0.056, + "step": 2370 + }, + { + "epoch": 0.4176191313522068, + "grad_norm": 0.23071400821208954, + "learning_rate": 1.3103167047754786e-05, + "loss": 0.0505, + "step": 2375 + }, + { + "epoch": 0.4184983295234746, + "grad_norm": 0.2247258871793747, + "learning_rate": 1.3076083233018188e-05, + "loss": 0.0572, + "step": 2380 + }, + { + "epoch": 0.4193775276947424, + "grad_norm": 0.24306868016719818, + "learning_rate": 1.3048974472964547e-05, + "loss": 0.0751, + "step": 2385 + }, + { + "epoch": 0.4202567258660102, + "grad_norm": 0.6511367559432983, + "learning_rate": 1.3021840987430761e-05, + "loss": 0.0612, + "step": 2390 + }, + { + "epoch": 0.421135924037278, + "grad_norm": 0.5404173731803894, + "learning_rate": 1.2994682996454247e-05, + "loss": 0.0593, + "step": 2395 + }, + { + "epoch": 0.4220151222085458, + "grad_norm": 0.1526852548122406, + "learning_rate": 1.2967500720271142e-05, + "loss": 0.0557, + "step": 2400 + }, + { + "epoch": 0.4228943203798136, + "grad_norm": 0.2758397161960602, + "learning_rate": 1.2940294379314531e-05, + "loss": 0.0599, + "step": 2405 + }, + { + "epoch": 0.4237735185510814, + "grad_norm": 0.6543939709663391, + "learning_rate": 1.2913064194212634e-05, + "loss": 0.0579, + "step": 2410 + }, + { + "epoch": 0.4246527167223492, + "grad_norm": 0.5492807030677795, + "learning_rate": 1.2885810385787056e-05, + "loss": 0.0571, + "step": 2415 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.200227290391922, + "learning_rate": 1.2858533175050955e-05, + "loss": 0.0514, + "step": 2420 + }, + { + "epoch": 0.42641111306488483, + "grad_norm": 0.5350441932678223, + "learning_rate": 1.2831232783207278e-05, + "loss": 0.0492, + "step": 2425 + }, + { + "epoch": 0.42729031123615263, + "grad_norm": 0.2634308636188507, + "learning_rate": 1.2803909431646952e-05, + "loss": 0.0511, + "step": 2430 + }, + { + "epoch": 0.42816950940742043, + "grad_norm": 0.1868433952331543, + "learning_rate": 1.2776563341947104e-05, + "loss": 0.0483, + "step": 2435 + }, + { + "epoch": 0.42904870757868824, + "grad_norm": 0.29465433955192566, + "learning_rate": 1.2749194735869246e-05, + "loss": 0.0543, + "step": 2440 + }, + { + "epoch": 0.42992790574995604, + "grad_norm": 0.13924354314804077, + "learning_rate": 1.2721803835357486e-05, + "loss": 0.0564, + "step": 2445 + }, + { + "epoch": 0.43080710392122384, + "grad_norm": 0.5867925882339478, + "learning_rate": 1.2694390862536736e-05, + "loss": 0.0651, + "step": 2450 + }, + { + "epoch": 0.43168630209249165, + "grad_norm": 0.0857938826084137, + "learning_rate": 1.2666956039710889e-05, + "loss": 0.049, + "step": 2455 + }, + { + "epoch": 0.43256550026375945, + "grad_norm": 0.5406795740127563, + "learning_rate": 1.2639499589361041e-05, + "loss": 0.0662, + "step": 2460 + }, + { + "epoch": 0.43344469843502725, + "grad_norm": 0.6612178087234497, + "learning_rate": 1.2612021734143667e-05, + "loss": 0.0634, + "step": 2465 + }, + { + "epoch": 0.43432389660629506, + "grad_norm": 0.9012327194213867, + "learning_rate": 1.2584522696888825e-05, + "loss": 0.0652, + "step": 2470 + }, + { + "epoch": 0.43520309477756286, + "grad_norm": 0.2964789569377899, + "learning_rate": 1.2557002700598353e-05, + "loss": 0.0511, + "step": 2475 + }, + { + "epoch": 0.43608229294883066, + "grad_norm": 0.22119061648845673, + "learning_rate": 1.2529461968444047e-05, + "loss": 0.0556, + "step": 2480 + }, + { + "epoch": 0.43696149112009847, + "grad_norm": 0.35610833764076233, + "learning_rate": 1.250190072376587e-05, + "loss": 0.0559, + "step": 2485 + }, + { + "epoch": 0.43784068929136627, + "grad_norm": 0.596125066280365, + "learning_rate": 1.2474319190070115e-05, + "loss": 0.0562, + "step": 2490 + }, + { + "epoch": 0.4387198874626341, + "grad_norm": 0.31192147731781006, + "learning_rate": 1.2446717591027624e-05, + "loss": 0.0581, + "step": 2495 + }, + { + "epoch": 0.4395990856339019, + "grad_norm": 0.6106126308441162, + "learning_rate": 1.2419096150471944e-05, + "loss": 0.0599, + "step": 2500 + }, + { + "epoch": 0.4404782838051697, + "grad_norm": 0.33565065264701843, + "learning_rate": 1.2391455092397535e-05, + "loss": 0.0627, + "step": 2505 + }, + { + "epoch": 0.4413574819764375, + "grad_norm": 0.3373861610889435, + "learning_rate": 1.236379464095794e-05, + "loss": 0.0605, + "step": 2510 + }, + { + "epoch": 0.4422366801477053, + "grad_norm": 0.2420874387025833, + "learning_rate": 1.233611502046397e-05, + "loss": 0.0584, + "step": 2515 + }, + { + "epoch": 0.4431158783189731, + "grad_norm": 0.17154277861118317, + "learning_rate": 1.2308416455381891e-05, + "loss": 0.0428, + "step": 2520 + }, + { + "epoch": 0.4439950764902409, + "grad_norm": 0.4789488613605499, + "learning_rate": 1.2280699170331593e-05, + "loss": 0.0512, + "step": 2525 + }, + { + "epoch": 0.4448742746615087, + "grad_norm": 0.2340506613254547, + "learning_rate": 1.2252963390084784e-05, + "loss": 0.0586, + "step": 2530 + }, + { + "epoch": 0.4457534728327765, + "grad_norm": 0.3497225046157837, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.0597, + "step": 2535 + }, + { + "epoch": 0.4466326710040443, + "grad_norm": 0.22254657745361328, + "learning_rate": 1.2197437243836529e-05, + "loss": 0.0784, + "step": 2540 + }, + { + "epoch": 0.4475118691753121, + "grad_norm": 0.3253319561481476, + "learning_rate": 1.2169647328121119e-05, + "loss": 0.0575, + "step": 2545 + }, + { + "epoch": 0.4483910673465799, + "grad_norm": 0.455172061920166, + "learning_rate": 1.2141839817777616e-05, + "loss": 0.06, + "step": 2550 + }, + { + "epoch": 0.4492702655178477, + "grad_norm": 0.30848458409309387, + "learning_rate": 1.2114014938309393e-05, + "loss": 0.0583, + "step": 2555 + }, + { + "epoch": 0.4501494636891155, + "grad_norm": 0.38188642263412476, + "learning_rate": 1.2086172915360684e-05, + "loss": 0.0575, + "step": 2560 + }, + { + "epoch": 0.4510286618603833, + "grad_norm": 0.2141093611717224, + "learning_rate": 1.2058313974714746e-05, + "loss": 0.0678, + "step": 2565 + }, + { + "epoch": 0.4519078600316511, + "grad_norm": 0.37475940585136414, + "learning_rate": 1.2030438342292028e-05, + "loss": 0.0621, + "step": 2570 + }, + { + "epoch": 0.4527870582029189, + "grad_norm": 0.5214441418647766, + "learning_rate": 1.2002546244148345e-05, + "loss": 0.0559, + "step": 2575 + }, + { + "epoch": 0.4536662563741867, + "grad_norm": 0.5643457174301147, + "learning_rate": 1.197463790647303e-05, + "loss": 0.0551, + "step": 2580 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.2806403338909149, + "learning_rate": 1.1946713555587115e-05, + "loss": 0.059, + "step": 2585 + }, + { + "epoch": 0.45542465271672233, + "grad_norm": 0.7372429370880127, + "learning_rate": 1.1918773417941494e-05, + "loss": 0.0649, + "step": 2590 + }, + { + "epoch": 0.45630385088799014, + "grad_norm": 0.9332758188247681, + "learning_rate": 1.1890817720115075e-05, + "loss": 0.0519, + "step": 2595 + }, + { + "epoch": 0.45718304905925794, + "grad_norm": 0.36277708411216736, + "learning_rate": 1.1862846688812956e-05, + "loss": 0.0605, + "step": 2600 + }, + { + "epoch": 0.45806224723052574, + "grad_norm": 0.19159919023513794, + "learning_rate": 1.183486055086458e-05, + "loss": 0.0528, + "step": 2605 + }, + { + "epoch": 0.45894144540179355, + "grad_norm": 0.21766141057014465, + "learning_rate": 1.1806859533221896e-05, + "loss": 0.0587, + "step": 2610 + }, + { + "epoch": 0.45982064357306135, + "grad_norm": 0.8714537024497986, + "learning_rate": 1.1778843862957515e-05, + "loss": 0.0695, + "step": 2615 + }, + { + "epoch": 0.46069984174432915, + "grad_norm": 0.25225165486335754, + "learning_rate": 1.1750813767262879e-05, + "loss": 0.0551, + "step": 2620 + }, + { + "epoch": 0.46157903991559696, + "grad_norm": 0.2513810992240906, + "learning_rate": 1.1722769473446412e-05, + "loss": 0.0604, + "step": 2625 + }, + { + "epoch": 0.46245823808686476, + "grad_norm": 0.1506785750389099, + "learning_rate": 1.1694711208931668e-05, + "loss": 0.0562, + "step": 2630 + }, + { + "epoch": 0.46333743625813256, + "grad_norm": 0.20237652957439423, + "learning_rate": 1.1666639201255507e-05, + "loss": 0.0526, + "step": 2635 + }, + { + "epoch": 0.46421663442940037, + "grad_norm": 0.5076216459274292, + "learning_rate": 1.163855367806623e-05, + "loss": 0.0594, + "step": 2640 + }, + { + "epoch": 0.46509583260066817, + "grad_norm": 0.3569527268409729, + "learning_rate": 1.1610454867121747e-05, + "loss": 0.0594, + "step": 2645 + }, + { + "epoch": 0.46597503077193597, + "grad_norm": 0.19620120525360107, + "learning_rate": 1.158234299628772e-05, + "loss": 0.0658, + "step": 2650 + }, + { + "epoch": 0.4668542289432038, + "grad_norm": 0.46498578786849976, + "learning_rate": 1.1554218293535727e-05, + "loss": 0.0606, + "step": 2655 + }, + { + "epoch": 0.4677334271144716, + "grad_norm": 0.2302287071943283, + "learning_rate": 1.1526080986941389e-05, + "loss": 0.0589, + "step": 2660 + }, + { + "epoch": 0.4686126252857394, + "grad_norm": 0.39261671900749207, + "learning_rate": 1.1497931304682554e-05, + "loss": 0.0486, + "step": 2665 + }, + { + "epoch": 0.4694918234570072, + "grad_norm": 1.2169684171676636, + "learning_rate": 1.1469769475037427e-05, + "loss": 0.0666, + "step": 2670 + }, + { + "epoch": 0.470371021628275, + "grad_norm": 0.7795551419258118, + "learning_rate": 1.144159572638271e-05, + "loss": 0.0614, + "step": 2675 + }, + { + "epoch": 0.4712502197995428, + "grad_norm": 0.5861936211585999, + "learning_rate": 1.141341028719178e-05, + "loss": 0.0522, + "step": 2680 + }, + { + "epoch": 0.4721294179708106, + "grad_norm": 0.3658725917339325, + "learning_rate": 1.1385213386032797e-05, + "loss": 0.0506, + "step": 2685 + }, + { + "epoch": 0.4730086161420784, + "grad_norm": 0.5009949803352356, + "learning_rate": 1.1357005251566888e-05, + "loss": 0.0716, + "step": 2690 + }, + { + "epoch": 0.4738878143133462, + "grad_norm": 0.34323883056640625, + "learning_rate": 1.1328786112546268e-05, + "loss": 0.0772, + "step": 2695 + }, + { + "epoch": 0.47476701248461406, + "grad_norm": 0.5411744713783264, + "learning_rate": 1.1300556197812393e-05, + "loss": 0.0604, + "step": 2700 + }, + { + "epoch": 0.47564621065588186, + "grad_norm": 0.5839160680770874, + "learning_rate": 1.1272315736294108e-05, + "loss": 0.0575, + "step": 2705 + }, + { + "epoch": 0.47652540882714967, + "grad_norm": 0.2879360020160675, + "learning_rate": 1.1244064957005782e-05, + "loss": 0.0572, + "step": 2710 + }, + { + "epoch": 0.47740460699841747, + "grad_norm": 0.309555321931839, + "learning_rate": 1.121580408904546e-05, + "loss": 0.0679, + "step": 2715 + }, + { + "epoch": 0.47828380516968527, + "grad_norm": 0.31096187233924866, + "learning_rate": 1.1187533361592988e-05, + "loss": 0.0672, + "step": 2720 + }, + { + "epoch": 0.4791630033409531, + "grad_norm": 0.24858668446540833, + "learning_rate": 1.1159253003908188e-05, + "loss": 0.0604, + "step": 2725 + }, + { + "epoch": 0.4800422015122209, + "grad_norm": 0.38457539677619934, + "learning_rate": 1.113096324532896e-05, + "loss": 0.0656, + "step": 2730 + }, + { + "epoch": 0.4809213996834887, + "grad_norm": 0.20690588653087616, + "learning_rate": 1.1102664315269452e-05, + "loss": 0.0612, + "step": 2735 + }, + { + "epoch": 0.4818005978547565, + "grad_norm": 0.8975262641906738, + "learning_rate": 1.1074356443218175e-05, + "loss": 0.0552, + "step": 2740 + }, + { + "epoch": 0.4826797960260243, + "grad_norm": 0.31369662284851074, + "learning_rate": 1.1046039858736167e-05, + "loss": 0.0685, + "step": 2745 + }, + { + "epoch": 0.4835589941972921, + "grad_norm": 0.1889607012271881, + "learning_rate": 1.101771479145511e-05, + "loss": 0.0642, + "step": 2750 + }, + { + "epoch": 0.4844381923685599, + "grad_norm": 0.4452875256538391, + "learning_rate": 1.0989381471075481e-05, + "loss": 0.0643, + "step": 2755 + }, + { + "epoch": 0.4853173905398277, + "grad_norm": 0.9008978009223938, + "learning_rate": 1.0961040127364688e-05, + "loss": 0.0634, + "step": 2760 + }, + { + "epoch": 0.4861965887110955, + "grad_norm": 1.0086400508880615, + "learning_rate": 1.0932690990155195e-05, + "loss": 0.053, + "step": 2765 + }, + { + "epoch": 0.4870757868823633, + "grad_norm": 0.3842066526412964, + "learning_rate": 1.0904334289342675e-05, + "loss": 0.0548, + "step": 2770 + }, + { + "epoch": 0.4879549850536311, + "grad_norm": 0.23029272258281708, + "learning_rate": 1.087597025488413e-05, + "loss": 0.0417, + "step": 2775 + }, + { + "epoch": 0.4888341832248989, + "grad_norm": 0.9451369643211365, + "learning_rate": 1.0847599116796047e-05, + "loss": 0.0535, + "step": 2780 + }, + { + "epoch": 0.4897133813961667, + "grad_norm": 1.2543659210205078, + "learning_rate": 1.0819221105152504e-05, + "loss": 0.0644, + "step": 2785 + }, + { + "epoch": 0.4905925795674345, + "grad_norm": 0.2994709014892578, + "learning_rate": 1.0790836450083327e-05, + "loss": 0.053, + "step": 2790 + }, + { + "epoch": 0.4914717777387023, + "grad_norm": 0.43622636795043945, + "learning_rate": 1.0762445381772217e-05, + "loss": 0.0609, + "step": 2795 + }, + { + "epoch": 0.4923509759099701, + "grad_norm": 0.27832546830177307, + "learning_rate": 1.0734048130454882e-05, + "loss": 0.0642, + "step": 2800 + }, + { + "epoch": 0.4932301740812379, + "grad_norm": 1.0674911737442017, + "learning_rate": 1.0705644926417172e-05, + "loss": 0.0445, + "step": 2805 + }, + { + "epoch": 0.49410937225250573, + "grad_norm": 0.5751109719276428, + "learning_rate": 1.0677235999993205e-05, + "loss": 0.0482, + "step": 2810 + }, + { + "epoch": 0.49498857042377353, + "grad_norm": 0.15411067008972168, + "learning_rate": 1.0648821581563514e-05, + "loss": 0.0561, + "step": 2815 + }, + { + "epoch": 0.49586776859504134, + "grad_norm": 0.29986828565597534, + "learning_rate": 1.0620401901553155e-05, + "loss": 0.0655, + "step": 2820 + }, + { + "epoch": 0.49674696676630914, + "grad_norm": 0.6311131119728088, + "learning_rate": 1.0591977190429868e-05, + "loss": 0.0519, + "step": 2825 + }, + { + "epoch": 0.49762616493757694, + "grad_norm": 0.30874118208885193, + "learning_rate": 1.056354767870218e-05, + "loss": 0.0581, + "step": 2830 + }, + { + "epoch": 0.49850536310884475, + "grad_norm": 0.38604286313056946, + "learning_rate": 1.0535113596917556e-05, + "loss": 0.0627, + "step": 2835 + }, + { + "epoch": 0.49938456128011255, + "grad_norm": 0.16394232213497162, + "learning_rate": 1.0506675175660519e-05, + "loss": 0.0591, + "step": 2840 + }, + { + "epoch": 0.5002637594513804, + "grad_norm": 0.5202212929725647, + "learning_rate": 1.0478232645550784e-05, + "loss": 0.0585, + "step": 2845 + }, + { + "epoch": 0.5011429576226482, + "grad_norm": 0.18142499029636383, + "learning_rate": 1.0449786237241382e-05, + "loss": 0.0603, + "step": 2850 + }, + { + "epoch": 0.502022155793916, + "grad_norm": 0.38024476170539856, + "learning_rate": 1.0421336181416796e-05, + "loss": 0.0712, + "step": 2855 + }, + { + "epoch": 0.5029013539651838, + "grad_norm": 0.28926122188568115, + "learning_rate": 1.03928827087911e-05, + "loss": 0.0669, + "step": 2860 + }, + { + "epoch": 0.5037805521364516, + "grad_norm": 0.4077168405056, + "learning_rate": 1.036442605010605e-05, + "loss": 0.0616, + "step": 2865 + }, + { + "epoch": 0.5046597503077194, + "grad_norm": 0.4079400300979614, + "learning_rate": 1.0335966436129268e-05, + "loss": 0.058, + "step": 2870 + }, + { + "epoch": 0.5055389484789872, + "grad_norm": 0.5996482968330383, + "learning_rate": 1.0307504097652323e-05, + "loss": 0.0512, + "step": 2875 + }, + { + "epoch": 0.506418146650255, + "grad_norm": 0.09414978325366974, + "learning_rate": 1.0279039265488885e-05, + "loss": 0.0519, + "step": 2880 + }, + { + "epoch": 0.5072973448215228, + "grad_norm": 0.3783423602581024, + "learning_rate": 1.0250572170472848e-05, + "loss": 0.0599, + "step": 2885 + }, + { + "epoch": 0.5081765429927906, + "grad_norm": 0.7148971557617188, + "learning_rate": 1.0222103043456447e-05, + "loss": 0.0681, + "step": 2890 + }, + { + "epoch": 0.5090557411640584, + "grad_norm": 0.29532909393310547, + "learning_rate": 1.0193632115308412e-05, + "loss": 0.0628, + "step": 2895 + }, + { + "epoch": 0.5099349393353262, + "grad_norm": 0.231553316116333, + "learning_rate": 1.016515961691206e-05, + "loss": 0.0489, + "step": 2900 + }, + { + "epoch": 0.510814137506594, + "grad_norm": 0.6670016646385193, + "learning_rate": 1.0136685779163458e-05, + "loss": 0.0596, + "step": 2905 + }, + { + "epoch": 0.5116933356778618, + "grad_norm": 0.8102641105651855, + "learning_rate": 1.010821083296952e-05, + "loss": 0.0563, + "step": 2910 + }, + { + "epoch": 0.5125725338491296, + "grad_norm": 0.21613669395446777, + "learning_rate": 1.0079735009246168e-05, + "loss": 0.0571, + "step": 2915 + }, + { + "epoch": 0.5134517320203974, + "grad_norm": 0.34142959117889404, + "learning_rate": 1.0051258538916422e-05, + "loss": 0.0611, + "step": 2920 + }, + { + "epoch": 0.5143309301916652, + "grad_norm": 0.5886263847351074, + "learning_rate": 1.0022781652908549e-05, + "loss": 0.0596, + "step": 2925 + }, + { + "epoch": 0.515210128362933, + "grad_norm": 0.37875401973724365, + "learning_rate": 9.994304582154197e-06, + "loss": 0.045, + "step": 2930 + }, + { + "epoch": 0.5160893265342008, + "grad_norm": 0.6814181804656982, + "learning_rate": 9.9658275575865e-06, + "loss": 0.0399, + "step": 2935 + }, + { + "epoch": 0.5169685247054686, + "grad_norm": 0.6115921139717102, + "learning_rate": 9.93735081013823e-06, + "loss": 0.0586, + "step": 2940 + }, + { + "epoch": 0.5178477228767364, + "grad_norm": 0.7454824447631836, + "learning_rate": 9.908874570739899e-06, + "loss": 0.0623, + "step": 2945 + }, + { + "epoch": 0.5187269210480042, + "grad_norm": 0.16522662341594696, + "learning_rate": 9.880399070317907e-06, + "loss": 0.0578, + "step": 2950 + }, + { + "epoch": 0.519606119219272, + "grad_norm": 0.5657525658607483, + "learning_rate": 9.851924539792656e-06, + "loss": 0.0468, + "step": 2955 + }, + { + "epoch": 0.5204853173905398, + "grad_norm": 0.1268445998430252, + "learning_rate": 9.823451210076691e-06, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.5213645155618076, + "grad_norm": 0.3536778390407562, + "learning_rate": 9.794979312072807e-06, + "loss": 0.0557, + "step": 2965 + }, + { + "epoch": 0.5222437137330754, + "grad_norm": 0.9080764651298523, + "learning_rate": 9.766509076672204e-06, + "loss": 0.0611, + "step": 2970 + }, + { + "epoch": 0.5231229119043432, + "grad_norm": 0.3755652904510498, + "learning_rate": 9.738040734752582e-06, + "loss": 0.0683, + "step": 2975 + }, + { + "epoch": 0.524002110075611, + "grad_norm": 0.6789637207984924, + "learning_rate": 9.709574517176301e-06, + "loss": 0.0475, + "step": 2980 + }, + { + "epoch": 0.5248813082468788, + "grad_norm": 0.7782059907913208, + "learning_rate": 9.681110654788483e-06, + "loss": 0.0521, + "step": 2985 + }, + { + "epoch": 0.5257605064181466, + "grad_norm": 0.20607317984104156, + "learning_rate": 9.65264937841516e-06, + "loss": 0.0517, + "step": 2990 + }, + { + "epoch": 0.5266397045894144, + "grad_norm": 0.18635804951190948, + "learning_rate": 9.62419091886138e-06, + "loss": 0.0545, + "step": 2995 + }, + { + "epoch": 0.5275189027606823, + "grad_norm": 0.26298439502716064, + "learning_rate": 9.595735506909365e-06, + "loss": 0.0529, + "step": 3000 + }, + { + "epoch": 0.52839810093195, + "grad_norm": 0.4759673774242401, + "learning_rate": 9.567283373316608e-06, + "loss": 0.0544, + "step": 3005 + }, + { + "epoch": 0.5292772991032179, + "grad_norm": 0.2273968905210495, + "learning_rate": 9.538834748814028e-06, + "loss": 0.0643, + "step": 3010 + }, + { + "epoch": 0.5301564972744857, + "grad_norm": 0.27050191164016724, + "learning_rate": 9.510389864104069e-06, + "loss": 0.057, + "step": 3015 + }, + { + "epoch": 0.5310356954457535, + "grad_norm": 0.23292891681194305, + "learning_rate": 9.481948949858876e-06, + "loss": 0.0656, + "step": 3020 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 0.7964295744895935, + "learning_rate": 9.453512236718365e-06, + "loss": 0.0506, + "step": 3025 + }, + { + "epoch": 0.5327940917882891, + "grad_norm": 0.46331024169921875, + "learning_rate": 9.42507995528841e-06, + "loss": 0.0612, + "step": 3030 + }, + { + "epoch": 0.5336732899595569, + "grad_norm": 0.5655067563056946, + "learning_rate": 9.396652336138923e-06, + "loss": 0.0585, + "step": 3035 + }, + { + "epoch": 0.5345524881308247, + "grad_norm": 0.18470239639282227, + "learning_rate": 9.368229609802028e-06, + "loss": 0.048, + "step": 3040 + }, + { + "epoch": 0.5354316863020925, + "grad_norm": 0.7371414303779602, + "learning_rate": 9.339812006770154e-06, + "loss": 0.0526, + "step": 3045 + }, + { + "epoch": 0.5363108844733603, + "grad_norm": 0.8986045122146606, + "learning_rate": 9.311399757494196e-06, + "loss": 0.0584, + "step": 3050 + }, + { + "epoch": 0.5371900826446281, + "grad_norm": 0.4300435185432434, + "learning_rate": 9.282993092381626e-06, + "loss": 0.0519, + "step": 3055 + }, + { + "epoch": 0.5380692808158959, + "grad_norm": 0.6866196393966675, + "learning_rate": 9.254592241794633e-06, + "loss": 0.0567, + "step": 3060 + }, + { + "epoch": 0.5389484789871637, + "grad_norm": 0.17242960631847382, + "learning_rate": 9.226197436048252e-06, + "loss": 0.0611, + "step": 3065 + }, + { + "epoch": 0.5398276771584315, + "grad_norm": 0.6466073989868164, + "learning_rate": 9.197808905408504e-06, + "loss": 0.0623, + "step": 3070 + }, + { + "epoch": 0.5407068753296993, + "grad_norm": 0.20931695401668549, + "learning_rate": 9.169426880090509e-06, + "loss": 0.0606, + "step": 3075 + }, + { + "epoch": 0.5415860735009671, + "grad_norm": 0.21108533442020416, + "learning_rate": 9.141051590256651e-06, + "loss": 0.0456, + "step": 3080 + }, + { + "epoch": 0.5424652716722349, + "grad_norm": 0.1871514916419983, + "learning_rate": 9.112683266014677e-06, + "loss": 0.0468, + "step": 3085 + }, + { + "epoch": 0.5433444698435027, + "grad_norm": 0.25235074758529663, + "learning_rate": 9.084322137415855e-06, + "loss": 0.0524, + "step": 3090 + }, + { + "epoch": 0.5442236680147705, + "grad_norm": 0.6398855447769165, + "learning_rate": 9.055968434453096e-06, + "loss": 0.0523, + "step": 3095 + }, + { + "epoch": 0.5451028661860383, + "grad_norm": 0.6872847676277161, + "learning_rate": 9.027622387059103e-06, + "loss": 0.0456, + "step": 3100 + }, + { + "epoch": 0.5459820643573061, + "grad_norm": 0.7495630383491516, + "learning_rate": 8.999284225104476e-06, + "loss": 0.0673, + "step": 3105 + }, + { + "epoch": 0.5468612625285739, + "grad_norm": 0.18154507875442505, + "learning_rate": 8.970954178395894e-06, + "loss": 0.0511, + "step": 3110 + }, + { + "epoch": 0.5477404606998417, + "grad_norm": 0.16267339885234833, + "learning_rate": 8.94263247667421e-06, + "loss": 0.0475, + "step": 3115 + }, + { + "epoch": 0.5486196588711095, + "grad_norm": 0.0940733551979065, + "learning_rate": 8.914319349612607e-06, + "loss": 0.0659, + "step": 3120 + }, + { + "epoch": 0.5494988570423773, + "grad_norm": 0.5106806755065918, + "learning_rate": 8.886015026814736e-06, + "loss": 0.0532, + "step": 3125 + }, + { + "epoch": 0.5503780552136451, + "grad_norm": 0.2427133470773697, + "learning_rate": 8.857719737812836e-06, + "loss": 0.0523, + "step": 3130 + }, + { + "epoch": 0.5512572533849129, + "grad_norm": 0.36954110860824585, + "learning_rate": 8.829433712065915e-06, + "loss": 0.061, + "step": 3135 + }, + { + "epoch": 0.5521364515561807, + "grad_norm": 0.892874538898468, + "learning_rate": 8.801157178957827e-06, + "loss": 0.0491, + "step": 3140 + }, + { + "epoch": 0.5530156497274485, + "grad_norm": 0.3820838928222656, + "learning_rate": 8.772890367795476e-06, + "loss": 0.048, + "step": 3145 + }, + { + "epoch": 0.5538948478987163, + "grad_norm": 1.087281346321106, + "learning_rate": 8.744633507806907e-06, + "loss": 0.0608, + "step": 3150 + }, + { + "epoch": 0.5547740460699842, + "grad_norm": 0.2828584909439087, + "learning_rate": 8.716386828139478e-06, + "loss": 0.0616, + "step": 3155 + }, + { + "epoch": 0.555653244241252, + "grad_norm": 0.6577861309051514, + "learning_rate": 8.688150557857979e-06, + "loss": 0.0586, + "step": 3160 + }, + { + "epoch": 0.5565324424125198, + "grad_norm": 0.45899686217308044, + "learning_rate": 8.659924925942798e-06, + "loss": 0.058, + "step": 3165 + }, + { + "epoch": 0.5574116405837876, + "grad_norm": 0.12640362977981567, + "learning_rate": 8.631710161288043e-06, + "loss": 0.0628, + "step": 3170 + }, + { + "epoch": 0.5582908387550554, + "grad_norm": 0.478767454624176, + "learning_rate": 8.603506492699698e-06, + "loss": 0.0636, + "step": 3175 + }, + { + "epoch": 0.5591700369263232, + "grad_norm": 0.22441603243350983, + "learning_rate": 8.575314148893765e-06, + "loss": 0.0461, + "step": 3180 + }, + { + "epoch": 0.560049235097591, + "grad_norm": 0.9642320275306702, + "learning_rate": 8.547133358494408e-06, + "loss": 0.0541, + "step": 3185 + }, + { + "epoch": 0.5609284332688588, + "grad_norm": 0.4824267029762268, + "learning_rate": 8.518964350032092e-06, + "loss": 0.0516, + "step": 3190 + }, + { + "epoch": 0.5618076314401266, + "grad_norm": 0.26887422800064087, + "learning_rate": 8.490807351941753e-06, + "loss": 0.0497, + "step": 3195 + }, + { + "epoch": 0.5626868296113944, + "grad_norm": 0.4273674488067627, + "learning_rate": 8.462662592560911e-06, + "loss": 0.0573, + "step": 3200 + }, + { + "epoch": 0.5635660277826622, + "grad_norm": 0.385642409324646, + "learning_rate": 8.434530300127853e-06, + "loss": 0.0592, + "step": 3205 + }, + { + "epoch": 0.56444522595393, + "grad_norm": 0.2365254908800125, + "learning_rate": 8.406410702779754e-06, + "loss": 0.0597, + "step": 3210 + }, + { + "epoch": 0.5653244241251978, + "grad_norm": 0.20545728504657745, + "learning_rate": 8.378304028550848e-06, + "loss": 0.0524, + "step": 3215 + }, + { + "epoch": 0.5662036222964656, + "grad_norm": 0.45571327209472656, + "learning_rate": 8.35021050537056e-06, + "loss": 0.0651, + "step": 3220 + }, + { + "epoch": 0.5670828204677334, + "grad_norm": 0.3450813591480255, + "learning_rate": 8.32213036106168e-06, + "loss": 0.0534, + "step": 3225 + }, + { + "epoch": 0.5679620186390012, + "grad_norm": 0.8915445804595947, + "learning_rate": 8.294063823338486e-06, + "loss": 0.0607, + "step": 3230 + }, + { + "epoch": 0.568841216810269, + "grad_norm": 0.3208938539028168, + "learning_rate": 8.266011119804937e-06, + "loss": 0.0487, + "step": 3235 + }, + { + "epoch": 0.5697204149815368, + "grad_norm": 0.3896248936653137, + "learning_rate": 8.237972477952779e-06, + "loss": 0.0534, + "step": 3240 + }, + { + "epoch": 0.5705996131528046, + "grad_norm": 0.28958839178085327, + "learning_rate": 8.209948125159745e-06, + "loss": 0.0564, + "step": 3245 + }, + { + "epoch": 0.5714788113240724, + "grad_norm": 0.8576890230178833, + "learning_rate": 8.181938288687683e-06, + "loss": 0.0602, + "step": 3250 + }, + { + "epoch": 0.5723580094953402, + "grad_norm": 1.088234305381775, + "learning_rate": 8.153943195680724e-06, + "loss": 0.0578, + "step": 3255 + }, + { + "epoch": 0.573237207666608, + "grad_norm": 0.2483123540878296, + "learning_rate": 8.125963073163435e-06, + "loss": 0.0588, + "step": 3260 + }, + { + "epoch": 0.5741164058378758, + "grad_norm": 0.21865952014923096, + "learning_rate": 8.097998148038986e-06, + "loss": 0.0587, + "step": 3265 + }, + { + "epoch": 0.5749956040091436, + "grad_norm": 0.44943463802337646, + "learning_rate": 8.070048647087298e-06, + "loss": 0.0542, + "step": 3270 + }, + { + "epoch": 0.5758748021804114, + "grad_norm": 0.1797959953546524, + "learning_rate": 8.042114796963219e-06, + "loss": 0.0541, + "step": 3275 + }, + { + "epoch": 0.5767540003516792, + "grad_norm": 0.7815648317337036, + "learning_rate": 8.014196824194668e-06, + "loss": 0.0466, + "step": 3280 + }, + { + "epoch": 0.577633198522947, + "grad_norm": 0.5617738366127014, + "learning_rate": 7.986294955180815e-06, + "loss": 0.0521, + "step": 3285 + }, + { + "epoch": 0.5785123966942148, + "grad_norm": 0.35597896575927734, + "learning_rate": 7.958409416190233e-06, + "loss": 0.0611, + "step": 3290 + }, + { + "epoch": 0.5793915948654826, + "grad_norm": 0.4443993866443634, + "learning_rate": 7.93054043335907e-06, + "loss": 0.0641, + "step": 3295 + }, + { + "epoch": 0.5802707930367504, + "grad_norm": 0.5022766590118408, + "learning_rate": 7.902688232689212e-06, + "loss": 0.0489, + "step": 3300 + }, + { + "epoch": 0.5811499912080182, + "grad_norm": 0.6365528702735901, + "learning_rate": 7.874853040046455e-06, + "loss": 0.0686, + "step": 3305 + }, + { + "epoch": 0.582029189379286, + "grad_norm": 0.3193661570549011, + "learning_rate": 7.847035081158654e-06, + "loss": 0.0552, + "step": 3310 + }, + { + "epoch": 0.5829083875505539, + "grad_norm": 0.3971255123615265, + "learning_rate": 7.819234581613934e-06, + "loss": 0.068, + "step": 3315 + }, + { + "epoch": 0.5837875857218217, + "grad_norm": 0.4976908564567566, + "learning_rate": 7.791451766858808e-06, + "loss": 0.0508, + "step": 3320 + }, + { + "epoch": 0.5846667838930895, + "grad_norm": 0.25572288036346436, + "learning_rate": 7.763686862196397e-06, + "loss": 0.0594, + "step": 3325 + }, + { + "epoch": 0.5855459820643573, + "grad_norm": 0.7076115012168884, + "learning_rate": 7.735940092784564e-06, + "loss": 0.0649, + "step": 3330 + }, + { + "epoch": 0.5864251802356251, + "grad_norm": 0.33293214440345764, + "learning_rate": 7.708211683634112e-06, + "loss": 0.0528, + "step": 3335 + }, + { + "epoch": 0.5873043784068929, + "grad_norm": 0.33272784948349, + "learning_rate": 7.680501859606961e-06, + "loss": 0.0505, + "step": 3340 + }, + { + "epoch": 0.5881835765781607, + "grad_norm": 0.4092160165309906, + "learning_rate": 7.652810845414297e-06, + "loss": 0.0437, + "step": 3345 + }, + { + "epoch": 0.5890627747494285, + "grad_norm": 0.24491731822490692, + "learning_rate": 7.625138865614795e-06, + "loss": 0.0635, + "step": 3350 + }, + { + "epoch": 0.5899419729206963, + "grad_norm": 0.6766128540039062, + "learning_rate": 7.597486144612741e-06, + "loss": 0.0473, + "step": 3355 + }, + { + "epoch": 0.5908211710919641, + "grad_norm": 0.27820366621017456, + "learning_rate": 7.569852906656269e-06, + "loss": 0.0521, + "step": 3360 + }, + { + "epoch": 0.5917003692632319, + "grad_norm": 0.8893203735351562, + "learning_rate": 7.542239375835499e-06, + "loss": 0.0644, + "step": 3365 + }, + { + "epoch": 0.5925795674344997, + "grad_norm": 0.4650630056858063, + "learning_rate": 7.514645776080747e-06, + "loss": 0.0694, + "step": 3370 + }, + { + "epoch": 0.5934587656057675, + "grad_norm": 0.42498430609703064, + "learning_rate": 7.487072331160696e-06, + "loss": 0.0588, + "step": 3375 + }, + { + "epoch": 0.5943379637770353, + "grad_norm": 0.5707778334617615, + "learning_rate": 7.459519264680586e-06, + "loss": 0.0655, + "step": 3380 + }, + { + "epoch": 0.5952171619483031, + "grad_norm": 0.4820030629634857, + "learning_rate": 7.431986800080394e-06, + "loss": 0.0765, + "step": 3385 + }, + { + "epoch": 0.5960963601195709, + "grad_norm": 1.1453475952148438, + "learning_rate": 7.4044751606330365e-06, + "loss": 0.062, + "step": 3390 + }, + { + "epoch": 0.5969755582908387, + "grad_norm": 0.18737494945526123, + "learning_rate": 7.37698456944254e-06, + "loss": 0.0495, + "step": 3395 + }, + { + "epoch": 0.5978547564621065, + "grad_norm": 0.49220606684684753, + "learning_rate": 7.349515249442248e-06, + "loss": 0.0575, + "step": 3400 + }, + { + "epoch": 0.5987339546333743, + "grad_norm": 0.1627589464187622, + "learning_rate": 7.322067423393002e-06, + "loss": 0.0556, + "step": 3405 + }, + { + "epoch": 0.5996131528046421, + "grad_norm": 0.5828942060470581, + "learning_rate": 7.294641313881348e-06, + "loss": 0.0597, + "step": 3410 + }, + { + "epoch": 0.6004923509759099, + "grad_norm": 0.5319778323173523, + "learning_rate": 7.267237143317707e-06, + "loss": 0.0579, + "step": 3415 + }, + { + "epoch": 0.6013715491471777, + "grad_norm": 0.14266423881053925, + "learning_rate": 7.239855133934608e-06, + "loss": 0.0591, + "step": 3420 + }, + { + "epoch": 0.6022507473184455, + "grad_norm": 0.13865630328655243, + "learning_rate": 7.212495507784843e-06, + "loss": 0.0589, + "step": 3425 + }, + { + "epoch": 0.6031299454897133, + "grad_norm": 0.7919948697090149, + "learning_rate": 7.185158486739712e-06, + "loss": 0.052, + "step": 3430 + }, + { + "epoch": 0.6040091436609812, + "grad_norm": 0.4793383777141571, + "learning_rate": 7.157844292487174e-06, + "loss": 0.0637, + "step": 3435 + }, + { + "epoch": 0.604888341832249, + "grad_norm": 0.2558303773403168, + "learning_rate": 7.130553146530105e-06, + "loss": 0.0724, + "step": 3440 + }, + { + "epoch": 0.6057675400035168, + "grad_norm": 0.33608704805374146, + "learning_rate": 7.103285270184446e-06, + "loss": 0.0502, + "step": 3445 + }, + { + "epoch": 0.6066467381747846, + "grad_norm": 0.31324923038482666, + "learning_rate": 7.076040884577449e-06, + "loss": 0.0559, + "step": 3450 + }, + { + "epoch": 0.6075259363460525, + "grad_norm": 0.13528025150299072, + "learning_rate": 7.048820210645862e-06, + "loss": 0.0579, + "step": 3455 + }, + { + "epoch": 0.6084051345173203, + "grad_norm": 0.12272872775793076, + "learning_rate": 7.021623469134156e-06, + "loss": 0.0573, + "step": 3460 + }, + { + "epoch": 0.6092843326885881, + "grad_norm": 0.2325238287448883, + "learning_rate": 6.994450880592706e-06, + "loss": 0.0698, + "step": 3465 + }, + { + "epoch": 0.6101635308598559, + "grad_norm": 0.27444854378700256, + "learning_rate": 6.967302665376037e-06, + "loss": 0.0605, + "step": 3470 + }, + { + "epoch": 0.6110427290311237, + "grad_norm": 0.7133885622024536, + "learning_rate": 6.940179043641005e-06, + "loss": 0.055, + "step": 3475 + }, + { + "epoch": 0.6119219272023915, + "grad_norm": 0.22960059344768524, + "learning_rate": 6.913080235345042e-06, + "loss": 0.0635, + "step": 3480 + }, + { + "epoch": 0.6128011253736593, + "grad_norm": 0.4592248201370239, + "learning_rate": 6.886006460244342e-06, + "loss": 0.0575, + "step": 3485 + }, + { + "epoch": 0.6136803235449271, + "grad_norm": 0.21212299168109894, + "learning_rate": 6.858957937892105e-06, + "loss": 0.0607, + "step": 3490 + }, + { + "epoch": 0.6145595217161949, + "grad_norm": 0.23819631338119507, + "learning_rate": 6.831934887636737e-06, + "loss": 0.0512, + "step": 3495 + }, + { + "epoch": 0.6154387198874627, + "grad_norm": 0.3616998493671417, + "learning_rate": 6.804937528620088e-06, + "loss": 0.0613, + "step": 3500 + }, + { + "epoch": 0.6163179180587305, + "grad_norm": 0.21602647006511688, + "learning_rate": 6.777966079775657e-06, + "loss": 0.0648, + "step": 3505 + }, + { + "epoch": 0.6171971162299983, + "grad_norm": 0.3276112973690033, + "learning_rate": 6.751020759826836e-06, + "loss": 0.0496, + "step": 3510 + }, + { + "epoch": 0.6180763144012661, + "grad_norm": 0.5276904106140137, + "learning_rate": 6.724101787285113e-06, + "loss": 0.057, + "step": 3515 + }, + { + "epoch": 0.6189555125725339, + "grad_norm": 0.3810863196849823, + "learning_rate": 6.697209380448333e-06, + "loss": 0.0584, + "step": 3520 + }, + { + "epoch": 0.6198347107438017, + "grad_norm": 0.2644079029560089, + "learning_rate": 6.670343757398882e-06, + "loss": 0.0657, + "step": 3525 + }, + { + "epoch": 0.6207139089150695, + "grad_norm": 0.24872681498527527, + "learning_rate": 6.643505136001972e-06, + "loss": 0.0435, + "step": 3530 + }, + { + "epoch": 0.6215931070863373, + "grad_norm": 0.15694132447242737, + "learning_rate": 6.616693733903823e-06, + "loss": 0.053, + "step": 3535 + }, + { + "epoch": 0.6224723052576051, + "grad_norm": 0.58054518699646, + "learning_rate": 6.5899097685299395e-06, + "loss": 0.0735, + "step": 3540 + }, + { + "epoch": 0.6233515034288729, + "grad_norm": 0.479245662689209, + "learning_rate": 6.563153457083315e-06, + "loss": 0.0588, + "step": 3545 + }, + { + "epoch": 0.6242307016001407, + "grad_norm": 0.28133952617645264, + "learning_rate": 6.5364250165427e-06, + "loss": 0.0573, + "step": 3550 + }, + { + "epoch": 0.6251098997714085, + "grad_norm": 0.5088506937026978, + "learning_rate": 6.509724663660813e-06, + "loss": 0.055, + "step": 3555 + }, + { + "epoch": 0.6259890979426763, + "grad_norm": 0.7142006158828735, + "learning_rate": 6.4830526149626064e-06, + "loss": 0.04, + "step": 3560 + }, + { + "epoch": 0.6268682961139441, + "grad_norm": 0.31554004549980164, + "learning_rate": 6.4564090867435e-06, + "loss": 0.0593, + "step": 3565 + }, + { + "epoch": 0.6277474942852119, + "grad_norm": 0.2784283757209778, + "learning_rate": 6.429794295067625e-06, + "loss": 0.046, + "step": 3570 + }, + { + "epoch": 0.6286266924564797, + "grad_norm": 0.44909903407096863, + "learning_rate": 6.403208455766081e-06, + "loss": 0.0563, + "step": 3575 + }, + { + "epoch": 0.6295058906277475, + "grad_norm": 0.21697324514389038, + "learning_rate": 6.376651784435174e-06, + "loss": 0.0527, + "step": 3580 + }, + { + "epoch": 0.6303850887990153, + "grad_norm": 0.5999769568443298, + "learning_rate": 6.350124496434677e-06, + "loss": 0.066, + "step": 3585 + }, + { + "epoch": 0.6312642869702831, + "grad_norm": 0.4327978193759918, + "learning_rate": 6.323626806886082e-06, + "loss": 0.0493, + "step": 3590 + }, + { + "epoch": 0.6321434851415509, + "grad_norm": 0.47847869992256165, + "learning_rate": 6.297158930670852e-06, + "loss": 0.0593, + "step": 3595 + }, + { + "epoch": 0.6330226833128187, + "grad_norm": 1.0836427211761475, + "learning_rate": 6.270721082428678e-06, + "loss": 0.0557, + "step": 3600 + }, + { + "epoch": 0.6339018814840865, + "grad_norm": 0.2735603153705597, + "learning_rate": 6.2443134765557475e-06, + "loss": 0.0662, + "step": 3605 + }, + { + "epoch": 0.6347810796553544, + "grad_norm": 0.3386712372303009, + "learning_rate": 6.2179363272029935e-06, + "loss": 0.0497, + "step": 3610 + }, + { + "epoch": 0.6356602778266222, + "grad_norm": 0.11221656948328018, + "learning_rate": 6.191589848274369e-06, + "loss": 0.0498, + "step": 3615 + }, + { + "epoch": 0.63653947599789, + "grad_norm": 0.5436195135116577, + "learning_rate": 6.1652742534251e-06, + "loss": 0.054, + "step": 3620 + }, + { + "epoch": 0.6374186741691578, + "grad_norm": 0.5452234148979187, + "learning_rate": 6.138989756059968e-06, + "loss": 0.0448, + "step": 3625 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 0.4096541404724121, + "learning_rate": 6.1127365693315566e-06, + "loss": 0.0556, + "step": 3630 + }, + { + "epoch": 0.6391770705116934, + "grad_norm": 0.4600813388824463, + "learning_rate": 6.086514906138563e-06, + "loss": 0.0562, + "step": 3635 + }, + { + "epoch": 0.6400562686829612, + "grad_norm": 0.8097591996192932, + "learning_rate": 6.060324979124016e-06, + "loss": 0.0534, + "step": 3640 + }, + { + "epoch": 0.640935466854229, + "grad_norm": 0.8001208305358887, + "learning_rate": 6.034167000673611e-06, + "loss": 0.0589, + "step": 3645 + }, + { + "epoch": 0.6418146650254968, + "grad_norm": 0.3946583867073059, + "learning_rate": 6.008041182913933e-06, + "loss": 0.0594, + "step": 3650 + }, + { + "epoch": 0.6426938631967646, + "grad_norm": 0.3865828216075897, + "learning_rate": 5.981947737710779e-06, + "loss": 0.0655, + "step": 3655 + }, + { + "epoch": 0.6435730613680324, + "grad_norm": 1.0482414960861206, + "learning_rate": 5.955886876667414e-06, + "loss": 0.0652, + "step": 3660 + }, + { + "epoch": 0.6444522595393002, + "grad_norm": 1.3280454874038696, + "learning_rate": 5.929858811122868e-06, + "loss": 0.0678, + "step": 3665 + }, + { + "epoch": 0.645331457710568, + "grad_norm": 0.1955081820487976, + "learning_rate": 5.903863752150212e-06, + "loss": 0.0565, + "step": 3670 + }, + { + "epoch": 0.6462106558818358, + "grad_norm": 0.37470242381095886, + "learning_rate": 5.877901910554862e-06, + "loss": 0.0558, + "step": 3675 + }, + { + "epoch": 0.6470898540531036, + "grad_norm": 0.8022100329399109, + "learning_rate": 5.851973496872849e-06, + "loss": 0.0498, + "step": 3680 + }, + { + "epoch": 0.6479690522243714, + "grad_norm": 0.23341333866119385, + "learning_rate": 5.82607872136913e-06, + "loss": 0.0637, + "step": 3685 + }, + { + "epoch": 0.6488482503956392, + "grad_norm": 0.40961551666259766, + "learning_rate": 5.800217794035872e-06, + "loss": 0.0463, + "step": 3690 + }, + { + "epoch": 0.649727448566907, + "grad_norm": 0.21275675296783447, + "learning_rate": 5.774390924590754e-06, + "loss": 0.0552, + "step": 3695 + }, + { + "epoch": 0.6506066467381748, + "grad_norm": 0.17066530883312225, + "learning_rate": 5.748598322475258e-06, + "loss": 0.0585, + "step": 3700 + }, + { + "epoch": 0.6514858449094426, + "grad_norm": 0.3308875262737274, + "learning_rate": 5.7228401968529836e-06, + "loss": 0.058, + "step": 3705 + }, + { + "epoch": 0.6523650430807104, + "grad_norm": 0.7064841389656067, + "learning_rate": 5.697116756607946e-06, + "loss": 0.0608, + "step": 3710 + }, + { + "epoch": 0.6532442412519782, + "grad_norm": 0.13864412903785706, + "learning_rate": 5.671428210342884e-06, + "loss": 0.0409, + "step": 3715 + }, + { + "epoch": 0.654123439423246, + "grad_norm": 0.49417930841445923, + "learning_rate": 5.64577476637755e-06, + "loss": 0.0633, + "step": 3720 + }, + { + "epoch": 0.6550026375945138, + "grad_norm": 0.5518152117729187, + "learning_rate": 5.620156632747053e-06, + "loss": 0.0522, + "step": 3725 + }, + { + "epoch": 0.6558818357657816, + "grad_norm": 0.157115638256073, + "learning_rate": 5.594574017200149e-06, + "loss": 0.0474, + "step": 3730 + }, + { + "epoch": 0.6567610339370494, + "grad_norm": 0.16834110021591187, + "learning_rate": 5.569027127197565e-06, + "loss": 0.0573, + "step": 3735 + }, + { + "epoch": 0.6576402321083172, + "grad_norm": 1.1739530563354492, + "learning_rate": 5.5435161699103055e-06, + "loss": 0.0531, + "step": 3740 + }, + { + "epoch": 0.658519430279585, + "grad_norm": 0.4163840115070343, + "learning_rate": 5.518041352217989e-06, + "loss": 0.0731, + "step": 3745 + }, + { + "epoch": 0.6593986284508528, + "grad_norm": 0.2617214322090149, + "learning_rate": 5.492602880707161e-06, + "loss": 0.0614, + "step": 3750 + }, + { + "epoch": 0.6602778266221206, + "grad_norm": 0.3267338275909424, + "learning_rate": 5.467200961669619e-06, + "loss": 0.0511, + "step": 3755 + }, + { + "epoch": 0.6611570247933884, + "grad_norm": 0.7266274094581604, + "learning_rate": 5.441835801100734e-06, + "loss": 0.0526, + "step": 3760 + }, + { + "epoch": 0.6620362229646563, + "grad_norm": 0.7042490243911743, + "learning_rate": 5.416507604697801e-06, + "loss": 0.0383, + "step": 3765 + }, + { + "epoch": 0.662915421135924, + "grad_norm": 0.5207750797271729, + "learning_rate": 5.391216577858331e-06, + "loss": 0.0561, + "step": 3770 + }, + { + "epoch": 0.6637946193071919, + "grad_norm": 0.7317136526107788, + "learning_rate": 5.365962925678443e-06, + "loss": 0.0609, + "step": 3775 + }, + { + "epoch": 0.6646738174784597, + "grad_norm": 0.47223329544067383, + "learning_rate": 5.340746852951151e-06, + "loss": 0.0661, + "step": 3780 + }, + { + "epoch": 0.6655530156497275, + "grad_norm": 0.5919240713119507, + "learning_rate": 5.315568564164713e-06, + "loss": 0.0591, + "step": 3785 + }, + { + "epoch": 0.6664322138209953, + "grad_norm": 0.16643045842647552, + "learning_rate": 5.290428263500996e-06, + "loss": 0.0512, + "step": 3790 + }, + { + "epoch": 0.6673114119922631, + "grad_norm": 0.6606490612030029, + "learning_rate": 5.26532615483379e-06, + "loss": 0.06, + "step": 3795 + }, + { + "epoch": 0.6681906101635309, + "grad_norm": 0.5036027431488037, + "learning_rate": 5.240262441727187e-06, + "loss": 0.0546, + "step": 3800 + }, + { + "epoch": 0.6690698083347987, + "grad_norm": 0.2331734448671341, + "learning_rate": 5.215237327433895e-06, + "loss": 0.0512, + "step": 3805 + }, + { + "epoch": 0.6699490065060665, + "grad_norm": 0.19755728542804718, + "learning_rate": 5.190251014893621e-06, + "loss": 0.047, + "step": 3810 + }, + { + "epoch": 0.6708282046773343, + "grad_norm": 0.788175642490387, + "learning_rate": 5.165303706731397e-06, + "loss": 0.0681, + "step": 3815 + }, + { + "epoch": 0.6717074028486021, + "grad_norm": 0.19185423851013184, + "learning_rate": 5.140395605255965e-06, + "loss": 0.0535, + "step": 3820 + }, + { + "epoch": 0.6725866010198699, + "grad_norm": 0.22378072142601013, + "learning_rate": 5.115526912458113e-06, + "loss": 0.0584, + "step": 3825 + }, + { + "epoch": 0.6734657991911377, + "grad_norm": 0.31949400901794434, + "learning_rate": 5.090697830009057e-06, + "loss": 0.059, + "step": 3830 + }, + { + "epoch": 0.6743449973624055, + "grad_norm": 0.31779804825782776, + "learning_rate": 5.065908559258782e-06, + "loss": 0.0541, + "step": 3835 + }, + { + "epoch": 0.6752241955336733, + "grad_norm": 0.9284188747406006, + "learning_rate": 5.0411593012344305e-06, + "loss": 0.0461, + "step": 3840 + }, + { + "epoch": 0.6761033937049411, + "grad_norm": 1.0573227405548096, + "learning_rate": 5.0164502566386655e-06, + "loss": 0.0529, + "step": 3845 + }, + { + "epoch": 0.6769825918762089, + "grad_norm": 0.8638303279876709, + "learning_rate": 4.991781625848039e-06, + "loss": 0.0652, + "step": 3850 + }, + { + "epoch": 0.6778617900474767, + "grad_norm": 0.3580770194530487, + "learning_rate": 4.967153608911366e-06, + "loss": 0.0456, + "step": 3855 + }, + { + "epoch": 0.6787409882187445, + "grad_norm": 0.1639026552438736, + "learning_rate": 4.942566405548109e-06, + "loss": 0.0624, + "step": 3860 + }, + { + "epoch": 0.6796201863900123, + "grad_norm": 0.22067619860172272, + "learning_rate": 4.918020215146759e-06, + "loss": 0.0586, + "step": 3865 + }, + { + "epoch": 0.6804993845612801, + "grad_norm": 0.24657614529132843, + "learning_rate": 4.8935152367632136e-06, + "loss": 0.0542, + "step": 3870 + }, + { + "epoch": 0.6813785827325479, + "grad_norm": 0.9170181751251221, + "learning_rate": 4.869051669119153e-06, + "loss": 0.0517, + "step": 3875 + }, + { + "epoch": 0.6822577809038157, + "grad_norm": 1.0674784183502197, + "learning_rate": 4.844629710600457e-06, + "loss": 0.0725, + "step": 3880 + }, + { + "epoch": 0.6831369790750835, + "grad_norm": 0.3955898880958557, + "learning_rate": 4.820249559255559e-06, + "loss": 0.0557, + "step": 3885 + }, + { + "epoch": 0.6840161772463513, + "grad_norm": 0.3524361252784729, + "learning_rate": 4.795911412793883e-06, + "loss": 0.0589, + "step": 3890 + }, + { + "epoch": 0.6848953754176191, + "grad_norm": 0.3493446111679077, + "learning_rate": 4.771615468584194e-06, + "loss": 0.0516, + "step": 3895 + }, + { + "epoch": 0.6857745735888869, + "grad_norm": 0.3028711676597595, + "learning_rate": 4.747361923653039e-06, + "loss": 0.0513, + "step": 3900 + }, + { + "epoch": 0.6866537717601547, + "grad_norm": 0.09052418917417526, + "learning_rate": 4.723150974683112e-06, + "loss": 0.0559, + "step": 3905 + }, + { + "epoch": 0.6875329699314225, + "grad_norm": 0.7868338823318481, + "learning_rate": 4.698982818011694e-06, + "loss": 0.0666, + "step": 3910 + }, + { + "epoch": 0.6884121681026903, + "grad_norm": 0.16918179392814636, + "learning_rate": 4.674857649629035e-06, + "loss": 0.0527, + "step": 3915 + }, + { + "epoch": 0.6892913662739582, + "grad_norm": 0.3652224540710449, + "learning_rate": 4.650775665176783e-06, + "loss": 0.0567, + "step": 3920 + }, + { + "epoch": 0.690170564445226, + "grad_norm": 0.5725377798080444, + "learning_rate": 4.626737059946375e-06, + "loss": 0.0632, + "step": 3925 + }, + { + "epoch": 0.6910497626164938, + "grad_norm": 0.4080544114112854, + "learning_rate": 4.602742028877475e-06, + "loss": 0.0485, + "step": 3930 + }, + { + "epoch": 0.6919289607877616, + "grad_norm": 0.13161161541938782, + "learning_rate": 4.578790766556386e-06, + "loss": 0.0661, + "step": 3935 + }, + { + "epoch": 0.6928081589590294, + "grad_norm": 0.28293490409851074, + "learning_rate": 4.554883467214472e-06, + "loss": 0.0572, + "step": 3940 + }, + { + "epoch": 0.6936873571302972, + "grad_norm": 0.4282214343547821, + "learning_rate": 4.53102032472657e-06, + "loss": 0.0519, + "step": 3945 + }, + { + "epoch": 0.694566555301565, + "grad_norm": 0.44754886627197266, + "learning_rate": 4.507201532609444e-06, + "loss": 0.056, + "step": 3950 + }, + { + "epoch": 0.6954457534728328, + "grad_norm": 0.1610032469034195, + "learning_rate": 4.4834272840201945e-06, + "loss": 0.0592, + "step": 3955 + }, + { + "epoch": 0.6963249516441006, + "grad_norm": 0.18849897384643555, + "learning_rate": 4.459697771754704e-06, + "loss": 0.0546, + "step": 3960 + }, + { + "epoch": 0.6972041498153684, + "grad_norm": 0.41627243161201477, + "learning_rate": 4.436013188246056e-06, + "loss": 0.0654, + "step": 3965 + }, + { + "epoch": 0.6980833479866362, + "grad_norm": 0.3271617889404297, + "learning_rate": 4.412373725563001e-06, + "loss": 0.0524, + "step": 3970 + }, + { + "epoch": 0.698962546157904, + "grad_norm": 0.231711283326149, + "learning_rate": 4.388779575408371e-06, + "loss": 0.0543, + "step": 3975 + }, + { + "epoch": 0.6998417443291718, + "grad_norm": 0.38711780309677124, + "learning_rate": 4.36523092911756e-06, + "loss": 0.0592, + "step": 3980 + }, + { + "epoch": 0.7007209425004396, + "grad_norm": 0.3789129853248596, + "learning_rate": 4.341727977656925e-06, + "loss": 0.0556, + "step": 3985 + }, + { + "epoch": 0.7016001406717074, + "grad_norm": 1.4185389280319214, + "learning_rate": 4.318270911622285e-06, + "loss": 0.0618, + "step": 3990 + }, + { + "epoch": 0.7024793388429752, + "grad_norm": 0.749077558517456, + "learning_rate": 4.2948599212373386e-06, + "loss": 0.0558, + "step": 3995 + }, + { + "epoch": 0.703358537014243, + "grad_norm": 0.23492804169654846, + "learning_rate": 4.271495196352141e-06, + "loss": 0.0614, + "step": 4000 + }, + { + "epoch": 0.7042377351855108, + "grad_norm": 0.2221526801586151, + "learning_rate": 4.248176926441574e-06, + "loss": 0.0592, + "step": 4005 + }, + { + "epoch": 0.7051169333567786, + "grad_norm": 0.2977111339569092, + "learning_rate": 4.224905300603772e-06, + "loss": 0.0449, + "step": 4010 + }, + { + "epoch": 0.7059961315280464, + "grad_norm": 0.3051705062389374, + "learning_rate": 4.2016805075586306e-06, + "loss": 0.0507, + "step": 4015 + }, + { + "epoch": 0.7068753296993142, + "grad_norm": 0.21050839126110077, + "learning_rate": 4.178502735646244e-06, + "loss": 0.0666, + "step": 4020 + }, + { + "epoch": 0.707754527870582, + "grad_norm": 0.37007981538772583, + "learning_rate": 4.1553721728254e-06, + "loss": 0.0565, + "step": 4025 + }, + { + "epoch": 0.7086337260418498, + "grad_norm": 0.8668897747993469, + "learning_rate": 4.1322890066720465e-06, + "loss": 0.0503, + "step": 4030 + }, + { + "epoch": 0.7095129242131176, + "grad_norm": 0.13824816048145294, + "learning_rate": 4.109253424377773e-06, + "loss": 0.0656, + "step": 4035 + }, + { + "epoch": 0.7103921223843854, + "grad_norm": 0.5541465282440186, + "learning_rate": 4.086265612748277e-06, + "loss": 0.0518, + "step": 4040 + }, + { + "epoch": 0.7112713205556532, + "grad_norm": 0.5307297706604004, + "learning_rate": 4.063325758201878e-06, + "loss": 0.0554, + "step": 4045 + }, + { + "epoch": 0.712150518726921, + "grad_norm": 0.6194763779640198, + "learning_rate": 4.040434046767984e-06, + "loss": 0.0648, + "step": 4050 + }, + { + "epoch": 0.7130297168981888, + "grad_norm": 0.08174088597297668, + "learning_rate": 4.017590664085593e-06, + "loss": 0.0512, + "step": 4055 + }, + { + "epoch": 0.7139089150694566, + "grad_norm": 0.3599446713924408, + "learning_rate": 3.994795795401774e-06, + "loss": 0.0466, + "step": 4060 + }, + { + "epoch": 0.7147881132407244, + "grad_norm": 0.7686516046524048, + "learning_rate": 3.9720496255701855e-06, + "loss": 0.0576, + "step": 4065 + }, + { + "epoch": 0.7156673114119922, + "grad_norm": 0.4762028157711029, + "learning_rate": 3.949352339049561e-06, + "loss": 0.0507, + "step": 4070 + }, + { + "epoch": 0.71654650958326, + "grad_norm": 0.37046587467193604, + "learning_rate": 3.926704119902219e-06, + "loss": 0.063, + "step": 4075 + }, + { + "epoch": 0.7174257077545279, + "grad_norm": 0.0833682268857956, + "learning_rate": 3.904105151792563e-06, + "loss": 0.0501, + "step": 4080 + }, + { + "epoch": 0.7183049059257957, + "grad_norm": 0.24260376393795013, + "learning_rate": 3.8815556179856106e-06, + "loss": 0.0531, + "step": 4085 + }, + { + "epoch": 0.7191841040970635, + "grad_norm": 0.28895097970962524, + "learning_rate": 3.859055701345477e-06, + "loss": 0.0558, + "step": 4090 + }, + { + "epoch": 0.7200633022683313, + "grad_norm": 0.3396805226802826, + "learning_rate": 3.8366055843339315e-06, + "loss": 0.0718, + "step": 4095 + }, + { + "epoch": 0.7209425004395991, + "grad_norm": 0.32812032103538513, + "learning_rate": 3.8142054490088752e-06, + "loss": 0.06, + "step": 4100 + }, + { + "epoch": 0.7218216986108669, + "grad_norm": 0.2770575284957886, + "learning_rate": 3.791855477022903e-06, + "loss": 0.0596, + "step": 4105 + }, + { + "epoch": 0.7227008967821347, + "grad_norm": 0.38149547576904297, + "learning_rate": 3.769555849621799e-06, + "loss": 0.0639, + "step": 4110 + }, + { + "epoch": 0.7235800949534025, + "grad_norm": 0.19870160520076752, + "learning_rate": 3.747306747643089e-06, + "loss": 0.0503, + "step": 4115 + }, + { + "epoch": 0.7244592931246703, + "grad_norm": 0.31559768319129944, + "learning_rate": 3.7251083515145658e-06, + "loss": 0.0546, + "step": 4120 + }, + { + "epoch": 0.7253384912959381, + "grad_norm": 0.4508054852485657, + "learning_rate": 3.7029608412528263e-06, + "loss": 0.0658, + "step": 4125 + }, + { + "epoch": 0.7262176894672059, + "grad_norm": 0.34780052304267883, + "learning_rate": 3.680864396461803e-06, + "loss": 0.0562, + "step": 4130 + }, + { + "epoch": 0.7270968876384737, + "grad_norm": 0.5677065849304199, + "learning_rate": 3.658819196331327e-06, + "loss": 0.0643, + "step": 4135 + }, + { + "epoch": 0.7279760858097415, + "grad_norm": 0.1682279258966446, + "learning_rate": 3.6368254196356576e-06, + "loss": 0.053, + "step": 4140 + }, + { + "epoch": 0.7288552839810093, + "grad_norm": 0.32326585054397583, + "learning_rate": 3.614883244732045e-06, + "loss": 0.0408, + "step": 4145 + }, + { + "epoch": 0.7297344821522771, + "grad_norm": 0.2262571007013321, + "learning_rate": 3.5929928495592657e-06, + "loss": 0.0552, + "step": 4150 + }, + { + "epoch": 0.7306136803235449, + "grad_norm": 0.26115310192108154, + "learning_rate": 3.5711544116362028e-06, + "loss": 0.0611, + "step": 4155 + }, + { + "epoch": 0.7314928784948127, + "grad_norm": 0.464222252368927, + "learning_rate": 3.5493681080603903e-06, + "loss": 0.055, + "step": 4160 + }, + { + "epoch": 0.7323720766660805, + "grad_norm": 0.35738715529441833, + "learning_rate": 3.5276341155065864e-06, + "loss": 0.0632, + "step": 4165 + }, + { + "epoch": 0.7332512748373483, + "grad_norm": 0.393279105424881, + "learning_rate": 3.505952610225327e-06, + "loss": 0.0529, + "step": 4170 + }, + { + "epoch": 0.7341304730086161, + "grad_norm": 0.5563225746154785, + "learning_rate": 3.4843237680415153e-06, + "loss": 0.0628, + "step": 4175 + }, + { + "epoch": 0.7350096711798839, + "grad_norm": 0.2666977047920227, + "learning_rate": 3.462747764352974e-06, + "loss": 0.0547, + "step": 4180 + }, + { + "epoch": 0.7358888693511517, + "grad_norm": 0.4078899919986725, + "learning_rate": 3.441224774129055e-06, + "loss": 0.0639, + "step": 4185 + }, + { + "epoch": 0.7367680675224195, + "grad_norm": 0.31975606083869934, + "learning_rate": 3.4197549719091794e-06, + "loss": 0.0628, + "step": 4190 + }, + { + "epoch": 0.7376472656936873, + "grad_norm": 0.295015424489975, + "learning_rate": 3.3983385318014573e-06, + "loss": 0.049, + "step": 4195 + }, + { + "epoch": 0.7385264638649551, + "grad_norm": 0.608062207698822, + "learning_rate": 3.3769756274812526e-06, + "loss": 0.047, + "step": 4200 + }, + { + "epoch": 0.7394056620362229, + "grad_norm": 0.20837105810642242, + "learning_rate": 3.3556664321897914e-06, + "loss": 0.0623, + "step": 4205 + }, + { + "epoch": 0.7402848602074907, + "grad_norm": 0.20374645292758942, + "learning_rate": 3.334411118732744e-06, + "loss": 0.0576, + "step": 4210 + }, + { + "epoch": 0.7411640583787585, + "grad_norm": 0.49013030529022217, + "learning_rate": 3.3132098594788385e-06, + "loss": 0.0632, + "step": 4215 + }, + { + "epoch": 0.7420432565500263, + "grad_norm": 0.3315906822681427, + "learning_rate": 3.2920628263584375e-06, + "loss": 0.0536, + "step": 4220 + }, + { + "epoch": 0.7429224547212941, + "grad_norm": 0.3624337613582611, + "learning_rate": 3.2709701908621726e-06, + "loss": 0.0542, + "step": 4225 + }, + { + "epoch": 0.743801652892562, + "grad_norm": 0.2681962549686432, + "learning_rate": 3.2499321240395387e-06, + "loss": 0.0581, + "step": 4230 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 0.11040078103542328, + "learning_rate": 3.2289487964975074e-06, + "loss": 0.0497, + "step": 4235 + }, + { + "epoch": 0.7455600492350976, + "grad_norm": 0.5186774730682373, + "learning_rate": 3.2080203783991504e-06, + "loss": 0.0594, + "step": 4240 + }, + { + "epoch": 0.7464392474063654, + "grad_norm": 0.4543941617012024, + "learning_rate": 3.1871470394622407e-06, + "loss": 0.0602, + "step": 4245 + }, + { + "epoch": 0.7473184455776332, + "grad_norm": 0.8331423997879028, + "learning_rate": 3.1663289489579054e-06, + "loss": 0.0453, + "step": 4250 + }, + { + "epoch": 0.748197643748901, + "grad_norm": 0.21638324856758118, + "learning_rate": 3.145566275709231e-06, + "loss": 0.0534, + "step": 4255 + }, + { + "epoch": 0.7490768419201688, + "grad_norm": 0.19550643861293793, + "learning_rate": 3.124859188089905e-06, + "loss": 0.0502, + "step": 4260 + }, + { + "epoch": 0.7499560400914366, + "grad_norm": 0.18126316368579865, + "learning_rate": 3.1042078540228358e-06, + "loss": 0.0542, + "step": 4265 + }, + { + "epoch": 0.7508352382627044, + "grad_norm": 0.4952530264854431, + "learning_rate": 3.0836124409788137e-06, + "loss": 0.0518, + "step": 4270 + }, + { + "epoch": 0.7517144364339722, + "grad_norm": 0.7046754956245422, + "learning_rate": 3.063073115975136e-06, + "loss": 0.0575, + "step": 4275 + }, + { + "epoch": 0.75259363460524, + "grad_norm": 0.13164618611335754, + "learning_rate": 3.0425900455742584e-06, + "loss": 0.0475, + "step": 4280 + }, + { + "epoch": 0.7534728327765078, + "grad_norm": 0.21266759932041168, + "learning_rate": 3.022163395882438e-06, + "loss": 0.0532, + "step": 4285 + }, + { + "epoch": 0.7543520309477756, + "grad_norm": 0.6557819247245789, + "learning_rate": 3.0017933325484028e-06, + "loss": 0.0501, + "step": 4290 + }, + { + "epoch": 0.7552312291190434, + "grad_norm": 0.253121554851532, + "learning_rate": 2.981480020761978e-06, + "loss": 0.0568, + "step": 4295 + }, + { + "epoch": 0.7561104272903112, + "grad_norm": 0.26722845435142517, + "learning_rate": 2.9612236252527904e-06, + "loss": 0.0564, + "step": 4300 + }, + { + "epoch": 0.756989625461579, + "grad_norm": 0.1494354009628296, + "learning_rate": 2.941024310288886e-06, + "loss": 0.0577, + "step": 4305 + }, + { + "epoch": 0.7578688236328468, + "grad_norm": 0.16053201258182526, + "learning_rate": 2.9208822396754333e-06, + "loss": 0.0604, + "step": 4310 + }, + { + "epoch": 0.7587480218041146, + "grad_norm": 0.5304045677185059, + "learning_rate": 2.9007975767533714e-06, + "loss": 0.0598, + "step": 4315 + }, + { + "epoch": 0.7596272199753824, + "grad_norm": 0.587253987789154, + "learning_rate": 2.8807704843981e-06, + "loss": 0.0596, + "step": 4320 + }, + { + "epoch": 0.7605064181466502, + "grad_norm": 0.6153156161308289, + "learning_rate": 2.8608011250181544e-06, + "loss": 0.052, + "step": 4325 + }, + { + "epoch": 0.761385616317918, + "grad_norm": 0.7715466618537903, + "learning_rate": 2.8408896605538905e-06, + "loss": 0.0501, + "step": 4330 + }, + { + "epoch": 0.7622648144891858, + "grad_norm": 0.24888671934604645, + "learning_rate": 2.8210362524761557e-06, + "loss": 0.0594, + "step": 4335 + }, + { + "epoch": 0.7631440126604536, + "grad_norm": 0.6384350061416626, + "learning_rate": 2.8012410617850083e-06, + "loss": 0.0491, + "step": 4340 + }, + { + "epoch": 0.7640232108317214, + "grad_norm": 0.5361153483390808, + "learning_rate": 2.7815042490083857e-06, + "loss": 0.053, + "step": 4345 + }, + { + "epoch": 0.7649024090029892, + "grad_norm": 0.3747937083244324, + "learning_rate": 2.7618259742008226e-06, + "loss": 0.0555, + "step": 4350 + }, + { + "epoch": 0.765781607174257, + "grad_norm": 0.2643256187438965, + "learning_rate": 2.7422063969421286e-06, + "loss": 0.0533, + "step": 4355 + }, + { + "epoch": 0.7666608053455248, + "grad_norm": 0.6822528839111328, + "learning_rate": 2.722645676336123e-06, + "loss": 0.057, + "step": 4360 + }, + { + "epoch": 0.7675400035167926, + "grad_norm": 0.2672845423221588, + "learning_rate": 2.7031439710093254e-06, + "loss": 0.058, + "step": 4365 + }, + { + "epoch": 0.7684192016880604, + "grad_norm": 0.1608499139547348, + "learning_rate": 2.683701439109676e-06, + "loss": 0.0573, + "step": 4370 + }, + { + "epoch": 0.7692983998593282, + "grad_norm": 0.2795965373516083, + "learning_rate": 2.6643182383052448e-06, + "loss": 0.0667, + "step": 4375 + }, + { + "epoch": 0.770177598030596, + "grad_norm": 0.11451299488544464, + "learning_rate": 2.644994525782971e-06, + "loss": 0.0527, + "step": 4380 + }, + { + "epoch": 0.7710567962018638, + "grad_norm": 0.48407652974128723, + "learning_rate": 2.625730458247362e-06, + "loss": 0.0572, + "step": 4385 + }, + { + "epoch": 0.7719359943731317, + "grad_norm": 0.5724853277206421, + "learning_rate": 2.606526191919259e-06, + "loss": 0.0432, + "step": 4390 + }, + { + "epoch": 0.7728151925443995, + "grad_norm": 0.6579432487487793, + "learning_rate": 2.5873818825345254e-06, + "loss": 0.0521, + "step": 4395 + }, + { + "epoch": 0.7736943907156673, + "grad_norm": 0.7850777506828308, + "learning_rate": 2.5682976853428264e-06, + "loss": 0.0563, + "step": 4400 + }, + { + "epoch": 0.7745735888869351, + "grad_norm": 0.20982079207897186, + "learning_rate": 2.5492737551063374e-06, + "loss": 0.0587, + "step": 4405 + }, + { + "epoch": 0.7754527870582029, + "grad_norm": 0.20126987993717194, + "learning_rate": 2.5303102460985098e-06, + "loss": 0.0585, + "step": 4410 + }, + { + "epoch": 0.7763319852294708, + "grad_norm": 0.16406214237213135, + "learning_rate": 2.511407312102809e-06, + "loss": 0.0641, + "step": 4415 + }, + { + "epoch": 0.7772111834007386, + "grad_norm": 0.8947567939758301, + "learning_rate": 2.4925651064114788e-06, + "loss": 0.0563, + "step": 4420 + }, + { + "epoch": 0.7780903815720064, + "grad_norm": 0.9534220099449158, + "learning_rate": 2.4737837818242747e-06, + "loss": 0.0472, + "step": 4425 + }, + { + "epoch": 0.7789695797432742, + "grad_norm": 0.24452577531337738, + "learning_rate": 2.455063490647257e-06, + "loss": 0.0545, + "step": 4430 + }, + { + "epoch": 0.779848777914542, + "grad_norm": 0.5126246213912964, + "learning_rate": 2.4364043846915273e-06, + "loss": 0.0502, + "step": 4435 + }, + { + "epoch": 0.7807279760858098, + "grad_norm": 0.3192353844642639, + "learning_rate": 2.4178066152720203e-06, + "loss": 0.0672, + "step": 4440 + }, + { + "epoch": 0.7816071742570776, + "grad_norm": 0.8498159050941467, + "learning_rate": 2.399270333206253e-06, + "loss": 0.0575, + "step": 4445 + }, + { + "epoch": 0.7824863724283454, + "grad_norm": 0.7275409698486328, + "learning_rate": 2.3807956888131213e-06, + "loss": 0.0623, + "step": 4450 + }, + { + "epoch": 0.7833655705996132, + "grad_norm": 0.6221399307250977, + "learning_rate": 2.362382831911675e-06, + "loss": 0.0554, + "step": 4455 + }, + { + "epoch": 0.784244768770881, + "grad_norm": 0.21380391716957092, + "learning_rate": 2.3440319118198997e-06, + "loss": 0.0551, + "step": 4460 + }, + { + "epoch": 0.7851239669421488, + "grad_norm": 0.8011379837989807, + "learning_rate": 2.3257430773535116e-06, + "loss": 0.051, + "step": 4465 + }, + { + "epoch": 0.7860031651134166, + "grad_norm": 0.5205139517784119, + "learning_rate": 2.307516476824738e-06, + "loss": 0.0615, + "step": 4470 + }, + { + "epoch": 0.7868823632846844, + "grad_norm": 0.40472784638404846, + "learning_rate": 2.289352258041133e-06, + "loss": 0.0515, + "step": 4475 + }, + { + "epoch": 0.7877615614559522, + "grad_norm": 0.11419567465782166, + "learning_rate": 2.271250568304366e-06, + "loss": 0.0511, + "step": 4480 + }, + { + "epoch": 0.78864075962722, + "grad_norm": 0.11799798905849457, + "learning_rate": 2.253211554409034e-06, + "loss": 0.0584, + "step": 4485 + }, + { + "epoch": 0.7895199577984878, + "grad_norm": 0.5825390219688416, + "learning_rate": 2.235235362641458e-06, + "loss": 0.052, + "step": 4490 + }, + { + "epoch": 0.7903991559697556, + "grad_norm": 0.14880183339118958, + "learning_rate": 2.2173221387785215e-06, + "loss": 0.0567, + "step": 4495 + }, + { + "epoch": 0.7912783541410234, + "grad_norm": 0.16404080390930176, + "learning_rate": 2.1994720280864567e-06, + "loss": 0.0555, + "step": 4500 + }, + { + "epoch": 0.7921575523122912, + "grad_norm": 0.13647328317165375, + "learning_rate": 2.1816851753197023e-06, + "loss": 0.0443, + "step": 4505 + }, + { + "epoch": 0.793036750483559, + "grad_norm": 0.3101443648338318, + "learning_rate": 2.163961724719693e-06, + "loss": 0.0568, + "step": 4510 + }, + { + "epoch": 0.7939159486548268, + "grad_norm": 0.5262371897697449, + "learning_rate": 2.1463018200137197e-06, + "loss": 0.0541, + "step": 4515 + }, + { + "epoch": 0.7947951468260946, + "grad_norm": 0.15383735299110413, + "learning_rate": 2.128705604413741e-06, + "loss": 0.057, + "step": 4520 + }, + { + "epoch": 0.7956743449973624, + "grad_norm": 0.33696064352989197, + "learning_rate": 2.1111732206152424e-06, + "loss": 0.0541, + "step": 4525 + }, + { + "epoch": 0.7965535431686303, + "grad_norm": 0.37046894431114197, + "learning_rate": 2.093704810796062e-06, + "loss": 0.0677, + "step": 4530 + }, + { + "epoch": 0.797432741339898, + "grad_norm": 0.2881315350532532, + "learning_rate": 2.076300516615252e-06, + "loss": 0.0516, + "step": 4535 + }, + { + "epoch": 0.7983119395111659, + "grad_norm": 0.21759085357189178, + "learning_rate": 2.0589604792119124e-06, + "loss": 0.0604, + "step": 4540 + }, + { + "epoch": 0.7991911376824337, + "grad_norm": 0.6279814839363098, + "learning_rate": 2.0416848392040647e-06, + "loss": 0.0618, + "step": 4545 + }, + { + "epoch": 0.8000703358537015, + "grad_norm": 0.06609740853309631, + "learning_rate": 2.024473736687501e-06, + "loss": 0.0478, + "step": 4550 + }, + { + "epoch": 0.8009495340249693, + "grad_norm": 0.3990488052368164, + "learning_rate": 2.0073273112346526e-06, + "loss": 0.0563, + "step": 4555 + }, + { + "epoch": 0.8018287321962371, + "grad_norm": 0.54508376121521, + "learning_rate": 1.9902457018934496e-06, + "loss": 0.0665, + "step": 4560 + }, + { + "epoch": 0.8027079303675049, + "grad_norm": 0.28929072618484497, + "learning_rate": 1.973229047186206e-06, + "loss": 0.0583, + "step": 4565 + }, + { + "epoch": 0.8035871285387727, + "grad_norm": 0.4705251157283783, + "learning_rate": 1.9562774851084865e-06, + "loss": 0.0639, + "step": 4570 + }, + { + "epoch": 0.8044663267100405, + "grad_norm": 0.10792229324579239, + "learning_rate": 1.9393911531279973e-06, + "loss": 0.0687, + "step": 4575 + }, + { + "epoch": 0.8053455248813083, + "grad_norm": 0.8370270729064941, + "learning_rate": 1.9225701881834524e-06, + "loss": 0.0616, + "step": 4580 + }, + { + "epoch": 0.8062247230525761, + "grad_norm": 0.20940972864627838, + "learning_rate": 1.9058147266834892e-06, + "loss": 0.0588, + "step": 4585 + }, + { + "epoch": 0.8071039212238439, + "grad_norm": 0.43674102425575256, + "learning_rate": 1.8891249045055349e-06, + "loss": 0.0424, + "step": 4590 + }, + { + "epoch": 0.8079831193951117, + "grad_norm": 0.4499530494213104, + "learning_rate": 1.8725008569947366e-06, + "loss": 0.0583, + "step": 4595 + }, + { + "epoch": 0.8088623175663795, + "grad_norm": 0.16668100655078888, + "learning_rate": 1.8559427189628277e-06, + "loss": 0.0604, + "step": 4600 + }, + { + "epoch": 0.8097415157376473, + "grad_norm": 0.20123635232448578, + "learning_rate": 1.8394506246870635e-06, + "loss": 0.0561, + "step": 4605 + }, + { + "epoch": 0.8106207139089151, + "grad_norm": 0.2664503753185272, + "learning_rate": 1.8230247079091146e-06, + "loss": 0.053, + "step": 4610 + }, + { + "epoch": 0.8114999120801829, + "grad_norm": 0.7137978076934814, + "learning_rate": 1.8066651018339943e-06, + "loss": 0.0572, + "step": 4615 + }, + { + "epoch": 0.8123791102514507, + "grad_norm": 0.5590645670890808, + "learning_rate": 1.790371939128972e-06, + "loss": 0.0616, + "step": 4620 + }, + { + "epoch": 0.8132583084227185, + "grad_norm": 0.08397898077964783, + "learning_rate": 1.7741453519224982e-06, + "loss": 0.058, + "step": 4625 + }, + { + "epoch": 0.8141375065939863, + "grad_norm": 0.33352193236351013, + "learning_rate": 1.7579854718031285e-06, + "loss": 0.0517, + "step": 4630 + }, + { + "epoch": 0.8150167047652541, + "grad_norm": 0.20153024792671204, + "learning_rate": 1.741892429818468e-06, + "loss": 0.0547, + "step": 4635 + }, + { + "epoch": 0.8158959029365219, + "grad_norm": 0.8683350086212158, + "learning_rate": 1.7258663564740996e-06, + "loss": 0.0618, + "step": 4640 + }, + { + "epoch": 0.8167751011077897, + "grad_norm": 0.19968271255493164, + "learning_rate": 1.7099073817325307e-06, + "loss": 0.0568, + "step": 4645 + }, + { + "epoch": 0.8176542992790575, + "grad_norm": 0.3933415412902832, + "learning_rate": 1.6940156350121273e-06, + "loss": 0.0622, + "step": 4650 + }, + { + "epoch": 0.8185334974503253, + "grad_norm": 0.8188037872314453, + "learning_rate": 1.6781912451860827e-06, + "loss": 0.0645, + "step": 4655 + }, + { + "epoch": 0.8194126956215931, + "grad_norm": 0.6926817893981934, + "learning_rate": 1.6624343405813615e-06, + "loss": 0.0561, + "step": 4660 + }, + { + "epoch": 0.8202918937928609, + "grad_norm": 0.5045862197875977, + "learning_rate": 1.6467450489776581e-06, + "loss": 0.0668, + "step": 4665 + }, + { + "epoch": 0.8211710919641287, + "grad_norm": 0.5935778617858887, + "learning_rate": 1.6311234976063694e-06, + "loss": 0.0575, + "step": 4670 + }, + { + "epoch": 0.8220502901353965, + "grad_norm": 0.3604169189929962, + "learning_rate": 1.6155698131495457e-06, + "loss": 0.0543, + "step": 4675 + }, + { + "epoch": 0.8229294883066643, + "grad_norm": 0.24668653309345245, + "learning_rate": 1.6000841217388864e-06, + "loss": 0.057, + "step": 4680 + }, + { + "epoch": 0.8238086864779322, + "grad_norm": 0.33264681696891785, + "learning_rate": 1.5846665489546964e-06, + "loss": 0.0572, + "step": 4685 + }, + { + "epoch": 0.8246878846492, + "grad_norm": 0.6432152986526489, + "learning_rate": 1.5693172198248863e-06, + "loss": 0.0604, + "step": 4690 + }, + { + "epoch": 0.8255670828204678, + "grad_norm": 0.4105263948440552, + "learning_rate": 1.5540362588239366e-06, + "loss": 0.0701, + "step": 4695 + }, + { + "epoch": 0.8264462809917356, + "grad_norm": 0.25226283073425293, + "learning_rate": 1.5388237898719105e-06, + "loss": 0.0534, + "step": 4700 + }, + { + "epoch": 0.8273254791630034, + "grad_norm": 0.2534274160861969, + "learning_rate": 1.5236799363334298e-06, + "loss": 0.0535, + "step": 4705 + }, + { + "epoch": 0.8282046773342712, + "grad_norm": 0.29621097445487976, + "learning_rate": 1.508604821016698e-06, + "loss": 0.0499, + "step": 4710 + }, + { + "epoch": 0.829083875505539, + "grad_norm": 0.38203129172325134, + "learning_rate": 1.4935985661724727e-06, + "loss": 0.0539, + "step": 4715 + }, + { + "epoch": 0.8299630736768068, + "grad_norm": 0.9345189929008484, + "learning_rate": 1.4786612934931055e-06, + "loss": 0.0578, + "step": 4720 + }, + { + "epoch": 0.8308422718480746, + "grad_norm": 0.21688897907733917, + "learning_rate": 1.463793124111531e-06, + "loss": 0.0427, + "step": 4725 + }, + { + "epoch": 0.8317214700193424, + "grad_norm": 0.22990085184574127, + "learning_rate": 1.4489941786003004e-06, + "loss": 0.0441, + "step": 4730 + }, + { + "epoch": 0.8326006681906102, + "grad_norm": 0.4832035005092621, + "learning_rate": 1.4342645769705977e-06, + "loss": 0.0588, + "step": 4735 + }, + { + "epoch": 0.833479866361878, + "grad_norm": 0.40238794684410095, + "learning_rate": 1.419604438671267e-06, + "loss": 0.0519, + "step": 4740 + }, + { + "epoch": 0.8343590645331458, + "grad_norm": 0.5739127397537231, + "learning_rate": 1.405013882587839e-06, + "loss": 0.0637, + "step": 4745 + }, + { + "epoch": 0.8352382627044136, + "grad_norm": 0.3309503495693207, + "learning_rate": 1.3904930270415763e-06, + "loss": 0.0506, + "step": 4750 + }, + { + "epoch": 0.8361174608756814, + "grad_norm": 0.09466574341058731, + "learning_rate": 1.376041989788508e-06, + "loss": 0.0524, + "step": 4755 + }, + { + "epoch": 0.8369966590469492, + "grad_norm": 0.27358055114746094, + "learning_rate": 1.3616608880184768e-06, + "loss": 0.0545, + "step": 4760 + }, + { + "epoch": 0.837875857218217, + "grad_norm": 0.26218411326408386, + "learning_rate": 1.3473498383541817e-06, + "loss": 0.0467, + "step": 4765 + }, + { + "epoch": 0.8387550553894848, + "grad_norm": 0.3839583992958069, + "learning_rate": 1.3331089568502465e-06, + "loss": 0.043, + "step": 4770 + }, + { + "epoch": 0.8396342535607526, + "grad_norm": 0.37673670053482056, + "learning_rate": 1.3189383589922667e-06, + "loss": 0.0634, + "step": 4775 + }, + { + "epoch": 0.8405134517320204, + "grad_norm": 0.3072827160358429, + "learning_rate": 1.304838159695877e-06, + "loss": 0.0684, + "step": 4780 + }, + { + "epoch": 0.8413926499032882, + "grad_norm": 0.7255908250808716, + "learning_rate": 1.290808473305817e-06, + "loss": 0.0545, + "step": 4785 + }, + { + "epoch": 0.842271848074556, + "grad_norm": 0.4259006381034851, + "learning_rate": 1.2768494135950093e-06, + "loss": 0.0516, + "step": 4790 + }, + { + "epoch": 0.8431510462458238, + "grad_norm": 0.6573328971862793, + "learning_rate": 1.2629610937636284e-06, + "loss": 0.0494, + "step": 4795 + }, + { + "epoch": 0.8440302444170916, + "grad_norm": 0.09125727415084839, + "learning_rate": 1.2491436264381984e-06, + "loss": 0.0621, + "step": 4800 + }, + { + "epoch": 0.8449094425883594, + "grad_norm": 0.4536990821361542, + "learning_rate": 1.2353971236706564e-06, + "loss": 0.0506, + "step": 4805 + }, + { + "epoch": 0.8457886407596272, + "grad_norm": 0.47011682391166687, + "learning_rate": 1.2217216969374669e-06, + "loss": 0.0582, + "step": 4810 + }, + { + "epoch": 0.846667838930895, + "grad_norm": 0.15771393477916718, + "learning_rate": 1.208117457138699e-06, + "loss": 0.0612, + "step": 4815 + }, + { + "epoch": 0.8475470371021628, + "grad_norm": 1.0707141160964966, + "learning_rate": 1.1945845145971414e-06, + "loss": 0.0597, + "step": 4820 + }, + { + "epoch": 0.8484262352734306, + "grad_norm": 0.381056547164917, + "learning_rate": 1.1811229790573996e-06, + "loss": 0.0678, + "step": 4825 + }, + { + "epoch": 0.8493054334446984, + "grad_norm": 0.33800312876701355, + "learning_rate": 1.1677329596850117e-06, + "loss": 0.0516, + "step": 4830 + }, + { + "epoch": 0.8501846316159662, + "grad_norm": 0.19112703204154968, + "learning_rate": 1.1544145650655514e-06, + "loss": 0.062, + "step": 4835 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.4575953483581543, + "learning_rate": 1.1411679032037636e-06, + "loss": 0.0542, + "step": 4840 + }, + { + "epoch": 0.8519430279585019, + "grad_norm": 0.6570442914962769, + "learning_rate": 1.127993081522678e-06, + "loss": 0.0567, + "step": 4845 + }, + { + "epoch": 0.8528222261297697, + "grad_norm": 0.4207151234149933, + "learning_rate": 1.114890206862742e-06, + "loss": 0.0595, + "step": 4850 + }, + { + "epoch": 0.8537014243010375, + "grad_norm": 0.183942511677742, + "learning_rate": 1.1018593854809478e-06, + "loss": 0.0537, + "step": 4855 + }, + { + "epoch": 0.8545806224723053, + "grad_norm": 0.5328112840652466, + "learning_rate": 1.0889007230499805e-06, + "loss": 0.0598, + "step": 4860 + }, + { + "epoch": 0.8554598206435731, + "grad_norm": 0.1587759107351303, + "learning_rate": 1.0760143246573552e-06, + "loss": 0.0607, + "step": 4865 + }, + { + "epoch": 0.8563390188148409, + "grad_norm": 0.3230392336845398, + "learning_rate": 1.0632002948045672e-06, + "loss": 0.0434, + "step": 4870 + }, + { + "epoch": 0.8572182169861087, + "grad_norm": 0.6485795378684998, + "learning_rate": 1.0504587374062392e-06, + "loss": 0.0518, + "step": 4875 + }, + { + "epoch": 0.8580974151573765, + "grad_norm": 0.20032288134098053, + "learning_rate": 1.037789755789289e-06, + "loss": 0.047, + "step": 4880 + }, + { + "epoch": 0.8589766133286443, + "grad_norm": 0.2695741355419159, + "learning_rate": 1.025193452692076e-06, + "loss": 0.0544, + "step": 4885 + }, + { + "epoch": 0.8598558114999121, + "grad_norm": 0.4286085069179535, + "learning_rate": 1.0126699302635901e-06, + "loss": 0.0749, + "step": 4890 + }, + { + "epoch": 0.8607350096711799, + "grad_norm": 0.21349282562732697, + "learning_rate": 1.0002192900626028e-06, + "loss": 0.0598, + "step": 4895 + }, + { + "epoch": 0.8616142078424477, + "grad_norm": 0.16585257649421692, + "learning_rate": 9.878416330568486e-07, + "loss": 0.056, + "step": 4900 + }, + { + "epoch": 0.8624934060137155, + "grad_norm": 0.723551332950592, + "learning_rate": 9.75537059622218e-07, + "loss": 0.0531, + "step": 4905 + }, + { + "epoch": 0.8633726041849833, + "grad_norm": 0.7833985090255737, + "learning_rate": 9.633056695419229e-07, + "loss": 0.0499, + "step": 4910 + }, + { + "epoch": 0.8642518023562511, + "grad_norm": 0.18503841757774353, + "learning_rate": 9.511475620057132e-07, + "loss": 0.0432, + "step": 4915 + }, + { + "epoch": 0.8651310005275189, + "grad_norm": 0.25365808606147766, + "learning_rate": 9.390628356090459e-07, + "loss": 0.0677, + "step": 4920 + }, + { + "epoch": 0.8660101986987867, + "grad_norm": 0.6311047673225403, + "learning_rate": 9.270515883523057e-07, + "loss": 0.0642, + "step": 4925 + }, + { + "epoch": 0.8668893968700545, + "grad_norm": 0.13977408409118652, + "learning_rate": 9.15113917639997e-07, + "loss": 0.0538, + "step": 4930 + }, + { + "epoch": 0.8677685950413223, + "grad_norm": 0.3203240931034088, + "learning_rate": 9.032499202799627e-07, + "loss": 0.0535, + "step": 4935 + }, + { + "epoch": 0.8686477932125901, + "grad_norm": 0.5966960191726685, + "learning_rate": 8.914596924825958e-07, + "loss": 0.0485, + "step": 4940 + }, + { + "epoch": 0.8695269913838579, + "grad_norm": 0.4596826732158661, + "learning_rate": 8.797433298600622e-07, + "loss": 0.0659, + "step": 4945 + }, + { + "epoch": 0.8704061895551257, + "grad_norm": 0.757762610912323, + "learning_rate": 8.681009274255136e-07, + "loss": 0.0639, + "step": 4950 + }, + { + "epoch": 0.8712853877263935, + "grad_norm": 0.2202068269252777, + "learning_rate": 8.56532579592334e-07, + "loss": 0.0493, + "step": 4955 + }, + { + "epoch": 0.8721645858976613, + "grad_norm": 0.655022144317627, + "learning_rate": 8.450383801733642e-07, + "loss": 0.0631, + "step": 4960 + }, + { + "epoch": 0.8730437840689291, + "grad_norm": 0.5713546872138977, + "learning_rate": 8.336184223801424e-07, + "loss": 0.0592, + "step": 4965 + }, + { + "epoch": 0.8739229822401969, + "grad_norm": 0.4096655249595642, + "learning_rate": 8.222727988221469e-07, + "loss": 0.0644, + "step": 4970 + }, + { + "epoch": 0.8748021804114647, + "grad_norm": 0.3128865957260132, + "learning_rate": 8.110016015060484e-07, + "loss": 0.059, + "step": 4975 + }, + { + "epoch": 0.8756813785827325, + "grad_norm": 0.1819022297859192, + "learning_rate": 7.998049218349624e-07, + "loss": 0.0547, + "step": 4980 + }, + { + "epoch": 0.8765605767540003, + "grad_norm": 0.2281774878501892, + "learning_rate": 7.886828506077105e-07, + "loss": 0.0584, + "step": 4985 + }, + { + "epoch": 0.8774397749252681, + "grad_norm": 0.29507550597190857, + "learning_rate": 7.776354780180739e-07, + "loss": 0.0523, + "step": 4990 + }, + { + "epoch": 0.878318973096536, + "grad_norm": 0.1599227786064148, + "learning_rate": 7.666628936540776e-07, + "loss": 0.0597, + "step": 4995 + }, + { + "epoch": 0.8791981712678038, + "grad_norm": 0.33402958512306213, + "learning_rate": 7.557651864972504e-07, + "loss": 0.048, + "step": 5000 + }, + { + "epoch": 0.8800773694390716, + "grad_norm": 0.8994088172912598, + "learning_rate": 7.449424449219144e-07, + "loss": 0.0602, + "step": 5005 + }, + { + "epoch": 0.8809565676103394, + "grad_norm": 0.392220139503479, + "learning_rate": 7.341947566944563e-07, + "loss": 0.0438, + "step": 5010 + }, + { + "epoch": 0.8818357657816072, + "grad_norm": 0.3253031373023987, + "learning_rate": 7.23522208972628e-07, + "loss": 0.0568, + "step": 5015 + }, + { + "epoch": 0.882714963952875, + "grad_norm": 0.41497498750686646, + "learning_rate": 7.129248883048278e-07, + "loss": 0.0453, + "step": 5020 + }, + { + "epoch": 0.8835941621241428, + "grad_norm": 0.4564589560031891, + "learning_rate": 7.024028806294092e-07, + "loss": 0.0559, + "step": 5025 + }, + { + "epoch": 0.8844733602954106, + "grad_norm": 0.6449925303459167, + "learning_rate": 6.91956271273978e-07, + "loss": 0.047, + "step": 5030 + }, + { + "epoch": 0.8853525584666784, + "grad_norm": 0.3050267994403839, + "learning_rate": 6.815851449547029e-07, + "loss": 0.0583, + "step": 5035 + }, + { + "epoch": 0.8862317566379462, + "grad_norm": 0.13408750295639038, + "learning_rate": 6.712895857756229e-07, + "loss": 0.0434, + "step": 5040 + }, + { + "epoch": 0.887110954809214, + "grad_norm": 0.2986939549446106, + "learning_rate": 6.610696772279757e-07, + "loss": 0.0594, + "step": 5045 + }, + { + "epoch": 0.8879901529804818, + "grad_norm": 0.6257616877555847, + "learning_rate": 6.509255021895111e-07, + "loss": 0.0621, + "step": 5050 + }, + { + "epoch": 0.8888693511517496, + "grad_norm": 0.2986117899417877, + "learning_rate": 6.408571429238253e-07, + "loss": 0.0505, + "step": 5055 + }, + { + "epoch": 0.8897485493230174, + "grad_norm": 0.15408103168010712, + "learning_rate": 6.308646810796836e-07, + "loss": 0.0534, + "step": 5060 + }, + { + "epoch": 0.8906277474942852, + "grad_norm": 0.4512818157672882, + "learning_rate": 6.209481976903752e-07, + "loss": 0.0433, + "step": 5065 + }, + { + "epoch": 0.891506945665553, + "grad_norm": 0.41841113567352295, + "learning_rate": 6.111077731730408e-07, + "loss": 0.0697, + "step": 5070 + }, + { + "epoch": 0.8923861438368208, + "grad_norm": 0.472064346075058, + "learning_rate": 6.013434873280288e-07, + "loss": 0.058, + "step": 5075 + }, + { + "epoch": 0.8932653420080886, + "grad_norm": 0.20314988493919373, + "learning_rate": 5.916554193382418e-07, + "loss": 0.0456, + "step": 5080 + }, + { + "epoch": 0.8941445401793564, + "grad_norm": 0.1879410743713379, + "learning_rate": 5.820436477685021e-07, + "loss": 0.0506, + "step": 5085 + }, + { + "epoch": 0.8950237383506242, + "grad_norm": 0.4098430573940277, + "learning_rate": 5.72508250564906e-07, + "loss": 0.0528, + "step": 5090 + }, + { + "epoch": 0.895902936521892, + "grad_norm": 0.5645979642868042, + "learning_rate": 5.63049305054204e-07, + "loss": 0.0689, + "step": 5095 + }, + { + "epoch": 0.8967821346931598, + "grad_norm": 0.38696274161338806, + "learning_rate": 5.536668879431584e-07, + "loss": 0.0621, + "step": 5100 + }, + { + "epoch": 0.8976613328644276, + "grad_norm": 0.18984673917293549, + "learning_rate": 5.44361075317934e-07, + "loss": 0.0574, + "step": 5105 + }, + { + "epoch": 0.8985405310356954, + "grad_norm": 0.1238960400223732, + "learning_rate": 5.35131942643472e-07, + "loss": 0.0522, + "step": 5110 + }, + { + "epoch": 0.8994197292069632, + "grad_norm": 0.47579723596572876, + "learning_rate": 5.259795647628818e-07, + "loss": 0.0437, + "step": 5115 + }, + { + "epoch": 0.900298927378231, + "grad_norm": 0.0708310604095459, + "learning_rate": 5.169040158968431e-07, + "loss": 0.057, + "step": 5120 + }, + { + "epoch": 0.9011781255494988, + "grad_norm": 0.24418748915195465, + "learning_rate": 5.079053696429837e-07, + "loss": 0.054, + "step": 5125 + }, + { + "epoch": 0.9020573237207666, + "grad_norm": 0.4015823304653168, + "learning_rate": 4.989836989753005e-07, + "loss": 0.0472, + "step": 5130 + }, + { + "epoch": 0.9029365218920344, + "grad_norm": 0.09180589020252228, + "learning_rate": 4.901390762435588e-07, + "loss": 0.0565, + "step": 5135 + }, + { + "epoch": 0.9038157200633022, + "grad_norm": 0.23137515783309937, + "learning_rate": 4.813715731727098e-07, + "loss": 0.0594, + "step": 5140 + }, + { + "epoch": 0.90469491823457, + "grad_norm": 0.42869314551353455, + "learning_rate": 4.726812608623077e-07, + "loss": 0.0578, + "step": 5145 + }, + { + "epoch": 0.9055741164058378, + "grad_norm": 0.28821223974227905, + "learning_rate": 4.640682097859317e-07, + "loss": 0.0608, + "step": 5150 + }, + { + "epoch": 0.9064533145771057, + "grad_norm": 0.8055384755134583, + "learning_rate": 4.555324897906133e-07, + "loss": 0.0635, + "step": 5155 + }, + { + "epoch": 0.9073325127483735, + "grad_norm": 0.12101097404956818, + "learning_rate": 4.470741700962777e-07, + "loss": 0.0559, + "step": 5160 + }, + { + "epoch": 0.9082117109196413, + "grad_norm": 0.4471381902694702, + "learning_rate": 4.3869331929517144e-07, + "loss": 0.055, + "step": 5165 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.554818332195282, + "learning_rate": 4.303900053513166e-07, + "loss": 0.0565, + "step": 5170 + }, + { + "epoch": 0.9099701072621769, + "grad_norm": 0.6611088514328003, + "learning_rate": 4.2216429559994945e-07, + "loss": 0.0556, + "step": 5175 + }, + { + "epoch": 0.9108493054334447, + "grad_norm": 0.1738196760416031, + "learning_rate": 4.1401625674698186e-07, + "loss": 0.0574, + "step": 5180 + }, + { + "epoch": 0.9117285036047125, + "grad_norm": 0.19940215349197388, + "learning_rate": 4.0594595486845964e-07, + "loss": 0.0598, + "step": 5185 + }, + { + "epoch": 0.9126077017759803, + "grad_norm": 0.6054111123085022, + "learning_rate": 3.9795345541002395e-07, + "loss": 0.0467, + "step": 5190 + }, + { + "epoch": 0.9134868999472481, + "grad_norm": 0.5578285455703735, + "learning_rate": 3.9003882318638053e-07, + "loss": 0.0577, + "step": 5195 + }, + { + "epoch": 0.9143660981185159, + "grad_norm": 0.12331661581993103, + "learning_rate": 3.8220212238077703e-07, + "loss": 0.0632, + "step": 5200 + }, + { + "epoch": 0.9152452962897837, + "grad_norm": 0.4738437831401825, + "learning_rate": 3.744434165444788e-07, + "loss": 0.0619, + "step": 5205 + }, + { + "epoch": 0.9161244944610515, + "grad_norm": 0.35755178332328796, + "learning_rate": 3.667627685962605e-07, + "loss": 0.059, + "step": 5210 + }, + { + "epoch": 0.9170036926323193, + "grad_norm": 0.45652657747268677, + "learning_rate": 3.591602408218842e-07, + "loss": 0.0543, + "step": 5215 + }, + { + "epoch": 0.9178828908035871, + "grad_norm": 0.2660428583621979, + "learning_rate": 3.516358948736065e-07, + "loss": 0.0526, + "step": 5220 + }, + { + "epoch": 0.9187620889748549, + "grad_norm": 0.21268558502197266, + "learning_rate": 3.441897917696679e-07, + "loss": 0.0644, + "step": 5225 + }, + { + "epoch": 0.9196412871461227, + "grad_norm": 0.6168246865272522, + "learning_rate": 3.368219918938076e-07, + "loss": 0.0512, + "step": 5230 + }, + { + "epoch": 0.9205204853173905, + "grad_norm": 0.5585867762565613, + "learning_rate": 3.29532554994767e-07, + "loss": 0.0612, + "step": 5235 + }, + { + "epoch": 0.9213996834886583, + "grad_norm": 0.6709286570549011, + "learning_rate": 3.223215401858115e-07, + "loss": 0.047, + "step": 5240 + }, + { + "epoch": 0.9222788816599261, + "grad_norm": 0.2702469527721405, + "learning_rate": 3.151890059442386e-07, + "loss": 0.0445, + "step": 5245 + }, + { + "epoch": 0.9231580798311939, + "grad_norm": 0.6882309913635254, + "learning_rate": 3.081350101109215e-07, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.9240372780024617, + "grad_norm": 0.3388536274433136, + "learning_rate": 3.0115960988982504e-07, + "loss": 0.0525, + "step": 5255 + }, + { + "epoch": 0.9249164761737295, + "grad_norm": 0.3622197210788727, + "learning_rate": 2.942628618475507e-07, + "loss": 0.0565, + "step": 5260 + }, + { + "epoch": 0.9257956743449973, + "grad_norm": 0.519477903842926, + "learning_rate": 2.8744482191287113e-07, + "loss": 0.0605, + "step": 5265 + }, + { + "epoch": 0.9266748725162651, + "grad_norm": 0.3441823422908783, + "learning_rate": 2.8070554537628413e-07, + "loss": 0.0545, + "step": 5270 + }, + { + "epoch": 0.9275540706875329, + "grad_norm": 0.2639375329017639, + "learning_rate": 2.7404508688955835e-07, + "loss": 0.0402, + "step": 5275 + }, + { + "epoch": 0.9284332688588007, + "grad_norm": 0.35200339555740356, + "learning_rate": 2.674635004652926e-07, + "loss": 0.057, + "step": 5280 + }, + { + "epoch": 0.9293124670300685, + "grad_norm": 0.30402621626853943, + "learning_rate": 2.609608394764751e-07, + "loss": 0.0508, + "step": 5285 + }, + { + "epoch": 0.9301916652013363, + "grad_norm": 0.6213688254356384, + "learning_rate": 2.5453715665605725e-07, + "loss": 0.0652, + "step": 5290 + }, + { + "epoch": 0.9310708633726041, + "grad_norm": 0.24661576747894287, + "learning_rate": 2.4819250409651605e-07, + "loss": 0.0398, + "step": 5295 + }, + { + "epoch": 0.9319500615438719, + "grad_norm": 0.13059848546981812, + "learning_rate": 2.419269332494434e-07, + "loss": 0.0586, + "step": 5300 + }, + { + "epoch": 0.9328292597151397, + "grad_norm": 0.4985171854496002, + "learning_rate": 2.3574049492511852e-07, + "loss": 0.0649, + "step": 5305 + }, + { + "epoch": 0.9337084578864076, + "grad_norm": 0.5534043312072754, + "learning_rate": 2.296332392921019e-07, + "loss": 0.0564, + "step": 5310 + }, + { + "epoch": 0.9345876560576754, + "grad_norm": 0.386444091796875, + "learning_rate": 2.2360521587682316e-07, + "loss": 0.0646, + "step": 5315 + }, + { + "epoch": 0.9354668542289432, + "grad_norm": 0.22402134537696838, + "learning_rate": 2.176564735631881e-07, + "loss": 0.0614, + "step": 5320 + }, + { + "epoch": 0.936346052400211, + "grad_norm": 0.6845077872276306, + "learning_rate": 2.1178706059217346e-07, + "loss": 0.0639, + "step": 5325 + }, + { + "epoch": 0.9372252505714788, + "grad_norm": 0.5488569736480713, + "learning_rate": 2.0599702456144178e-07, + "loss": 0.0518, + "step": 5330 + }, + { + "epoch": 0.9381044487427466, + "grad_norm": 0.6429914832115173, + "learning_rate": 2.002864124249504e-07, + "loss": 0.0618, + "step": 5335 + }, + { + "epoch": 0.9389836469140144, + "grad_norm": 0.8213487267494202, + "learning_rate": 1.9465527049257416e-07, + "loss": 0.0623, + "step": 5340 + }, + { + "epoch": 0.9398628450852822, + "grad_norm": 0.6064088940620422, + "learning_rate": 1.8910364442972896e-07, + "loss": 0.068, + "step": 5345 + }, + { + "epoch": 0.94074204325655, + "grad_norm": 0.9644356966018677, + "learning_rate": 1.8363157925700316e-07, + "loss": 0.0581, + "step": 5350 + }, + { + "epoch": 0.9416212414278178, + "grad_norm": 0.7088313102722168, + "learning_rate": 1.78239119349789e-07, + "loss": 0.0469, + "step": 5355 + }, + { + "epoch": 0.9425004395990856, + "grad_norm": 0.14932291209697723, + "learning_rate": 1.7292630843792292e-07, + "loss": 0.0584, + "step": 5360 + }, + { + "epoch": 0.9433796377703534, + "grad_norm": 0.18209953606128693, + "learning_rate": 1.6769318960533465e-07, + "loss": 0.0526, + "step": 5365 + }, + { + "epoch": 0.9442588359416212, + "grad_norm": 0.5487361550331116, + "learning_rate": 1.625398052896965e-07, + "loss": 0.0551, + "step": 5370 + }, + { + "epoch": 0.945138034112889, + "grad_norm": 0.1913604438304901, + "learning_rate": 1.574661972820779e-07, + "loss": 0.0556, + "step": 5375 + }, + { + "epoch": 0.9460172322841568, + "grad_norm": 0.44386959075927734, + "learning_rate": 1.5247240672660258e-07, + "loss": 0.0633, + "step": 5380 + }, + { + "epoch": 0.9468964304554246, + "grad_norm": 0.3759801685810089, + "learning_rate": 1.4755847412012635e-07, + "loss": 0.0557, + "step": 5385 + }, + { + "epoch": 0.9477756286266924, + "grad_norm": 0.19469892978668213, + "learning_rate": 1.427244393118965e-07, + "loss": 0.0591, + "step": 5390 + }, + { + "epoch": 0.9486548267979603, + "grad_norm": 0.5927003026008606, + "learning_rate": 1.379703415032374e-07, + "loss": 0.045, + "step": 5395 + }, + { + "epoch": 0.9495340249692281, + "grad_norm": 0.13490422070026398, + "learning_rate": 1.3329621924722536e-07, + "loss": 0.0437, + "step": 5400 + }, + { + "epoch": 0.9504132231404959, + "grad_norm": 0.3926790654659271, + "learning_rate": 1.287021104483821e-07, + "loss": 0.053, + "step": 5405 + }, + { + "epoch": 0.9512924213117637, + "grad_norm": 0.5249255299568176, + "learning_rate": 1.2418805236236287e-07, + "loss": 0.06, + "step": 5410 + }, + { + "epoch": 0.9521716194830315, + "grad_norm": 1.2005724906921387, + "learning_rate": 1.1975408159566105e-07, + "loss": 0.0479, + "step": 5415 + }, + { + "epoch": 0.9530508176542993, + "grad_norm": 0.2839159667491913, + "learning_rate": 1.1540023410529844e-07, + "loss": 0.0623, + "step": 5420 + }, + { + "epoch": 0.9539300158255671, + "grad_norm": 0.5383673906326294, + "learning_rate": 1.1112654519855104e-07, + "loss": 0.0596, + "step": 5425 + }, + { + "epoch": 0.9548092139968349, + "grad_norm": 0.595245361328125, + "learning_rate": 1.0693304953264705e-07, + "loss": 0.0661, + "step": 5430 + }, + { + "epoch": 0.9556884121681027, + "grad_norm": 0.5355046391487122, + "learning_rate": 1.0281978111449375e-07, + "loss": 0.0715, + "step": 5435 + }, + { + "epoch": 0.9565676103393705, + "grad_norm": 0.31870976090431213, + "learning_rate": 9.87867733004011e-08, + "loss": 0.053, + "step": 5440 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 0.6525245308876038, + "learning_rate": 9.483405879581187e-08, + "loss": 0.0519, + "step": 5445 + }, + { + "epoch": 0.9583260066819062, + "grad_norm": 0.6109974384307861, + "learning_rate": 9.096166965502972e-08, + "loss": 0.0583, + "step": 5450 + }, + { + "epoch": 0.959205204853174, + "grad_norm": 0.22431720793247223, + "learning_rate": 8.71696372809705e-08, + "loss": 0.0486, + "step": 5455 + }, + { + "epoch": 0.9600844030244418, + "grad_norm": 0.5329923033714294, + "learning_rate": 8.345799242489905e-08, + "loss": 0.0525, + "step": 5460 + }, + { + "epoch": 0.9609636011957096, + "grad_norm": 0.4853318929672241, + "learning_rate": 7.982676518618059e-08, + "loss": 0.0597, + "step": 5465 + }, + { + "epoch": 0.9618427993669774, + "grad_norm": 0.3214046061038971, + "learning_rate": 7.627598501204092e-08, + "loss": 0.05, + "step": 5470 + }, + { + "epoch": 0.9627219975382452, + "grad_norm": 0.18154819309711456, + "learning_rate": 7.28056806973243e-08, + "loss": 0.0484, + "step": 5475 + }, + { + "epoch": 0.963601195709513, + "grad_norm": 0.14486095309257507, + "learning_rate": 6.941588038426039e-08, + "loss": 0.0586, + "step": 5480 + }, + { + "epoch": 0.9644803938807808, + "grad_norm": 0.6655594706535339, + "learning_rate": 6.610661156223664e-08, + "loss": 0.0641, + "step": 5485 + }, + { + "epoch": 0.9653595920520486, + "grad_norm": 0.22004252672195435, + "learning_rate": 6.287790106757396e-08, + "loss": 0.0483, + "step": 5490 + }, + { + "epoch": 0.9662387902233164, + "grad_norm": 0.20186270773410797, + "learning_rate": 5.972977508331368e-08, + "loss": 0.0528, + "step": 5495 + }, + { + "epoch": 0.9671179883945842, + "grad_norm": 0.23505160212516785, + "learning_rate": 5.666225913899648e-08, + "loss": 0.0663, + "step": 5500 + }, + { + "epoch": 0.967997186565852, + "grad_norm": 0.423562616109848, + "learning_rate": 5.367537811046486e-08, + "loss": 0.0516, + "step": 5505 + }, + { + "epoch": 0.9688763847371198, + "grad_norm": 0.23930394649505615, + "learning_rate": 5.0769156219656614e-08, + "loss": 0.0572, + "step": 5510 + }, + { + "epoch": 0.9697555829083876, + "grad_norm": 0.1267559826374054, + "learning_rate": 4.7943617034407196e-08, + "loss": 0.0411, + "step": 5515 + }, + { + "epoch": 0.9706347810796554, + "grad_norm": 0.5073260068893433, + "learning_rate": 4.51987834682599e-08, + "loss": 0.052, + "step": 5520 + }, + { + "epoch": 0.9715139792509232, + "grad_norm": 0.3135651648044586, + "learning_rate": 4.253467778028486e-08, + "loss": 0.0547, + "step": 5525 + }, + { + "epoch": 0.972393177422191, + "grad_norm": 0.5157844424247742, + "learning_rate": 3.9951321574890345e-08, + "loss": 0.0463, + "step": 5530 + }, + { + "epoch": 0.9732723755934588, + "grad_norm": 0.16779236495494843, + "learning_rate": 3.744873580165176e-08, + "loss": 0.059, + "step": 5535 + }, + { + "epoch": 0.9741515737647266, + "grad_norm": 0.26359498500823975, + "learning_rate": 3.502694075514179e-08, + "loss": 0.0567, + "step": 5540 + }, + { + "epoch": 0.9750307719359944, + "grad_norm": 0.6399820446968079, + "learning_rate": 3.26859560747661e-08, + "loss": 0.0544, + "step": 5545 + }, + { + "epoch": 0.9759099701072622, + "grad_norm": 0.49394690990448, + "learning_rate": 3.042580074460344e-08, + "loss": 0.0649, + "step": 5550 + }, + { + "epoch": 0.97678916827853, + "grad_norm": 0.11140932142734528, + "learning_rate": 2.8246493093250226e-08, + "loss": 0.0572, + "step": 5555 + }, + { + "epoch": 0.9776683664497978, + "grad_norm": 0.803503155708313, + "learning_rate": 2.6148050793676217e-08, + "loss": 0.0583, + "step": 5560 + }, + { + "epoch": 0.9785475646210656, + "grad_norm": 0.20883022248744965, + "learning_rate": 2.4130490863075727e-08, + "loss": 0.0631, + "step": 5565 + }, + { + "epoch": 0.9794267627923334, + "grad_norm": 0.5355504155158997, + "learning_rate": 2.2193829662731093e-08, + "loss": 0.0548, + "step": 5570 + }, + { + "epoch": 0.9803059609636012, + "grad_norm": 0.5118641257286072, + "learning_rate": 2.033808289788608e-08, + "loss": 0.0551, + "step": 5575 + }, + { + "epoch": 0.981185159134869, + "grad_norm": 0.4112605154514313, + "learning_rate": 1.856326561760824e-08, + "loss": 0.0567, + "step": 5580 + }, + { + "epoch": 0.9820643573061368, + "grad_norm": 0.36727771162986755, + "learning_rate": 1.686939221467565e-08, + "loss": 0.0524, + "step": 5585 + }, + { + "epoch": 0.9829435554774046, + "grad_norm": 0.32696032524108887, + "learning_rate": 1.5256476425455912e-08, + "loss": 0.0546, + "step": 5590 + }, + { + "epoch": 0.9838227536486724, + "grad_norm": 0.14299306273460388, + "learning_rate": 1.37245313297929e-08, + "loss": 0.0603, + "step": 5595 + }, + { + "epoch": 0.9847019518199402, + "grad_norm": 0.1304006576538086, + "learning_rate": 1.2273569350909065e-08, + "loss": 0.048, + "step": 5600 + }, + { + "epoch": 0.985581149991208, + "grad_norm": 0.3255922496318817, + "learning_rate": 1.09036022552933e-08, + "loss": 0.0624, + "step": 5605 + }, + { + "epoch": 0.9864603481624759, + "grad_norm": 0.14127831161022186, + "learning_rate": 9.614641152615457e-09, + "loss": 0.0537, + "step": 5610 + }, + { + "epoch": 0.9873395463337437, + "grad_norm": 0.49166056513786316, + "learning_rate": 8.406696495627531e-09, + "loss": 0.0555, + "step": 5615 + }, + { + "epoch": 0.9882187445050115, + "grad_norm": 0.4225272238254547, + "learning_rate": 7.279778080089284e-09, + "loss": 0.0508, + "step": 5620 + }, + { + "epoch": 0.9890979426762793, + "grad_norm": 0.43666836619377136, + "learning_rate": 6.233895044677196e-09, + "loss": 0.0495, + "step": 5625 + }, + { + "epoch": 0.9899771408475471, + "grad_norm": 0.23192152380943298, + "learning_rate": 5.269055870920081e-09, + "loss": 0.0601, + "step": 5630 + }, + { + "epoch": 0.9908563390188149, + "grad_norm": 0.09601382911205292, + "learning_rate": 4.385268383123586e-09, + "loss": 0.0571, + "step": 5635 + }, + { + "epoch": 0.9917355371900827, + "grad_norm": 0.22825530171394348, + "learning_rate": 3.5825397483113532e-09, + "loss": 0.0647, + "step": 5640 + }, + { + "epoch": 0.9926147353613505, + "grad_norm": 0.18555951118469238, + "learning_rate": 2.8608764761639542e-09, + "loss": 0.0468, + "step": 5645 + }, + { + "epoch": 0.9934939335326183, + "grad_norm": 0.15407155454158783, + "learning_rate": 2.220284418968932e-09, + "loss": 0.0543, + "step": 5650 + }, + { + "epoch": 0.9943731317038861, + "grad_norm": 0.30971744656562805, + "learning_rate": 1.6607687715675113e-09, + "loss": 0.0788, + "step": 5655 + }, + { + "epoch": 0.9952523298751539, + "grad_norm": 0.4101414680480957, + "learning_rate": 1.1823340713212894e-09, + "loss": 0.0527, + "step": 5660 + }, + { + "epoch": 0.9961315280464217, + "grad_norm": 0.29319489002227783, + "learning_rate": 7.849841980667183e-10, + "loss": 0.0472, + "step": 5665 + }, + { + "epoch": 0.9970107262176895, + "grad_norm": 0.3551650941371918, + "learning_rate": 4.687223740917901e-10, + "loss": 0.0683, + "step": 5670 + }, + { + "epoch": 0.9978899243889573, + "grad_norm": 0.2584504783153534, + "learning_rate": 2.335511641005095e-10, + "loss": 0.0501, + "step": 5675 + }, + { + "epoch": 0.9987691225602251, + "grad_norm": 0.5419056415557861, + "learning_rate": 7.947247520179169e-11, + "loss": 0.0518, + "step": 5680 + }, + { + "epoch": 0.9996483207314929, + "grad_norm": 0.7458012700080872, + "learning_rate": 6.487556887257995e-12, + "loss": 0.0659, + "step": 5685 + }, + { + "epoch": 1.0, + "step": 5687, + "total_flos": 0.0, + "train_loss": 0.059674585497827885, + "train_runtime": 13149.4885, + "train_samples_per_second": 13.837, + "train_steps_per_second": 0.432 + } + ], + "logging_steps": 5, + "max_steps": 5687, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}