{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 747, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004016064257028112, "grad_norm": 6.783391952514648, "learning_rate": 0.0002, "loss": 17.0626, "step": 1 }, { "epoch": 0.008032128514056224, "grad_norm": 6.832010269165039, "learning_rate": 0.0001997322623828648, "loss": 16.3736, "step": 2 }, { "epoch": 0.012048192771084338, "grad_norm": 4.412657260894775, "learning_rate": 0.0001994645247657296, "loss": 13.7202, "step": 3 }, { "epoch": 0.01606425702811245, "grad_norm": 4.6994500160217285, "learning_rate": 0.0001991967871485944, "loss": 12.1103, "step": 4 }, { "epoch": 0.020080321285140562, "grad_norm": 5.078355312347412, "learning_rate": 0.00019892904953145918, "loss": 11.9491, "step": 5 }, { "epoch": 0.024096385542168676, "grad_norm": 5.82587194442749, "learning_rate": 0.00019866131191432397, "loss": 10.24, "step": 6 }, { "epoch": 0.028112449799196786, "grad_norm": 5.521396160125732, "learning_rate": 0.00019839357429718877, "loss": 9.7617, "step": 7 }, { "epoch": 0.0321285140562249, "grad_norm": 5.55628776550293, "learning_rate": 0.00019812583668005356, "loss": 8.9588, "step": 8 }, { "epoch": 0.03614457831325301, "grad_norm": 4.77673864364624, "learning_rate": 0.00019785809906291835, "loss": 7.413, "step": 9 }, { "epoch": 0.040160642570281124, "grad_norm": 3.045475482940674, "learning_rate": 0.00019759036144578314, "loss": 8.4555, "step": 10 }, { "epoch": 0.04417670682730924, "grad_norm": 2.4188013076782227, "learning_rate": 0.0001973226238286479, "loss": 6.3816, "step": 11 }, { "epoch": 0.04819277108433735, "grad_norm": 2.483142852783203, "learning_rate": 0.00019705488621151273, "loss": 6.0486, "step": 12 }, { "epoch": 0.05220883534136546, "grad_norm": 2.7488200664520264, "learning_rate": 0.00019678714859437752, "loss": 6.0559, "step": 13 }, { "epoch": 0.05622489959839357, "grad_norm": 3.509127140045166, "learning_rate": 0.00019651941097724232, "loss": 6.5013, "step": 14 }, { "epoch": 0.060240963855421686, "grad_norm": 4.097210884094238, "learning_rate": 0.0001962516733601071, "loss": 6.6959, "step": 15 }, { "epoch": 0.0642570281124498, "grad_norm": 5.211580753326416, "learning_rate": 0.0001959839357429719, "loss": 7.4451, "step": 16 }, { "epoch": 0.06827309236947791, "grad_norm": 4.360202312469482, "learning_rate": 0.00019571619812583667, "loss": 7.5475, "step": 17 }, { "epoch": 0.07228915662650602, "grad_norm": 4.646812915802002, "learning_rate": 0.0001954484605087015, "loss": 5.9117, "step": 18 }, { "epoch": 0.07630522088353414, "grad_norm": 4.076641082763672, "learning_rate": 0.00019518072289156628, "loss": 6.5152, "step": 19 }, { "epoch": 0.08032128514056225, "grad_norm": 4.571013450622559, "learning_rate": 0.00019491298527443107, "loss": 7.7192, "step": 20 }, { "epoch": 0.08433734939759036, "grad_norm": 3.786604881286621, "learning_rate": 0.00019464524765729587, "loss": 6.0262, "step": 21 }, { "epoch": 0.08835341365461848, "grad_norm": 3.7632923126220703, "learning_rate": 0.00019437751004016066, "loss": 5.515, "step": 22 }, { "epoch": 0.09236947791164658, "grad_norm": 3.142625093460083, "learning_rate": 0.00019410977242302542, "loss": 5.5428, "step": 23 }, { "epoch": 0.0963855421686747, "grad_norm": 4.195131778717041, "learning_rate": 0.00019384203480589022, "loss": 5.0073, "step": 24 }, { "epoch": 0.10040160642570281, "grad_norm": 7.452038764953613, "learning_rate": 0.00019357429718875504, "loss": 5.6765, "step": 25 }, { "epoch": 0.10441767068273092, "grad_norm": 9.708063125610352, "learning_rate": 0.00019330655957161983, "loss": 5.6149, "step": 26 }, { "epoch": 0.10843373493975904, "grad_norm": 19.072011947631836, "learning_rate": 0.00019303882195448462, "loss": 5.4365, "step": 27 }, { "epoch": 0.11244979919678715, "grad_norm": 6.726373195648193, "learning_rate": 0.00019277108433734942, "loss": 4.858, "step": 28 }, { "epoch": 0.11646586345381527, "grad_norm": 3.187056064605713, "learning_rate": 0.0001925033467202142, "loss": 5.3406, "step": 29 }, { "epoch": 0.12048192771084337, "grad_norm": 3.364069700241089, "learning_rate": 0.00019223560910307897, "loss": 5.5143, "step": 30 }, { "epoch": 0.12449799196787148, "grad_norm": 2.4620518684387207, "learning_rate": 0.00019196787148594377, "loss": 4.638, "step": 31 }, { "epoch": 0.1285140562248996, "grad_norm": 3.9363696575164795, "learning_rate": 0.0001917001338688086, "loss": 4.6009, "step": 32 }, { "epoch": 0.13253012048192772, "grad_norm": 3.230189561843872, "learning_rate": 0.00019143239625167338, "loss": 4.7928, "step": 33 }, { "epoch": 0.13654618473895583, "grad_norm": 2.873898983001709, "learning_rate": 0.00019116465863453817, "loss": 3.7444, "step": 34 }, { "epoch": 0.14056224899598393, "grad_norm": 3.2136387825012207, "learning_rate": 0.00019089692101740297, "loss": 4.452, "step": 35 }, { "epoch": 0.14457831325301204, "grad_norm": 2.8411664962768555, "learning_rate": 0.00019062918340026773, "loss": 4.483, "step": 36 }, { "epoch": 0.14859437751004015, "grad_norm": 2.68854022026062, "learning_rate": 0.00019036144578313252, "loss": 3.92, "step": 37 }, { "epoch": 0.15261044176706828, "grad_norm": 3.324504852294922, "learning_rate": 0.00019009370816599734, "loss": 4.4238, "step": 38 }, { "epoch": 0.1566265060240964, "grad_norm": 3.0757510662078857, "learning_rate": 0.00018982597054886214, "loss": 4.0354, "step": 39 }, { "epoch": 0.1606425702811245, "grad_norm": 3.1478559970855713, "learning_rate": 0.00018955823293172693, "loss": 4.7587, "step": 40 }, { "epoch": 0.1646586345381526, "grad_norm": 2.923387050628662, "learning_rate": 0.00018929049531459172, "loss": 4.1713, "step": 41 }, { "epoch": 0.1686746987951807, "grad_norm": 3.3262710571289062, "learning_rate": 0.0001890227576974565, "loss": 5.7246, "step": 42 }, { "epoch": 0.17269076305220885, "grad_norm": 2.9940414428710938, "learning_rate": 0.00018875502008032128, "loss": 3.9502, "step": 43 }, { "epoch": 0.17670682730923695, "grad_norm": 2.4215221405029297, "learning_rate": 0.00018848728246318607, "loss": 3.3469, "step": 44 }, { "epoch": 0.18072289156626506, "grad_norm": 4.08881139755249, "learning_rate": 0.0001882195448460509, "loss": 3.6203, "step": 45 }, { "epoch": 0.18473895582329317, "grad_norm": 2.550448417663574, "learning_rate": 0.00018795180722891569, "loss": 3.9986, "step": 46 }, { "epoch": 0.18875502008032127, "grad_norm": 2.3286774158477783, "learning_rate": 0.00018768406961178048, "loss": 3.3749, "step": 47 }, { "epoch": 0.1927710843373494, "grad_norm": 2.724431276321411, "learning_rate": 0.00018741633199464524, "loss": 3.4734, "step": 48 }, { "epoch": 0.19678714859437751, "grad_norm": 2.961087226867676, "learning_rate": 0.00018714859437751004, "loss": 4.242, "step": 49 }, { "epoch": 0.20080321285140562, "grad_norm": 2.4245645999908447, "learning_rate": 0.00018688085676037483, "loss": 3.7956, "step": 50 }, { "epoch": 0.20481927710843373, "grad_norm": 2.141226053237915, "learning_rate": 0.00018661311914323962, "loss": 3.0041, "step": 51 }, { "epoch": 0.20883534136546184, "grad_norm": 2.7774155139923096, "learning_rate": 0.00018634538152610444, "loss": 3.5062, "step": 52 }, { "epoch": 0.21285140562248997, "grad_norm": 2.6332597732543945, "learning_rate": 0.00018607764390896924, "loss": 3.9305, "step": 53 }, { "epoch": 0.21686746987951808, "grad_norm": 3.4417197704315186, "learning_rate": 0.000185809906291834, "loss": 5.1481, "step": 54 }, { "epoch": 0.22088353413654618, "grad_norm": 2.576704978942871, "learning_rate": 0.0001855421686746988, "loss": 3.6137, "step": 55 }, { "epoch": 0.2248995983935743, "grad_norm": 2.816452980041504, "learning_rate": 0.0001852744310575636, "loss": 3.5015, "step": 56 }, { "epoch": 0.2289156626506024, "grad_norm": 3.5300023555755615, "learning_rate": 0.00018500669344042838, "loss": 4.7758, "step": 57 }, { "epoch": 0.23293172690763053, "grad_norm": 2.594787120819092, "learning_rate": 0.0001847389558232932, "loss": 4.0104, "step": 58 }, { "epoch": 0.23694779116465864, "grad_norm": 3.472842216491699, "learning_rate": 0.000184471218206158, "loss": 4.2051, "step": 59 }, { "epoch": 0.24096385542168675, "grad_norm": 2.195838212966919, "learning_rate": 0.00018420348058902276, "loss": 3.4561, "step": 60 }, { "epoch": 0.24497991967871485, "grad_norm": 2.6737020015716553, "learning_rate": 0.00018393574297188755, "loss": 5.4281, "step": 61 }, { "epoch": 0.24899598393574296, "grad_norm": 3.128307342529297, "learning_rate": 0.00018366800535475234, "loss": 4.835, "step": 62 }, { "epoch": 0.25301204819277107, "grad_norm": 2.8915627002716064, "learning_rate": 0.00018340026773761714, "loss": 5.6513, "step": 63 }, { "epoch": 0.2570281124497992, "grad_norm": 2.4325616359710693, "learning_rate": 0.00018313253012048193, "loss": 3.8769, "step": 64 }, { "epoch": 0.26104417670682734, "grad_norm": 2.717306613922119, "learning_rate": 0.00018286479250334675, "loss": 4.7258, "step": 65 }, { "epoch": 0.26506024096385544, "grad_norm": 2.6178746223449707, "learning_rate": 0.00018259705488621152, "loss": 4.0424, "step": 66 }, { "epoch": 0.26907630522088355, "grad_norm": 2.382551431655884, "learning_rate": 0.0001823293172690763, "loss": 3.547, "step": 67 }, { "epoch": 0.27309236947791166, "grad_norm": 2.546783685684204, "learning_rate": 0.0001820615796519411, "loss": 4.2495, "step": 68 }, { "epoch": 0.27710843373493976, "grad_norm": 2.4738221168518066, "learning_rate": 0.0001817938420348059, "loss": 3.69, "step": 69 }, { "epoch": 0.28112449799196787, "grad_norm": 2.2191786766052246, "learning_rate": 0.0001815261044176707, "loss": 3.1576, "step": 70 }, { "epoch": 0.285140562248996, "grad_norm": 2.4891932010650635, "learning_rate": 0.00018125836680053548, "loss": 3.7767, "step": 71 }, { "epoch": 0.2891566265060241, "grad_norm": 2.0602684020996094, "learning_rate": 0.00018099062918340027, "loss": 3.1497, "step": 72 }, { "epoch": 0.2931726907630522, "grad_norm": 2.435455560684204, "learning_rate": 0.00018072289156626507, "loss": 4.3061, "step": 73 }, { "epoch": 0.2971887550200803, "grad_norm": 2.7304036617279053, "learning_rate": 0.00018045515394912986, "loss": 3.6995, "step": 74 }, { "epoch": 0.30120481927710846, "grad_norm": 2.6375226974487305, "learning_rate": 0.00018018741633199465, "loss": 3.3922, "step": 75 }, { "epoch": 0.30522088353413657, "grad_norm": 2.097759246826172, "learning_rate": 0.00017991967871485944, "loss": 3.1887, "step": 76 }, { "epoch": 0.3092369477911647, "grad_norm": 2.600724458694458, "learning_rate": 0.00017965194109772424, "loss": 3.8532, "step": 77 }, { "epoch": 0.3132530120481928, "grad_norm": 3.0356369018554688, "learning_rate": 0.00017938420348058903, "loss": 4.6221, "step": 78 }, { "epoch": 0.3172690763052209, "grad_norm": 2.1509416103363037, "learning_rate": 0.00017911646586345382, "loss": 3.5473, "step": 79 }, { "epoch": 0.321285140562249, "grad_norm": 2.7542128562927246, "learning_rate": 0.00017884872824631862, "loss": 4.3206, "step": 80 }, { "epoch": 0.3253012048192771, "grad_norm": 2.7480881214141846, "learning_rate": 0.0001785809906291834, "loss": 3.4596, "step": 81 }, { "epoch": 0.3293172690763052, "grad_norm": 2.8787624835968018, "learning_rate": 0.0001783132530120482, "loss": 4.0409, "step": 82 }, { "epoch": 0.3333333333333333, "grad_norm": 2.234320878982544, "learning_rate": 0.000178045515394913, "loss": 3.6684, "step": 83 }, { "epoch": 0.3373493975903614, "grad_norm": 2.174452781677246, "learning_rate": 0.00017777777777777779, "loss": 3.8964, "step": 84 }, { "epoch": 0.3413654618473896, "grad_norm": 2.25730299949646, "learning_rate": 0.00017751004016064258, "loss": 3.3793, "step": 85 }, { "epoch": 0.3453815261044177, "grad_norm": 2.3120176792144775, "learning_rate": 0.00017724230254350737, "loss": 3.9183, "step": 86 }, { "epoch": 0.3493975903614458, "grad_norm": 2.696288824081421, "learning_rate": 0.00017697456492637216, "loss": 4.1063, "step": 87 }, { "epoch": 0.3534136546184739, "grad_norm": 3.9386634826660156, "learning_rate": 0.00017670682730923696, "loss": 4.599, "step": 88 }, { "epoch": 0.357429718875502, "grad_norm": 2.7136473655700684, "learning_rate": 0.00017643908969210175, "loss": 4.1535, "step": 89 }, { "epoch": 0.3614457831325301, "grad_norm": 2.4276645183563232, "learning_rate": 0.00017617135207496654, "loss": 4.4834, "step": 90 }, { "epoch": 0.3654618473895582, "grad_norm": 2.6002511978149414, "learning_rate": 0.00017590361445783134, "loss": 4.0748, "step": 91 }, { "epoch": 0.36947791164658633, "grad_norm": 2.682366132736206, "learning_rate": 0.00017563587684069613, "loss": 4.4142, "step": 92 }, { "epoch": 0.37349397590361444, "grad_norm": 2.108722686767578, "learning_rate": 0.00017536813922356092, "loss": 4.4304, "step": 93 }, { "epoch": 0.37751004016064255, "grad_norm": 2.0732803344726562, "learning_rate": 0.00017510040160642571, "loss": 3.2521, "step": 94 }, { "epoch": 0.3815261044176707, "grad_norm": 2.3038790225982666, "learning_rate": 0.0001748326639892905, "loss": 4.3167, "step": 95 }, { "epoch": 0.3855421686746988, "grad_norm": 2.623572587966919, "learning_rate": 0.0001745649263721553, "loss": 5.3465, "step": 96 }, { "epoch": 0.3895582329317269, "grad_norm": 2.4543046951293945, "learning_rate": 0.0001742971887550201, "loss": 3.4479, "step": 97 }, { "epoch": 0.39357429718875503, "grad_norm": 2.291369915008545, "learning_rate": 0.00017402945113788489, "loss": 4.0893, "step": 98 }, { "epoch": 0.39759036144578314, "grad_norm": 2.4371914863586426, "learning_rate": 0.00017376171352074968, "loss": 3.7132, "step": 99 }, { "epoch": 0.40160642570281124, "grad_norm": 2.1401989459991455, "learning_rate": 0.00017349397590361447, "loss": 2.9892, "step": 100 }, { "epoch": 0.40562248995983935, "grad_norm": 2.1574857234954834, "learning_rate": 0.00017322623828647926, "loss": 3.3145, "step": 101 }, { "epoch": 0.40963855421686746, "grad_norm": 2.7298076152801514, "learning_rate": 0.00017295850066934406, "loss": 4.2365, "step": 102 }, { "epoch": 0.41365461847389556, "grad_norm": 2.5634846687316895, "learning_rate": 0.00017269076305220885, "loss": 3.4466, "step": 103 }, { "epoch": 0.41767068273092367, "grad_norm": 2.573195695877075, "learning_rate": 0.00017242302543507362, "loss": 3.3283, "step": 104 }, { "epoch": 0.42168674698795183, "grad_norm": 2.205293655395508, "learning_rate": 0.00017215528781793844, "loss": 3.7288, "step": 105 }, { "epoch": 0.42570281124497994, "grad_norm": 3.3177073001861572, "learning_rate": 0.00017188755020080323, "loss": 3.9341, "step": 106 }, { "epoch": 0.42971887550200805, "grad_norm": 2.601710557937622, "learning_rate": 0.00017161981258366802, "loss": 4.3724, "step": 107 }, { "epoch": 0.43373493975903615, "grad_norm": 2.490556478500366, "learning_rate": 0.00017135207496653281, "loss": 3.0784, "step": 108 }, { "epoch": 0.43775100401606426, "grad_norm": 2.7771122455596924, "learning_rate": 0.0001710843373493976, "loss": 3.7125, "step": 109 }, { "epoch": 0.44176706827309237, "grad_norm": 2.9865031242370605, "learning_rate": 0.00017081659973226237, "loss": 4.9747, "step": 110 }, { "epoch": 0.4457831325301205, "grad_norm": 3.2922353744506836, "learning_rate": 0.00017054886211512717, "loss": 4.229, "step": 111 }, { "epoch": 0.4497991967871486, "grad_norm": 2.2360899448394775, "learning_rate": 0.00017028112449799199, "loss": 3.1859, "step": 112 }, { "epoch": 0.4538152610441767, "grad_norm": 2.4282941818237305, "learning_rate": 0.00017001338688085678, "loss": 4.4577, "step": 113 }, { "epoch": 0.4578313253012048, "grad_norm": 2.2384181022644043, "learning_rate": 0.00016974564926372157, "loss": 3.435, "step": 114 }, { "epoch": 0.46184738955823296, "grad_norm": 2.586678981781006, "learning_rate": 0.00016947791164658636, "loss": 3.7974, "step": 115 }, { "epoch": 0.46586345381526106, "grad_norm": 2.2473366260528564, "learning_rate": 0.00016921017402945113, "loss": 3.2193, "step": 116 }, { "epoch": 0.46987951807228917, "grad_norm": 2.2137515544891357, "learning_rate": 0.00016894243641231592, "loss": 3.2774, "step": 117 }, { "epoch": 0.4738955823293173, "grad_norm": 2.6827173233032227, "learning_rate": 0.00016867469879518074, "loss": 3.843, "step": 118 }, { "epoch": 0.4779116465863454, "grad_norm": 2.499166250228882, "learning_rate": 0.00016840696117804553, "loss": 3.1818, "step": 119 }, { "epoch": 0.4819277108433735, "grad_norm": 2.609964609146118, "learning_rate": 0.00016813922356091033, "loss": 3.6292, "step": 120 }, { "epoch": 0.4859437751004016, "grad_norm": 2.697786808013916, "learning_rate": 0.00016787148594377512, "loss": 3.7501, "step": 121 }, { "epoch": 0.4899598393574297, "grad_norm": 2.834494113922119, "learning_rate": 0.00016760374832663989, "loss": 3.9265, "step": 122 }, { "epoch": 0.4939759036144578, "grad_norm": 2.3431777954101562, "learning_rate": 0.00016733601070950468, "loss": 3.7916, "step": 123 }, { "epoch": 0.4979919678714859, "grad_norm": 2.434953212738037, "learning_rate": 0.00016706827309236947, "loss": 3.4279, "step": 124 }, { "epoch": 0.5020080321285141, "grad_norm": 2.3629250526428223, "learning_rate": 0.0001668005354752343, "loss": 3.4382, "step": 125 }, { "epoch": 0.5060240963855421, "grad_norm": 2.7543423175811768, "learning_rate": 0.00016653279785809908, "loss": 4.8146, "step": 126 }, { "epoch": 0.5100401606425703, "grad_norm": 3.149775981903076, "learning_rate": 0.00016626506024096388, "loss": 5.365, "step": 127 }, { "epoch": 0.5140562248995983, "grad_norm": 2.640326499938965, "learning_rate": 0.00016599732262382864, "loss": 4.2036, "step": 128 }, { "epoch": 0.5180722891566265, "grad_norm": 2.6297357082366943, "learning_rate": 0.00016572958500669344, "loss": 3.7331, "step": 129 }, { "epoch": 0.5220883534136547, "grad_norm": 2.9165263175964355, "learning_rate": 0.00016546184738955823, "loss": 4.2224, "step": 130 }, { "epoch": 0.5261044176706827, "grad_norm": 2.003908634185791, "learning_rate": 0.00016519410977242302, "loss": 3.5818, "step": 131 }, { "epoch": 0.5301204819277109, "grad_norm": 2.3137078285217285, "learning_rate": 0.00016492637215528784, "loss": 3.4726, "step": 132 }, { "epoch": 0.5341365461847389, "grad_norm": 2.69950795173645, "learning_rate": 0.00016465863453815263, "loss": 4.0059, "step": 133 }, { "epoch": 0.5381526104417671, "grad_norm": 2.1858394145965576, "learning_rate": 0.0001643908969210174, "loss": 3.6957, "step": 134 }, { "epoch": 0.5421686746987951, "grad_norm": 2.423802137374878, "learning_rate": 0.0001641231593038822, "loss": 4.1535, "step": 135 }, { "epoch": 0.5461847389558233, "grad_norm": 2.244253158569336, "learning_rate": 0.00016385542168674699, "loss": 3.3276, "step": 136 }, { "epoch": 0.5502008032128514, "grad_norm": 2.2932465076446533, "learning_rate": 0.00016358768406961178, "loss": 3.6498, "step": 137 }, { "epoch": 0.5542168674698795, "grad_norm": 2.0782933235168457, "learning_rate": 0.0001633199464524766, "loss": 4.007, "step": 138 }, { "epoch": 0.5582329317269076, "grad_norm": 2.778797149658203, "learning_rate": 0.0001630522088353414, "loss": 3.8436, "step": 139 }, { "epoch": 0.5622489959839357, "grad_norm": 2.7823002338409424, "learning_rate": 0.00016278447121820616, "loss": 5.5985, "step": 140 }, { "epoch": 0.5662650602409639, "grad_norm": 3.124753475189209, "learning_rate": 0.00016251673360107095, "loss": 3.8402, "step": 141 }, { "epoch": 0.570281124497992, "grad_norm": 2.999889612197876, "learning_rate": 0.00016224899598393574, "loss": 4.8463, "step": 142 }, { "epoch": 0.5742971887550201, "grad_norm": 2.2176406383514404, "learning_rate": 0.00016198125836680054, "loss": 3.6488, "step": 143 }, { "epoch": 0.5783132530120482, "grad_norm": 2.334336757659912, "learning_rate": 0.00016171352074966533, "loss": 3.4351, "step": 144 }, { "epoch": 0.5823293172690763, "grad_norm": 2.1625120639801025, "learning_rate": 0.00016144578313253015, "loss": 3.4423, "step": 145 }, { "epoch": 0.5863453815261044, "grad_norm": 2.3950042724609375, "learning_rate": 0.00016117804551539491, "loss": 3.4302, "step": 146 }, { "epoch": 0.5903614457831325, "grad_norm": 1.968996524810791, "learning_rate": 0.0001609103078982597, "loss": 3.3924, "step": 147 }, { "epoch": 0.5943775100401606, "grad_norm": 2.259298801422119, "learning_rate": 0.0001606425702811245, "loss": 3.4544, "step": 148 }, { "epoch": 0.5983935742971888, "grad_norm": 2.5227410793304443, "learning_rate": 0.0001603748326639893, "loss": 3.6276, "step": 149 }, { "epoch": 0.6024096385542169, "grad_norm": 2.4112424850463867, "learning_rate": 0.00016010709504685409, "loss": 3.8806, "step": 150 }, { "epoch": 0.606425702811245, "grad_norm": 2.5478017330169678, "learning_rate": 0.00015983935742971888, "loss": 4.1461, "step": 151 }, { "epoch": 0.6104417670682731, "grad_norm": 2.832744836807251, "learning_rate": 0.00015957161981258367, "loss": 5.0162, "step": 152 }, { "epoch": 0.6144578313253012, "grad_norm": 2.7249608039855957, "learning_rate": 0.00015930388219544846, "loss": 3.2521, "step": 153 }, { "epoch": 0.6184738955823293, "grad_norm": 2.579235315322876, "learning_rate": 0.00015903614457831326, "loss": 4.0444, "step": 154 }, { "epoch": 0.6224899598393574, "grad_norm": 2.719031572341919, "learning_rate": 0.00015876840696117805, "loss": 3.8091, "step": 155 }, { "epoch": 0.6265060240963856, "grad_norm": 2.9060187339782715, "learning_rate": 0.00015850066934404284, "loss": 3.574, "step": 156 }, { "epoch": 0.6305220883534136, "grad_norm": 2.3890836238861084, "learning_rate": 0.00015823293172690763, "loss": 3.0126, "step": 157 }, { "epoch": 0.6345381526104418, "grad_norm": 2.4875965118408203, "learning_rate": 0.00015796519410977243, "loss": 3.8722, "step": 158 }, { "epoch": 0.6385542168674698, "grad_norm": 2.452133893966675, "learning_rate": 0.00015769745649263722, "loss": 3.1996, "step": 159 }, { "epoch": 0.642570281124498, "grad_norm": 2.644927740097046, "learning_rate": 0.000157429718875502, "loss": 4.5955, "step": 160 }, { "epoch": 0.6465863453815262, "grad_norm": 2.4523508548736572, "learning_rate": 0.0001571619812583668, "loss": 3.3654, "step": 161 }, { "epoch": 0.6506024096385542, "grad_norm": 2.5598349571228027, "learning_rate": 0.0001568942436412316, "loss": 3.0078, "step": 162 }, { "epoch": 0.6546184738955824, "grad_norm": 3.0518641471862793, "learning_rate": 0.0001566265060240964, "loss": 4.5464, "step": 163 }, { "epoch": 0.6586345381526104, "grad_norm": 2.8101203441619873, "learning_rate": 0.00015635876840696118, "loss": 3.4404, "step": 164 }, { "epoch": 0.6626506024096386, "grad_norm": 2.7174525260925293, "learning_rate": 0.00015609103078982598, "loss": 3.6615, "step": 165 }, { "epoch": 0.6666666666666666, "grad_norm": 2.620638608932495, "learning_rate": 0.00015582329317269077, "loss": 3.448, "step": 166 }, { "epoch": 0.6706827309236948, "grad_norm": 2.9395246505737305, "learning_rate": 0.00015555555555555556, "loss": 3.6454, "step": 167 }, { "epoch": 0.6746987951807228, "grad_norm": 3.050710916519165, "learning_rate": 0.00015528781793842036, "loss": 4.0765, "step": 168 }, { "epoch": 0.678714859437751, "grad_norm": 2.2552433013916016, "learning_rate": 0.00015502008032128515, "loss": 3.1558, "step": 169 }, { "epoch": 0.6827309236947792, "grad_norm": 2.1489574909210205, "learning_rate": 0.00015475234270414994, "loss": 4.2047, "step": 170 }, { "epoch": 0.6867469879518072, "grad_norm": 2.172776937484741, "learning_rate": 0.00015448460508701473, "loss": 3.4285, "step": 171 }, { "epoch": 0.6907630522088354, "grad_norm": 2.1401731967926025, "learning_rate": 0.00015421686746987953, "loss": 3.2497, "step": 172 }, { "epoch": 0.6947791164658634, "grad_norm": 2.7701947689056396, "learning_rate": 0.00015394912985274432, "loss": 3.9331, "step": 173 }, { "epoch": 0.6987951807228916, "grad_norm": 2.319415330886841, "learning_rate": 0.0001536813922356091, "loss": 3.176, "step": 174 }, { "epoch": 0.7028112449799196, "grad_norm": 2.428131341934204, "learning_rate": 0.0001534136546184739, "loss": 3.1192, "step": 175 }, { "epoch": 0.7068273092369478, "grad_norm": 2.135892868041992, "learning_rate": 0.0001531459170013387, "loss": 3.0222, "step": 176 }, { "epoch": 0.7108433734939759, "grad_norm": 2.7550647258758545, "learning_rate": 0.0001528781793842035, "loss": 4.6775, "step": 177 }, { "epoch": 0.714859437751004, "grad_norm": 2.2021191120147705, "learning_rate": 0.00015261044176706828, "loss": 2.7476, "step": 178 }, { "epoch": 0.7188755020080321, "grad_norm": 2.686431407928467, "learning_rate": 0.00015234270414993308, "loss": 4.1621, "step": 179 }, { "epoch": 0.7228915662650602, "grad_norm": 2.827143669128418, "learning_rate": 0.00015207496653279787, "loss": 4.4613, "step": 180 }, { "epoch": 0.7269076305220884, "grad_norm": 3.090308904647827, "learning_rate": 0.00015180722891566266, "loss": 4.6863, "step": 181 }, { "epoch": 0.7309236947791165, "grad_norm": 2.492013454437256, "learning_rate": 0.00015153949129852746, "loss": 3.2319, "step": 182 }, { "epoch": 0.7349397590361446, "grad_norm": 2.6304264068603516, "learning_rate": 0.00015127175368139225, "loss": 3.3099, "step": 183 }, { "epoch": 0.7389558232931727, "grad_norm": 2.270024299621582, "learning_rate": 0.00015100401606425701, "loss": 3.8332, "step": 184 }, { "epoch": 0.7429718875502008, "grad_norm": 2.2107675075531006, "learning_rate": 0.00015073627844712183, "loss": 3.4966, "step": 185 }, { "epoch": 0.7469879518072289, "grad_norm": 1.804654598236084, "learning_rate": 0.00015046854082998663, "loss": 2.7441, "step": 186 }, { "epoch": 0.751004016064257, "grad_norm": 2.8919899463653564, "learning_rate": 0.00015020080321285142, "loss": 3.7274, "step": 187 }, { "epoch": 0.7550200803212851, "grad_norm": 2.4757237434387207, "learning_rate": 0.0001499330655957162, "loss": 3.6959, "step": 188 }, { "epoch": 0.7590361445783133, "grad_norm": 2.037745952606201, "learning_rate": 0.000149665327978581, "loss": 3.0673, "step": 189 }, { "epoch": 0.7630522088353414, "grad_norm": 2.479806423187256, "learning_rate": 0.00014939759036144577, "loss": 3.5497, "step": 190 }, { "epoch": 0.7670682730923695, "grad_norm": 2.532616138458252, "learning_rate": 0.00014912985274431056, "loss": 4.4538, "step": 191 }, { "epoch": 0.7710843373493976, "grad_norm": 2.2965128421783447, "learning_rate": 0.00014886211512717538, "loss": 3.8924, "step": 192 }, { "epoch": 0.7751004016064257, "grad_norm": 2.569096088409424, "learning_rate": 0.00014859437751004018, "loss": 4.3112, "step": 193 }, { "epoch": 0.7791164658634538, "grad_norm": 2.3299782276153564, "learning_rate": 0.00014832663989290497, "loss": 3.4171, "step": 194 }, { "epoch": 0.7831325301204819, "grad_norm": 2.4750306606292725, "learning_rate": 0.00014805890227576976, "loss": 4.2418, "step": 195 }, { "epoch": 0.7871485943775101, "grad_norm": 2.34830904006958, "learning_rate": 0.00014779116465863453, "loss": 4.7654, "step": 196 }, { "epoch": 0.7911646586345381, "grad_norm": 2.3084421157836914, "learning_rate": 0.00014752342704149932, "loss": 3.5955, "step": 197 }, { "epoch": 0.7951807228915663, "grad_norm": 2.088836431503296, "learning_rate": 0.00014725568942436414, "loss": 3.4426, "step": 198 }, { "epoch": 0.7991967871485943, "grad_norm": 2.387511968612671, "learning_rate": 0.00014698795180722893, "loss": 3.4799, "step": 199 }, { "epoch": 0.8032128514056225, "grad_norm": 2.173638343811035, "learning_rate": 0.00014672021419009373, "loss": 3.1073, "step": 200 }, { "epoch": 0.8072289156626506, "grad_norm": 2.4268410205841064, "learning_rate": 0.00014645247657295852, "loss": 3.895, "step": 201 }, { "epoch": 0.8112449799196787, "grad_norm": 2.298238515853882, "learning_rate": 0.00014618473895582328, "loss": 3.1374, "step": 202 }, { "epoch": 0.8152610441767069, "grad_norm": 2.5447280406951904, "learning_rate": 0.00014591700133868808, "loss": 4.201, "step": 203 }, { "epoch": 0.8192771084337349, "grad_norm": 2.2700531482696533, "learning_rate": 0.00014564926372155287, "loss": 3.3756, "step": 204 }, { "epoch": 0.8232931726907631, "grad_norm": 2.2147793769836426, "learning_rate": 0.0001453815261044177, "loss": 2.8677, "step": 205 }, { "epoch": 0.8273092369477911, "grad_norm": 2.820615768432617, "learning_rate": 0.00014511378848728248, "loss": 3.8278, "step": 206 }, { "epoch": 0.8313253012048193, "grad_norm": 2.214066743850708, "learning_rate": 0.00014484605087014728, "loss": 2.8015, "step": 207 }, { "epoch": 0.8353413654618473, "grad_norm": 2.7223362922668457, "learning_rate": 0.00014457831325301204, "loss": 4.5482, "step": 208 }, { "epoch": 0.8393574297188755, "grad_norm": 2.6131458282470703, "learning_rate": 0.00014431057563587683, "loss": 3.258, "step": 209 }, { "epoch": 0.8433734939759037, "grad_norm": 2.378821611404419, "learning_rate": 0.00014404283801874163, "loss": 3.4395, "step": 210 }, { "epoch": 0.8473895582329317, "grad_norm": 2.5394039154052734, "learning_rate": 0.00014377510040160642, "loss": 3.5583, "step": 211 }, { "epoch": 0.8514056224899599, "grad_norm": 2.8768603801727295, "learning_rate": 0.00014350736278447124, "loss": 4.1826, "step": 212 }, { "epoch": 0.8554216867469879, "grad_norm": 2.325242757797241, "learning_rate": 0.00014323962516733603, "loss": 3.2996, "step": 213 }, { "epoch": 0.8594377510040161, "grad_norm": 2.847722053527832, "learning_rate": 0.0001429718875502008, "loss": 3.7535, "step": 214 }, { "epoch": 0.8634538152610441, "grad_norm": 2.3787224292755127, "learning_rate": 0.0001427041499330656, "loss": 2.989, "step": 215 }, { "epoch": 0.8674698795180723, "grad_norm": 2.3759453296661377, "learning_rate": 0.00014243641231593038, "loss": 3.2181, "step": 216 }, { "epoch": 0.8714859437751004, "grad_norm": 2.48319411277771, "learning_rate": 0.00014216867469879518, "loss": 4.0624, "step": 217 }, { "epoch": 0.8755020080321285, "grad_norm": 2.75231671333313, "learning_rate": 0.00014190093708166, "loss": 4.2616, "step": 218 }, { "epoch": 0.8795180722891566, "grad_norm": 2.165195941925049, "learning_rate": 0.0001416331994645248, "loss": 2.773, "step": 219 }, { "epoch": 0.8835341365461847, "grad_norm": 2.9390523433685303, "learning_rate": 0.00014136546184738956, "loss": 5.3133, "step": 220 }, { "epoch": 0.8875502008032129, "grad_norm": 2.4109458923339844, "learning_rate": 0.00014109772423025435, "loss": 3.8292, "step": 221 }, { "epoch": 0.891566265060241, "grad_norm": 2.5037901401519775, "learning_rate": 0.00014082998661311914, "loss": 4.0122, "step": 222 }, { "epoch": 0.8955823293172691, "grad_norm": 2.985944986343384, "learning_rate": 0.00014056224899598393, "loss": 3.7539, "step": 223 }, { "epoch": 0.8995983935742972, "grad_norm": 2.2456915378570557, "learning_rate": 0.00014029451137884873, "loss": 3.4707, "step": 224 }, { "epoch": 0.9036144578313253, "grad_norm": 2.0935449600219727, "learning_rate": 0.00014002677376171355, "loss": 2.7515, "step": 225 }, { "epoch": 0.9076305220883534, "grad_norm": 2.4609766006469727, "learning_rate": 0.00013975903614457834, "loss": 3.8227, "step": 226 }, { "epoch": 0.9116465863453815, "grad_norm": 2.2097980976104736, "learning_rate": 0.0001394912985274431, "loss": 3.2733, "step": 227 }, { "epoch": 0.9156626506024096, "grad_norm": 2.0642688274383545, "learning_rate": 0.0001392235609103079, "loss": 3.0938, "step": 228 }, { "epoch": 0.9196787148594378, "grad_norm": 2.3710100650787354, "learning_rate": 0.0001389558232931727, "loss": 4.2002, "step": 229 }, { "epoch": 0.9236947791164659, "grad_norm": 2.6360647678375244, "learning_rate": 0.00013868808567603748, "loss": 3.8326, "step": 230 }, { "epoch": 0.927710843373494, "grad_norm": 2.2522687911987305, "learning_rate": 0.00013842034805890228, "loss": 4.0576, "step": 231 }, { "epoch": 0.9317269076305221, "grad_norm": 2.3965373039245605, "learning_rate": 0.0001381526104417671, "loss": 2.551, "step": 232 }, { "epoch": 0.9357429718875502, "grad_norm": 2.160850763320923, "learning_rate": 0.00013788487282463186, "loss": 3.0346, "step": 233 }, { "epoch": 0.9397590361445783, "grad_norm": 2.7340362071990967, "learning_rate": 0.00013761713520749665, "loss": 3.8792, "step": 234 }, { "epoch": 0.9437751004016064, "grad_norm": 2.373431921005249, "learning_rate": 0.00013734939759036145, "loss": 3.4563, "step": 235 }, { "epoch": 0.9477911646586346, "grad_norm": 2.887669801712036, "learning_rate": 0.00013708165997322624, "loss": 3.4205, "step": 236 }, { "epoch": 0.9518072289156626, "grad_norm": 2.47088360786438, "learning_rate": 0.00013681392235609103, "loss": 3.7738, "step": 237 }, { "epoch": 0.9558232931726908, "grad_norm": 2.7040438652038574, "learning_rate": 0.00013654618473895585, "loss": 3.5389, "step": 238 }, { "epoch": 0.9598393574297188, "grad_norm": 2.2656071186065674, "learning_rate": 0.00013627844712182062, "loss": 2.5192, "step": 239 }, { "epoch": 0.963855421686747, "grad_norm": 2.0689640045166016, "learning_rate": 0.0001360107095046854, "loss": 3.2038, "step": 240 }, { "epoch": 0.9678714859437751, "grad_norm": 2.456049680709839, "learning_rate": 0.0001357429718875502, "loss": 3.3779, "step": 241 }, { "epoch": 0.9718875502008032, "grad_norm": 3.6520512104034424, "learning_rate": 0.000135475234270415, "loss": 6.3828, "step": 242 }, { "epoch": 0.9759036144578314, "grad_norm": 2.9019930362701416, "learning_rate": 0.0001352074966532798, "loss": 4.4033, "step": 243 }, { "epoch": 0.9799196787148594, "grad_norm": 2.688805103302002, "learning_rate": 0.00013493975903614458, "loss": 3.7718, "step": 244 }, { "epoch": 0.9839357429718876, "grad_norm": 2.3583173751831055, "learning_rate": 0.00013467202141900938, "loss": 2.8558, "step": 245 }, { "epoch": 0.9879518072289156, "grad_norm": 2.2991857528686523, "learning_rate": 0.00013440428380187417, "loss": 3.3544, "step": 246 }, { "epoch": 0.9919678714859438, "grad_norm": 2.3462352752685547, "learning_rate": 0.00013413654618473896, "loss": 3.4804, "step": 247 }, { "epoch": 0.9959839357429718, "grad_norm": 2.375304698944092, "learning_rate": 0.00013386880856760375, "loss": 3.9284, "step": 248 }, { "epoch": 1.0, "grad_norm": 2.3574721813201904, "learning_rate": 0.00013360107095046855, "loss": 3.5948, "step": 249 }, { "epoch": 1.0, "eval_loss": 0.906198263168335, "eval_runtime": 202.0311, "eval_samples_per_second": 2.47, "eval_steps_per_second": 1.237, "step": 249 }, { "epoch": 1.0040160642570282, "grad_norm": 2.329230546951294, "learning_rate": 0.00013333333333333334, "loss": 3.8794, "step": 250 }, { "epoch": 1.0080321285140563, "grad_norm": 2.304131507873535, "learning_rate": 0.00013306559571619813, "loss": 2.618, "step": 251 }, { "epoch": 1.0120481927710843, "grad_norm": 2.258854389190674, "learning_rate": 0.00013279785809906293, "loss": 4.5112, "step": 252 }, { "epoch": 1.0160642570281124, "grad_norm": 1.9307198524475098, "learning_rate": 0.00013253012048192772, "loss": 2.8023, "step": 253 }, { "epoch": 1.0200803212851406, "grad_norm": 2.070939540863037, "learning_rate": 0.0001322623828647925, "loss": 2.9067, "step": 254 }, { "epoch": 1.0240963855421688, "grad_norm": 2.1403632164001465, "learning_rate": 0.0001319946452476573, "loss": 3.0498, "step": 255 }, { "epoch": 1.0281124497991967, "grad_norm": 1.9982527494430542, "learning_rate": 0.0001317269076305221, "loss": 2.7652, "step": 256 }, { "epoch": 1.0321285140562249, "grad_norm": 2.3440232276916504, "learning_rate": 0.0001314591700133869, "loss": 3.8854, "step": 257 }, { "epoch": 1.036144578313253, "grad_norm": 2.3406286239624023, "learning_rate": 0.00013119143239625168, "loss": 2.9114, "step": 258 }, { "epoch": 1.0401606425702812, "grad_norm": 2.673793077468872, "learning_rate": 0.00013092369477911648, "loss": 3.0531, "step": 259 }, { "epoch": 1.0441767068273093, "grad_norm": 2.2808480262756348, "learning_rate": 0.00013065595716198127, "loss": 2.9484, "step": 260 }, { "epoch": 1.0481927710843373, "grad_norm": 2.513705253601074, "learning_rate": 0.00013038821954484606, "loss": 2.6625, "step": 261 }, { "epoch": 1.0522088353413654, "grad_norm": 2.7780377864837646, "learning_rate": 0.00013012048192771085, "loss": 3.1793, "step": 262 }, { "epoch": 1.0562248995983936, "grad_norm": 2.522724151611328, "learning_rate": 0.00012985274431057565, "loss": 3.1926, "step": 263 }, { "epoch": 1.0602409638554218, "grad_norm": 3.2487499713897705, "learning_rate": 0.0001295850066934404, "loss": 3.9779, "step": 264 }, { "epoch": 1.0642570281124497, "grad_norm": 2.4341378211975098, "learning_rate": 0.00012931726907630523, "loss": 2.9064, "step": 265 }, { "epoch": 1.0682730923694779, "grad_norm": 2.5539276599884033, "learning_rate": 0.00012904953145917002, "loss": 3.4219, "step": 266 }, { "epoch": 1.072289156626506, "grad_norm": 2.0425596237182617, "learning_rate": 0.00012878179384203482, "loss": 2.5395, "step": 267 }, { "epoch": 1.0763052208835342, "grad_norm": 2.3625378608703613, "learning_rate": 0.0001285140562248996, "loss": 2.757, "step": 268 }, { "epoch": 1.0803212851405624, "grad_norm": 2.0414483547210693, "learning_rate": 0.0001282463186077644, "loss": 2.7764, "step": 269 }, { "epoch": 1.0843373493975903, "grad_norm": 3.544743061065674, "learning_rate": 0.00012797858099062917, "loss": 3.6176, "step": 270 }, { "epoch": 1.0883534136546185, "grad_norm": 2.4814655780792236, "learning_rate": 0.00012771084337349396, "loss": 3.2284, "step": 271 }, { "epoch": 1.0923694779116466, "grad_norm": 2.364025592803955, "learning_rate": 0.00012744310575635878, "loss": 3.6178, "step": 272 }, { "epoch": 1.0963855421686748, "grad_norm": 1.989912748336792, "learning_rate": 0.00012717536813922357, "loss": 2.5839, "step": 273 }, { "epoch": 1.1004016064257027, "grad_norm": 2.413421154022217, "learning_rate": 0.00012690763052208837, "loss": 3.5416, "step": 274 }, { "epoch": 1.104417670682731, "grad_norm": 2.679314613342285, "learning_rate": 0.00012663989290495316, "loss": 3.0015, "step": 275 }, { "epoch": 1.108433734939759, "grad_norm": 2.2354209423065186, "learning_rate": 0.00012637215528781793, "loss": 3.3867, "step": 276 }, { "epoch": 1.1124497991967872, "grad_norm": 2.4003982543945312, "learning_rate": 0.00012610441767068272, "loss": 3.0927, "step": 277 }, { "epoch": 1.1164658634538154, "grad_norm": 2.2922661304473877, "learning_rate": 0.00012583668005354754, "loss": 2.835, "step": 278 }, { "epoch": 1.1204819277108433, "grad_norm": 2.1880528926849365, "learning_rate": 0.00012556894243641233, "loss": 2.9581, "step": 279 }, { "epoch": 1.1244979919678715, "grad_norm": 2.5255534648895264, "learning_rate": 0.00012530120481927712, "loss": 2.7931, "step": 280 }, { "epoch": 1.1285140562248996, "grad_norm": 2.2529118061065674, "learning_rate": 0.00012503346720214192, "loss": 2.6831, "step": 281 }, { "epoch": 1.1325301204819278, "grad_norm": 2.2123444080352783, "learning_rate": 0.0001247657295850067, "loss": 2.8091, "step": 282 }, { "epoch": 1.1365461847389557, "grad_norm": 2.538160800933838, "learning_rate": 0.00012449799196787148, "loss": 3.0089, "step": 283 }, { "epoch": 1.140562248995984, "grad_norm": 3.0052592754364014, "learning_rate": 0.00012423025435073627, "loss": 3.9042, "step": 284 }, { "epoch": 1.144578313253012, "grad_norm": 2.691096067428589, "learning_rate": 0.0001239625167336011, "loss": 3.9491, "step": 285 }, { "epoch": 1.1485943775100402, "grad_norm": 2.6101088523864746, "learning_rate": 0.00012369477911646588, "loss": 2.9432, "step": 286 }, { "epoch": 1.1526104417670684, "grad_norm": 2.368319511413574, "learning_rate": 0.00012342704149933067, "loss": 2.966, "step": 287 }, { "epoch": 1.1566265060240963, "grad_norm": 2.4615232944488525, "learning_rate": 0.00012315930388219547, "loss": 3.4359, "step": 288 }, { "epoch": 1.1606425702811245, "grad_norm": 2.3296902179718018, "learning_rate": 0.00012289156626506023, "loss": 3.0168, "step": 289 }, { "epoch": 1.1646586345381527, "grad_norm": 2.7844183444976807, "learning_rate": 0.00012262382864792503, "loss": 3.1574, "step": 290 }, { "epoch": 1.1686746987951806, "grad_norm": 2.486553430557251, "learning_rate": 0.00012235609103078982, "loss": 3.1044, "step": 291 }, { "epoch": 1.1726907630522088, "grad_norm": 2.4482836723327637, "learning_rate": 0.00012208835341365464, "loss": 3.2606, "step": 292 }, { "epoch": 1.176706827309237, "grad_norm": 2.393049955368042, "learning_rate": 0.00012182061579651942, "loss": 2.9026, "step": 293 }, { "epoch": 1.180722891566265, "grad_norm": 2.8396050930023193, "learning_rate": 0.00012155287817938421, "loss": 2.9787, "step": 294 }, { "epoch": 1.1847389558232932, "grad_norm": 2.447458028793335, "learning_rate": 0.000121285140562249, "loss": 2.6885, "step": 295 }, { "epoch": 1.1887550200803212, "grad_norm": 2.3094258308410645, "learning_rate": 0.0001210174029451138, "loss": 2.9401, "step": 296 }, { "epoch": 1.1927710843373494, "grad_norm": 2.5315654277801514, "learning_rate": 0.00012074966532797858, "loss": 3.2829, "step": 297 }, { "epoch": 1.1967871485943775, "grad_norm": 2.4781811237335205, "learning_rate": 0.0001204819277108434, "loss": 2.9542, "step": 298 }, { "epoch": 1.2008032128514057, "grad_norm": 2.759524345397949, "learning_rate": 0.00012021419009370817, "loss": 3.5029, "step": 299 }, { "epoch": 1.2048192771084336, "grad_norm": 2.388485908508301, "learning_rate": 0.00011994645247657297, "loss": 2.6706, "step": 300 }, { "epoch": 1.2088353413654618, "grad_norm": 2.5414671897888184, "learning_rate": 0.00011967871485943776, "loss": 2.7898, "step": 301 }, { "epoch": 1.21285140562249, "grad_norm": 3.36741042137146, "learning_rate": 0.00011941097724230255, "loss": 2.7475, "step": 302 }, { "epoch": 1.216867469879518, "grad_norm": 2.7749950885772705, "learning_rate": 0.00011914323962516733, "loss": 2.9617, "step": 303 }, { "epoch": 1.2208835341365463, "grad_norm": 2.685976505279541, "learning_rate": 0.00011887550200803212, "loss": 3.2493, "step": 304 }, { "epoch": 1.2248995983935742, "grad_norm": 2.7357215881347656, "learning_rate": 0.00011860776439089693, "loss": 2.7249, "step": 305 }, { "epoch": 1.2289156626506024, "grad_norm": 2.962019443511963, "learning_rate": 0.00011834002677376172, "loss": 3.4647, "step": 306 }, { "epoch": 1.2329317269076305, "grad_norm": 2.891343832015991, "learning_rate": 0.00011807228915662652, "loss": 3.5527, "step": 307 }, { "epoch": 1.2369477911646587, "grad_norm": 2.7382125854492188, "learning_rate": 0.00011780455153949131, "loss": 3.1955, "step": 308 }, { "epoch": 1.2409638554216866, "grad_norm": 2.385486602783203, "learning_rate": 0.00011753681392235609, "loss": 3.022, "step": 309 }, { "epoch": 1.2449799196787148, "grad_norm": 2.553295612335205, "learning_rate": 0.00011726907630522088, "loss": 2.801, "step": 310 }, { "epoch": 1.248995983935743, "grad_norm": 2.9965014457702637, "learning_rate": 0.00011700133868808567, "loss": 2.4453, "step": 311 }, { "epoch": 1.2530120481927711, "grad_norm": 2.327629566192627, "learning_rate": 0.00011673360107095048, "loss": 2.2897, "step": 312 }, { "epoch": 1.2570281124497993, "grad_norm": 2.7544825077056885, "learning_rate": 0.00011646586345381527, "loss": 3.2796, "step": 313 }, { "epoch": 1.2610441767068274, "grad_norm": 2.590733051300049, "learning_rate": 0.00011619812583668007, "loss": 2.9126, "step": 314 }, { "epoch": 1.2650602409638554, "grad_norm": 3.3064663410186768, "learning_rate": 0.00011593038821954485, "loss": 3.6784, "step": 315 }, { "epoch": 1.2690763052208835, "grad_norm": 3.3928616046905518, "learning_rate": 0.00011566265060240964, "loss": 3.3292, "step": 316 }, { "epoch": 1.2730923694779117, "grad_norm": 2.6576473712921143, "learning_rate": 0.00011539491298527443, "loss": 3.0617, "step": 317 }, { "epoch": 1.2771084337349397, "grad_norm": 2.5956337451934814, "learning_rate": 0.00011512717536813924, "loss": 2.9754, "step": 318 }, { "epoch": 1.2811244979919678, "grad_norm": 2.8080995082855225, "learning_rate": 0.00011485943775100403, "loss": 3.1712, "step": 319 }, { "epoch": 1.285140562248996, "grad_norm": 2.4304864406585693, "learning_rate": 0.00011459170013386882, "loss": 3.0387, "step": 320 }, { "epoch": 1.2891566265060241, "grad_norm": 2.2777411937713623, "learning_rate": 0.0001143239625167336, "loss": 2.8357, "step": 321 }, { "epoch": 1.2931726907630523, "grad_norm": 2.370192289352417, "learning_rate": 0.0001140562248995984, "loss": 2.5937, "step": 322 }, { "epoch": 1.2971887550200802, "grad_norm": 3.0521585941314697, "learning_rate": 0.00011378848728246319, "loss": 4.4271, "step": 323 }, { "epoch": 1.3012048192771084, "grad_norm": 2.4153242111206055, "learning_rate": 0.00011352074966532798, "loss": 2.7952, "step": 324 }, { "epoch": 1.3052208835341366, "grad_norm": 2.629312038421631, "learning_rate": 0.00011325301204819279, "loss": 3.6324, "step": 325 }, { "epoch": 1.3092369477911647, "grad_norm": 2.0146517753601074, "learning_rate": 0.00011298527443105758, "loss": 2.3154, "step": 326 }, { "epoch": 1.3132530120481927, "grad_norm": 2.3414394855499268, "learning_rate": 0.00011271753681392236, "loss": 2.809, "step": 327 }, { "epoch": 1.3172690763052208, "grad_norm": 2.366577386856079, "learning_rate": 0.00011244979919678715, "loss": 3.7852, "step": 328 }, { "epoch": 1.321285140562249, "grad_norm": 2.661543130874634, "learning_rate": 0.00011218206157965195, "loss": 2.818, "step": 329 }, { "epoch": 1.3253012048192772, "grad_norm": 2.51835036277771, "learning_rate": 0.00011191432396251674, "loss": 2.8359, "step": 330 }, { "epoch": 1.3293172690763053, "grad_norm": 2.473179817199707, "learning_rate": 0.00011164658634538152, "loss": 2.8498, "step": 331 }, { "epoch": 1.3333333333333333, "grad_norm": 2.9637928009033203, "learning_rate": 0.00011137884872824634, "loss": 4.164, "step": 332 }, { "epoch": 1.3373493975903614, "grad_norm": 2.5028486251831055, "learning_rate": 0.00011111111111111112, "loss": 3.6701, "step": 333 }, { "epoch": 1.3413654618473896, "grad_norm": 3.149928092956543, "learning_rate": 0.00011084337349397591, "loss": 3.7949, "step": 334 }, { "epoch": 1.3453815261044177, "grad_norm": 2.7405877113342285, "learning_rate": 0.0001105756358768407, "loss": 3.2064, "step": 335 }, { "epoch": 1.3493975903614457, "grad_norm": 2.830744743347168, "learning_rate": 0.0001103078982597055, "loss": 2.8919, "step": 336 }, { "epoch": 1.3534136546184738, "grad_norm": 2.9335427284240723, "learning_rate": 0.00011004016064257027, "loss": 3.1013, "step": 337 }, { "epoch": 1.357429718875502, "grad_norm": 2.505171537399292, "learning_rate": 0.0001097724230254351, "loss": 3.206, "step": 338 }, { "epoch": 1.3614457831325302, "grad_norm": 3.127634286880493, "learning_rate": 0.00010950468540829987, "loss": 3.2454, "step": 339 }, { "epoch": 1.3654618473895583, "grad_norm": 2.7009451389312744, "learning_rate": 0.00010923694779116467, "loss": 3.0679, "step": 340 }, { "epoch": 1.3694779116465863, "grad_norm": 2.3906707763671875, "learning_rate": 0.00010896921017402946, "loss": 3.7267, "step": 341 }, { "epoch": 1.3734939759036144, "grad_norm": 2.4884233474731445, "learning_rate": 0.00010870147255689425, "loss": 3.2707, "step": 342 }, { "epoch": 1.3775100401606426, "grad_norm": 2.514148712158203, "learning_rate": 0.00010843373493975903, "loss": 3.0734, "step": 343 }, { "epoch": 1.3815261044176708, "grad_norm": 2.450438976287842, "learning_rate": 0.00010816599732262382, "loss": 2.7529, "step": 344 }, { "epoch": 1.3855421686746987, "grad_norm": 2.5931103229522705, "learning_rate": 0.00010789825970548863, "loss": 3.8578, "step": 345 }, { "epoch": 1.3895582329317269, "grad_norm": 2.386543035507202, "learning_rate": 0.00010763052208835342, "loss": 3.2145, "step": 346 }, { "epoch": 1.393574297188755, "grad_norm": 2.643378973007202, "learning_rate": 0.00010736278447121822, "loss": 2.7853, "step": 347 }, { "epoch": 1.3975903614457832, "grad_norm": 1.9885903596878052, "learning_rate": 0.00010709504685408301, "loss": 2.2022, "step": 348 }, { "epoch": 1.4016064257028114, "grad_norm": 2.6465091705322266, "learning_rate": 0.00010682730923694779, "loss": 3.5565, "step": 349 }, { "epoch": 1.4056224899598393, "grad_norm": 2.6052937507629395, "learning_rate": 0.00010655957161981258, "loss": 2.9741, "step": 350 }, { "epoch": 1.4096385542168675, "grad_norm": 2.7112314701080322, "learning_rate": 0.00010629183400267737, "loss": 4.0259, "step": 351 }, { "epoch": 1.4136546184738956, "grad_norm": 2.5356833934783936, "learning_rate": 0.00010602409638554218, "loss": 2.6879, "step": 352 }, { "epoch": 1.4176706827309236, "grad_norm": 2.745176315307617, "learning_rate": 0.00010575635876840697, "loss": 4.0105, "step": 353 }, { "epoch": 1.4216867469879517, "grad_norm": 2.5344765186309814, "learning_rate": 0.00010548862115127177, "loss": 2.9797, "step": 354 }, { "epoch": 1.4257028112449799, "grad_norm": 2.680912733078003, "learning_rate": 0.00010522088353413654, "loss": 3.3971, "step": 355 }, { "epoch": 1.429718875502008, "grad_norm": 3.498023271560669, "learning_rate": 0.00010495314591700134, "loss": 3.6706, "step": 356 }, { "epoch": 1.4337349397590362, "grad_norm": 2.4419398307800293, "learning_rate": 0.00010468540829986613, "loss": 2.6477, "step": 357 }, { "epoch": 1.4377510040160644, "grad_norm": 3.2264997959136963, "learning_rate": 0.00010441767068273094, "loss": 4.5181, "step": 358 }, { "epoch": 1.4417670682730923, "grad_norm": 2.5578315258026123, "learning_rate": 0.00010414993306559573, "loss": 2.6282, "step": 359 }, { "epoch": 1.4457831325301205, "grad_norm": 2.539045572280884, "learning_rate": 0.00010388219544846052, "loss": 2.6435, "step": 360 }, { "epoch": 1.4497991967871486, "grad_norm": 2.9697344303131104, "learning_rate": 0.0001036144578313253, "loss": 2.6676, "step": 361 }, { "epoch": 1.4538152610441766, "grad_norm": 2.606131076812744, "learning_rate": 0.0001033467202141901, "loss": 2.9316, "step": 362 }, { "epoch": 1.4578313253012047, "grad_norm": 3.290837049484253, "learning_rate": 0.00010307898259705489, "loss": 3.0869, "step": 363 }, { "epoch": 1.461847389558233, "grad_norm": 2.331320285797119, "learning_rate": 0.00010281124497991968, "loss": 2.555, "step": 364 }, { "epoch": 1.465863453815261, "grad_norm": 2.8447391986846924, "learning_rate": 0.00010254350736278449, "loss": 2.6998, "step": 365 }, { "epoch": 1.4698795180722892, "grad_norm": 2.6170618534088135, "learning_rate": 0.00010227576974564928, "loss": 2.7688, "step": 366 }, { "epoch": 1.4738955823293174, "grad_norm": 2.933560609817505, "learning_rate": 0.00010200803212851406, "loss": 3.0291, "step": 367 }, { "epoch": 1.4779116465863453, "grad_norm": 2.6285972595214844, "learning_rate": 0.00010174029451137885, "loss": 2.8629, "step": 368 }, { "epoch": 1.4819277108433735, "grad_norm": 3.2716546058654785, "learning_rate": 0.00010147255689424364, "loss": 3.1994, "step": 369 }, { "epoch": 1.4859437751004017, "grad_norm": 2.758296489715576, "learning_rate": 0.00010120481927710844, "loss": 2.6734, "step": 370 }, { "epoch": 1.4899598393574296, "grad_norm": 2.3439807891845703, "learning_rate": 0.00010093708165997322, "loss": 2.8747, "step": 371 }, { "epoch": 1.4939759036144578, "grad_norm": 2.4199349880218506, "learning_rate": 0.00010066934404283804, "loss": 2.7135, "step": 372 }, { "epoch": 1.497991967871486, "grad_norm": 2.8863987922668457, "learning_rate": 0.00010040160642570282, "loss": 3.3239, "step": 373 }, { "epoch": 1.502008032128514, "grad_norm": 2.5620765686035156, "learning_rate": 0.00010013386880856761, "loss": 2.5748, "step": 374 }, { "epoch": 1.5060240963855422, "grad_norm": 2.5705456733703613, "learning_rate": 9.98661311914324e-05, "loss": 3.4645, "step": 375 }, { "epoch": 1.5100401606425704, "grad_norm": 2.75276780128479, "learning_rate": 9.95983935742972e-05, "loss": 2.7345, "step": 376 }, { "epoch": 1.5140562248995983, "grad_norm": 2.5206143856048584, "learning_rate": 9.933065595716199e-05, "loss": 2.8325, "step": 377 }, { "epoch": 1.5180722891566265, "grad_norm": 2.3054890632629395, "learning_rate": 9.906291834002678e-05, "loss": 2.884, "step": 378 }, { "epoch": 1.5220883534136547, "grad_norm": 2.563084125518799, "learning_rate": 9.879518072289157e-05, "loss": 3.0262, "step": 379 }, { "epoch": 1.5261044176706826, "grad_norm": 2.575040817260742, "learning_rate": 9.852744310575637e-05, "loss": 3.096, "step": 380 }, { "epoch": 1.5301204819277108, "grad_norm": 2.3715319633483887, "learning_rate": 9.825970548862116e-05, "loss": 2.7163, "step": 381 }, { "epoch": 1.534136546184739, "grad_norm": 2.7323389053344727, "learning_rate": 9.799196787148595e-05, "loss": 2.792, "step": 382 }, { "epoch": 1.538152610441767, "grad_norm": 2.523524522781372, "learning_rate": 9.772423025435074e-05, "loss": 3.2821, "step": 383 }, { "epoch": 1.5421686746987953, "grad_norm": 2.533090114593506, "learning_rate": 9.745649263721554e-05, "loss": 2.7672, "step": 384 }, { "epoch": 1.5461847389558234, "grad_norm": 2.644031286239624, "learning_rate": 9.718875502008033e-05, "loss": 3.0318, "step": 385 }, { "epoch": 1.5502008032128514, "grad_norm": 3.1442739963531494, "learning_rate": 9.692101740294511e-05, "loss": 3.6628, "step": 386 }, { "epoch": 1.5542168674698795, "grad_norm": 2.403552532196045, "learning_rate": 9.665327978580992e-05, "loss": 2.4332, "step": 387 }, { "epoch": 1.5582329317269075, "grad_norm": 2.478534698486328, "learning_rate": 9.638554216867471e-05, "loss": 2.4746, "step": 388 }, { "epoch": 1.5622489959839356, "grad_norm": 2.7873339653015137, "learning_rate": 9.611780455153949e-05, "loss": 2.8514, "step": 389 }, { "epoch": 1.5662650602409638, "grad_norm": 2.751532793045044, "learning_rate": 9.58500669344043e-05, "loss": 2.9365, "step": 390 }, { "epoch": 1.570281124497992, "grad_norm": 2.8862998485565186, "learning_rate": 9.558232931726909e-05, "loss": 3.2632, "step": 391 }, { "epoch": 1.5742971887550201, "grad_norm": 2.5372817516326904, "learning_rate": 9.531459170013387e-05, "loss": 2.8649, "step": 392 }, { "epoch": 1.5783132530120483, "grad_norm": 2.428025007247925, "learning_rate": 9.504685408299867e-05, "loss": 2.6417, "step": 393 }, { "epoch": 1.5823293172690764, "grad_norm": 3.284771680831909, "learning_rate": 9.477911646586346e-05, "loss": 3.4804, "step": 394 }, { "epoch": 1.5863453815261044, "grad_norm": 2.8651950359344482, "learning_rate": 9.451137884872824e-05, "loss": 3.1454, "step": 395 }, { "epoch": 1.5903614457831325, "grad_norm": 3.078660011291504, "learning_rate": 9.424364123159304e-05, "loss": 3.5961, "step": 396 }, { "epoch": 1.5943775100401605, "grad_norm": 2.2207376956939697, "learning_rate": 9.397590361445784e-05, "loss": 2.3121, "step": 397 }, { "epoch": 1.5983935742971886, "grad_norm": 2.4094178676605225, "learning_rate": 9.370816599732262e-05, "loss": 2.7138, "step": 398 }, { "epoch": 1.6024096385542168, "grad_norm": 2.759876251220703, "learning_rate": 9.344042838018742e-05, "loss": 3.5605, "step": 399 }, { "epoch": 1.606425702811245, "grad_norm": 2.189237117767334, "learning_rate": 9.317269076305222e-05, "loss": 2.6023, "step": 400 }, { "epoch": 1.6104417670682731, "grad_norm": 2.585479736328125, "learning_rate": 9.2904953145917e-05, "loss": 3.2234, "step": 401 }, { "epoch": 1.6144578313253013, "grad_norm": 2.565342664718628, "learning_rate": 9.26372155287818e-05, "loss": 3.0341, "step": 402 }, { "epoch": 1.6184738955823295, "grad_norm": 2.4045302867889404, "learning_rate": 9.23694779116466e-05, "loss": 2.7032, "step": 403 }, { "epoch": 1.6224899598393574, "grad_norm": 3.0136139392852783, "learning_rate": 9.210174029451138e-05, "loss": 3.1651, "step": 404 }, { "epoch": 1.6265060240963856, "grad_norm": 2.253669261932373, "learning_rate": 9.183400267737617e-05, "loss": 2.2507, "step": 405 }, { "epoch": 1.6305220883534135, "grad_norm": 2.734966993331909, "learning_rate": 9.156626506024096e-05, "loss": 3.0798, "step": 406 }, { "epoch": 1.6345381526104417, "grad_norm": 2.955502986907959, "learning_rate": 9.129852744310576e-05, "loss": 3.086, "step": 407 }, { "epoch": 1.6385542168674698, "grad_norm": 3.2345542907714844, "learning_rate": 9.103078982597055e-05, "loss": 3.3553, "step": 408 }, { "epoch": 1.642570281124498, "grad_norm": 2.7762720584869385, "learning_rate": 9.076305220883534e-05, "loss": 3.4238, "step": 409 }, { "epoch": 1.6465863453815262, "grad_norm": 2.824641466140747, "learning_rate": 9.049531459170014e-05, "loss": 2.8925, "step": 410 }, { "epoch": 1.6506024096385543, "grad_norm": 2.754810094833374, "learning_rate": 9.022757697456493e-05, "loss": 2.9022, "step": 411 }, { "epoch": 1.6546184738955825, "grad_norm": 2.5305283069610596, "learning_rate": 8.995983935742972e-05, "loss": 2.927, "step": 412 }, { "epoch": 1.6586345381526104, "grad_norm": 2.796165943145752, "learning_rate": 8.969210174029451e-05, "loss": 2.9185, "step": 413 }, { "epoch": 1.6626506024096386, "grad_norm": 2.9504239559173584, "learning_rate": 8.942436412315931e-05, "loss": 3.3915, "step": 414 }, { "epoch": 1.6666666666666665, "grad_norm": 2.8904786109924316, "learning_rate": 8.91566265060241e-05, "loss": 2.8841, "step": 415 }, { "epoch": 1.6706827309236947, "grad_norm": 2.184354305267334, "learning_rate": 8.888888888888889e-05, "loss": 2.3859, "step": 416 }, { "epoch": 1.6746987951807228, "grad_norm": 3.1552340984344482, "learning_rate": 8.862115127175369e-05, "loss": 3.22, "step": 417 }, { "epoch": 1.678714859437751, "grad_norm": 3.2323250770568848, "learning_rate": 8.835341365461848e-05, "loss": 2.8859, "step": 418 }, { "epoch": 1.6827309236947792, "grad_norm": 2.726513147354126, "learning_rate": 8.808567603748327e-05, "loss": 3.0969, "step": 419 }, { "epoch": 1.6867469879518073, "grad_norm": 2.7404675483703613, "learning_rate": 8.781793842034806e-05, "loss": 2.76, "step": 420 }, { "epoch": 1.6907630522088355, "grad_norm": 3.433872699737549, "learning_rate": 8.755020080321286e-05, "loss": 3.1852, "step": 421 }, { "epoch": 1.6947791164658634, "grad_norm": 3.4727306365966797, "learning_rate": 8.728246318607765e-05, "loss": 3.6413, "step": 422 }, { "epoch": 1.6987951807228916, "grad_norm": 2.968161106109619, "learning_rate": 8.701472556894244e-05, "loss": 3.15, "step": 423 }, { "epoch": 1.7028112449799195, "grad_norm": 2.8164682388305664, "learning_rate": 8.674698795180724e-05, "loss": 3.0286, "step": 424 }, { "epoch": 1.7068273092369477, "grad_norm": 2.7942745685577393, "learning_rate": 8.647925033467203e-05, "loss": 3.2501, "step": 425 }, { "epoch": 1.7108433734939759, "grad_norm": 3.2419016361236572, "learning_rate": 8.621151271753681e-05, "loss": 4.3181, "step": 426 }, { "epoch": 1.714859437751004, "grad_norm": 3.3823928833007812, "learning_rate": 8.594377510040161e-05, "loss": 3.2917, "step": 427 }, { "epoch": 1.7188755020080322, "grad_norm": 2.8482446670532227, "learning_rate": 8.567603748326641e-05, "loss": 3.0338, "step": 428 }, { "epoch": 1.7228915662650603, "grad_norm": 2.435845375061035, "learning_rate": 8.540829986613119e-05, "loss": 2.5519, "step": 429 }, { "epoch": 1.7269076305220885, "grad_norm": 2.9163546562194824, "learning_rate": 8.514056224899599e-05, "loss": 3.72, "step": 430 }, { "epoch": 1.7309236947791165, "grad_norm": 2.3660037517547607, "learning_rate": 8.487282463186079e-05, "loss": 2.3941, "step": 431 }, { "epoch": 1.7349397590361446, "grad_norm": 2.527449131011963, "learning_rate": 8.460508701472556e-05, "loss": 2.9851, "step": 432 }, { "epoch": 1.7389558232931726, "grad_norm": 2.2324576377868652, "learning_rate": 8.433734939759037e-05, "loss": 2.6241, "step": 433 }, { "epoch": 1.7429718875502007, "grad_norm": 2.7165253162384033, "learning_rate": 8.406961178045516e-05, "loss": 2.7749, "step": 434 }, { "epoch": 1.7469879518072289, "grad_norm": 2.7401411533355713, "learning_rate": 8.380187416331994e-05, "loss": 2.9022, "step": 435 }, { "epoch": 1.751004016064257, "grad_norm": 2.518826961517334, "learning_rate": 8.353413654618474e-05, "loss": 2.7587, "step": 436 }, { "epoch": 1.7550200803212852, "grad_norm": 2.493936061859131, "learning_rate": 8.326639892904954e-05, "loss": 3.1417, "step": 437 }, { "epoch": 1.7590361445783134, "grad_norm": 2.747951030731201, "learning_rate": 8.299866131191432e-05, "loss": 2.6913, "step": 438 }, { "epoch": 1.7630522088353415, "grad_norm": 2.8907039165496826, "learning_rate": 8.273092369477911e-05, "loss": 2.4416, "step": 439 }, { "epoch": 1.7670682730923695, "grad_norm": 3.6564669609069824, "learning_rate": 8.246318607764392e-05, "loss": 3.9361, "step": 440 }, { "epoch": 1.7710843373493976, "grad_norm": 2.4362285137176514, "learning_rate": 8.21954484605087e-05, "loss": 2.461, "step": 441 }, { "epoch": 1.7751004016064256, "grad_norm": 3.2182202339172363, "learning_rate": 8.192771084337349e-05, "loss": 3.2511, "step": 442 }, { "epoch": 1.7791164658634537, "grad_norm": 3.2106211185455322, "learning_rate": 8.16599732262383e-05, "loss": 4.4307, "step": 443 }, { "epoch": 1.783132530120482, "grad_norm": 3.4369003772735596, "learning_rate": 8.139223560910308e-05, "loss": 4.08, "step": 444 }, { "epoch": 1.78714859437751, "grad_norm": 2.2681970596313477, "learning_rate": 8.112449799196787e-05, "loss": 2.3631, "step": 445 }, { "epoch": 1.7911646586345382, "grad_norm": 2.691133975982666, "learning_rate": 8.085676037483266e-05, "loss": 2.6157, "step": 446 }, { "epoch": 1.7951807228915664, "grad_norm": 2.9200479984283447, "learning_rate": 8.058902275769746e-05, "loss": 2.6649, "step": 447 }, { "epoch": 1.7991967871485943, "grad_norm": 2.787264108657837, "learning_rate": 8.032128514056225e-05, "loss": 2.763, "step": 448 }, { "epoch": 1.8032128514056225, "grad_norm": 2.940075635910034, "learning_rate": 8.005354752342704e-05, "loss": 2.9436, "step": 449 }, { "epoch": 1.8072289156626506, "grad_norm": 3.1111507415771484, "learning_rate": 7.978580990629184e-05, "loss": 3.1194, "step": 450 }, { "epoch": 1.8112449799196786, "grad_norm": 2.695709228515625, "learning_rate": 7.951807228915663e-05, "loss": 2.7517, "step": 451 }, { "epoch": 1.8152610441767068, "grad_norm": 2.939112663269043, "learning_rate": 7.925033467202142e-05, "loss": 3.7794, "step": 452 }, { "epoch": 1.819277108433735, "grad_norm": 2.583163022994995, "learning_rate": 7.898259705488621e-05, "loss": 3.0265, "step": 453 }, { "epoch": 1.823293172690763, "grad_norm": 2.496131181716919, "learning_rate": 7.8714859437751e-05, "loss": 2.5762, "step": 454 }, { "epoch": 1.8273092369477912, "grad_norm": 2.4272570610046387, "learning_rate": 7.84471218206158e-05, "loss": 2.758, "step": 455 }, { "epoch": 1.8313253012048194, "grad_norm": 2.4154021739959717, "learning_rate": 7.817938420348059e-05, "loss": 2.7325, "step": 456 }, { "epoch": 1.8353413654618473, "grad_norm": 2.5219106674194336, "learning_rate": 7.791164658634539e-05, "loss": 2.779, "step": 457 }, { "epoch": 1.8393574297188755, "grad_norm": 2.3390161991119385, "learning_rate": 7.764390896921018e-05, "loss": 2.2922, "step": 458 }, { "epoch": 1.8433734939759037, "grad_norm": 2.7101354598999023, "learning_rate": 7.737617135207497e-05, "loss": 2.9825, "step": 459 }, { "epoch": 1.8473895582329316, "grad_norm": 2.8510243892669678, "learning_rate": 7.710843373493976e-05, "loss": 2.8628, "step": 460 }, { "epoch": 1.8514056224899598, "grad_norm": 2.6924989223480225, "learning_rate": 7.684069611780456e-05, "loss": 2.6543, "step": 461 }, { "epoch": 1.855421686746988, "grad_norm": 2.6552584171295166, "learning_rate": 7.657295850066935e-05, "loss": 3.0625, "step": 462 }, { "epoch": 1.859437751004016, "grad_norm": 3.2962827682495117, "learning_rate": 7.630522088353414e-05, "loss": 3.308, "step": 463 }, { "epoch": 1.8634538152610443, "grad_norm": 3.0845699310302734, "learning_rate": 7.603748326639893e-05, "loss": 3.5178, "step": 464 }, { "epoch": 1.8674698795180724, "grad_norm": 2.768254518508911, "learning_rate": 7.576974564926373e-05, "loss": 3.6667, "step": 465 }, { "epoch": 1.8714859437751004, "grad_norm": 2.5801167488098145, "learning_rate": 7.550200803212851e-05, "loss": 2.7686, "step": 466 }, { "epoch": 1.8755020080321285, "grad_norm": 2.2853081226348877, "learning_rate": 7.523427041499331e-05, "loss": 2.2115, "step": 467 }, { "epoch": 1.8795180722891565, "grad_norm": 2.9309747219085693, "learning_rate": 7.49665327978581e-05, "loss": 2.9426, "step": 468 }, { "epoch": 1.8835341365461846, "grad_norm": 3.146700143814087, "learning_rate": 7.469879518072289e-05, "loss": 3.3903, "step": 469 }, { "epoch": 1.8875502008032128, "grad_norm": 3.3652424812316895, "learning_rate": 7.443105756358769e-05, "loss": 3.0085, "step": 470 }, { "epoch": 1.891566265060241, "grad_norm": 2.424377918243408, "learning_rate": 7.416331994645248e-05, "loss": 2.5145, "step": 471 }, { "epoch": 1.895582329317269, "grad_norm": 2.5642752647399902, "learning_rate": 7.389558232931726e-05, "loss": 3.1927, "step": 472 }, { "epoch": 1.8995983935742973, "grad_norm": 2.7574706077575684, "learning_rate": 7.362784471218207e-05, "loss": 2.6753, "step": 473 }, { "epoch": 1.9036144578313254, "grad_norm": 2.6844048500061035, "learning_rate": 7.336010709504686e-05, "loss": 2.7126, "step": 474 }, { "epoch": 1.9076305220883534, "grad_norm": 2.3251895904541016, "learning_rate": 7.309236947791164e-05, "loss": 2.5947, "step": 475 }, { "epoch": 1.9116465863453815, "grad_norm": 2.1562206745147705, "learning_rate": 7.282463186077644e-05, "loss": 2.2137, "step": 476 }, { "epoch": 1.9156626506024095, "grad_norm": 2.400747776031494, "learning_rate": 7.255689424364124e-05, "loss": 2.8869, "step": 477 }, { "epoch": 1.9196787148594376, "grad_norm": 3.1380369663238525, "learning_rate": 7.228915662650602e-05, "loss": 3.4202, "step": 478 }, { "epoch": 1.9236947791164658, "grad_norm": 2.9858291149139404, "learning_rate": 7.202141900937081e-05, "loss": 3.1519, "step": 479 }, { "epoch": 1.927710843373494, "grad_norm": 2.6354973316192627, "learning_rate": 7.175368139223562e-05, "loss": 2.8662, "step": 480 }, { "epoch": 1.9317269076305221, "grad_norm": 2.7349445819854736, "learning_rate": 7.14859437751004e-05, "loss": 4.2679, "step": 481 }, { "epoch": 1.9357429718875503, "grad_norm": 3.0139505863189697, "learning_rate": 7.121820615796519e-05, "loss": 2.9382, "step": 482 }, { "epoch": 1.9397590361445785, "grad_norm": 3.1879093647003174, "learning_rate": 7.095046854083e-05, "loss": 3.168, "step": 483 }, { "epoch": 1.9437751004016064, "grad_norm": 3.2778398990631104, "learning_rate": 7.068273092369478e-05, "loss": 3.4373, "step": 484 }, { "epoch": 1.9477911646586346, "grad_norm": 3.024111747741699, "learning_rate": 7.041499330655957e-05, "loss": 3.7807, "step": 485 }, { "epoch": 1.9518072289156625, "grad_norm": 2.750593423843384, "learning_rate": 7.014725568942436e-05, "loss": 3.4546, "step": 486 }, { "epoch": 1.9558232931726907, "grad_norm": 2.9757187366485596, "learning_rate": 6.987951807228917e-05, "loss": 3.0145, "step": 487 }, { "epoch": 1.9598393574297188, "grad_norm": 2.867292881011963, "learning_rate": 6.961178045515395e-05, "loss": 2.5524, "step": 488 }, { "epoch": 1.963855421686747, "grad_norm": 2.563595771789551, "learning_rate": 6.934404283801874e-05, "loss": 2.7503, "step": 489 }, { "epoch": 1.9678714859437751, "grad_norm": 2.52006459236145, "learning_rate": 6.907630522088355e-05, "loss": 3.0431, "step": 490 }, { "epoch": 1.9718875502008033, "grad_norm": 3.0700199604034424, "learning_rate": 6.880856760374833e-05, "loss": 3.7242, "step": 491 }, { "epoch": 1.9759036144578315, "grad_norm": 2.7504234313964844, "learning_rate": 6.854082998661312e-05, "loss": 2.6293, "step": 492 }, { "epoch": 1.9799196787148594, "grad_norm": 2.919828414916992, "learning_rate": 6.827309236947793e-05, "loss": 2.6278, "step": 493 }, { "epoch": 1.9839357429718876, "grad_norm": 2.453157663345337, "learning_rate": 6.80053547523427e-05, "loss": 2.2764, "step": 494 }, { "epoch": 1.9879518072289155, "grad_norm": 2.635430335998535, "learning_rate": 6.77376171352075e-05, "loss": 2.9467, "step": 495 }, { "epoch": 1.9919678714859437, "grad_norm": 2.7158102989196777, "learning_rate": 6.746987951807229e-05, "loss": 2.7886, "step": 496 }, { "epoch": 1.9959839357429718, "grad_norm": 2.3272292613983154, "learning_rate": 6.720214190093708e-05, "loss": 2.6445, "step": 497 }, { "epoch": 2.0, "grad_norm": 2.2954020500183105, "learning_rate": 6.693440428380188e-05, "loss": 2.5719, "step": 498 }, { "epoch": 2.0, "eval_loss": 0.8565791249275208, "eval_runtime": 200.8505, "eval_samples_per_second": 2.484, "eval_steps_per_second": 1.245, "step": 498 }, { "epoch": 2.004016064257028, "grad_norm": 2.3647961616516113, "learning_rate": 6.666666666666667e-05, "loss": 2.5357, "step": 499 }, { "epoch": 2.0080321285140563, "grad_norm": 2.052393674850464, "learning_rate": 6.639892904953146e-05, "loss": 2.1653, "step": 500 }, { "epoch": 2.0120481927710845, "grad_norm": 2.6393344402313232, "learning_rate": 6.613119143239626e-05, "loss": 2.2634, "step": 501 }, { "epoch": 2.0160642570281126, "grad_norm": 2.4461183547973633, "learning_rate": 6.586345381526105e-05, "loss": 2.7017, "step": 502 }, { "epoch": 2.0200803212851404, "grad_norm": 3.1604115962982178, "learning_rate": 6.559571619812584e-05, "loss": 3.6735, "step": 503 }, { "epoch": 2.0240963855421685, "grad_norm": 3.0627472400665283, "learning_rate": 6.532797858099063e-05, "loss": 2.9889, "step": 504 }, { "epoch": 2.0281124497991967, "grad_norm": 2.568150520324707, "learning_rate": 6.506024096385543e-05, "loss": 2.492, "step": 505 }, { "epoch": 2.032128514056225, "grad_norm": 2.2594618797302246, "learning_rate": 6.47925033467202e-05, "loss": 1.8152, "step": 506 }, { "epoch": 2.036144578313253, "grad_norm": 2.544188976287842, "learning_rate": 6.452476572958501e-05, "loss": 3.7016, "step": 507 }, { "epoch": 2.040160642570281, "grad_norm": 2.418565511703491, "learning_rate": 6.42570281124498e-05, "loss": 2.3062, "step": 508 }, { "epoch": 2.0441767068273093, "grad_norm": 2.3617923259735107, "learning_rate": 6.398929049531458e-05, "loss": 2.2887, "step": 509 }, { "epoch": 2.0481927710843375, "grad_norm": 2.4115524291992188, "learning_rate": 6.372155287817939e-05, "loss": 2.4596, "step": 510 }, { "epoch": 2.0522088353413657, "grad_norm": 2.763218402862549, "learning_rate": 6.345381526104418e-05, "loss": 2.7423, "step": 511 }, { "epoch": 2.0562248995983934, "grad_norm": 2.515378713607788, "learning_rate": 6.318607764390896e-05, "loss": 2.4356, "step": 512 }, { "epoch": 2.0602409638554215, "grad_norm": 2.809786796569824, "learning_rate": 6.291834002677377e-05, "loss": 3.3361, "step": 513 }, { "epoch": 2.0642570281124497, "grad_norm": 2.3717005252838135, "learning_rate": 6.265060240963856e-05, "loss": 3.0205, "step": 514 }, { "epoch": 2.068273092369478, "grad_norm": 2.7689290046691895, "learning_rate": 6.238286479250335e-05, "loss": 2.9104, "step": 515 }, { "epoch": 2.072289156626506, "grad_norm": 2.573058843612671, "learning_rate": 6.211512717536813e-05, "loss": 2.2966, "step": 516 }, { "epoch": 2.076305220883534, "grad_norm": 2.5662682056427, "learning_rate": 6.184738955823294e-05, "loss": 2.4407, "step": 517 }, { "epoch": 2.0803212851405624, "grad_norm": 2.475853681564331, "learning_rate": 6.157965194109773e-05, "loss": 2.2512, "step": 518 }, { "epoch": 2.0843373493975905, "grad_norm": 2.426939010620117, "learning_rate": 6.131191432396251e-05, "loss": 2.2575, "step": 519 }, { "epoch": 2.0883534136546187, "grad_norm": 2.709951877593994, "learning_rate": 6.104417670682732e-05, "loss": 2.2289, "step": 520 }, { "epoch": 2.0923694779116464, "grad_norm": 2.620199680328369, "learning_rate": 6.0776439089692105e-05, "loss": 2.6856, "step": 521 }, { "epoch": 2.0963855421686746, "grad_norm": 2.236469030380249, "learning_rate": 6.05087014725569e-05, "loss": 2.1652, "step": 522 }, { "epoch": 2.1004016064257027, "grad_norm": 2.4781830310821533, "learning_rate": 6.02409638554217e-05, "loss": 2.0519, "step": 523 }, { "epoch": 2.104417670682731, "grad_norm": 2.9179675579071045, "learning_rate": 5.9973226238286484e-05, "loss": 2.3534, "step": 524 }, { "epoch": 2.108433734939759, "grad_norm": 2.7088980674743652, "learning_rate": 5.9705488621151276e-05, "loss": 2.3717, "step": 525 }, { "epoch": 2.112449799196787, "grad_norm": 2.784228801727295, "learning_rate": 5.943775100401606e-05, "loss": 2.7936, "step": 526 }, { "epoch": 2.1164658634538154, "grad_norm": 3.1045587062835693, "learning_rate": 5.917001338688086e-05, "loss": 2.1785, "step": 527 }, { "epoch": 2.1204819277108435, "grad_norm": 2.7609670162200928, "learning_rate": 5.8902275769745655e-05, "loss": 2.4232, "step": 528 }, { "epoch": 2.1244979919678713, "grad_norm": 2.9791460037231445, "learning_rate": 5.863453815261044e-05, "loss": 2.6127, "step": 529 }, { "epoch": 2.1285140562248994, "grad_norm": 2.917396306991577, "learning_rate": 5.836680053547524e-05, "loss": 2.5008, "step": 530 }, { "epoch": 2.1325301204819276, "grad_norm": 3.066033124923706, "learning_rate": 5.809906291834003e-05, "loss": 2.8997, "step": 531 }, { "epoch": 2.1365461847389557, "grad_norm": 2.570894241333008, "learning_rate": 5.783132530120482e-05, "loss": 2.2987, "step": 532 }, { "epoch": 2.140562248995984, "grad_norm": 2.4431967735290527, "learning_rate": 5.756358768406962e-05, "loss": 2.1485, "step": 533 }, { "epoch": 2.144578313253012, "grad_norm": 2.789560079574585, "learning_rate": 5.729585006693441e-05, "loss": 2.3678, "step": 534 }, { "epoch": 2.1485943775100402, "grad_norm": 2.691913366317749, "learning_rate": 5.70281124497992e-05, "loss": 2.3469, "step": 535 }, { "epoch": 2.1526104417670684, "grad_norm": 2.472721815109253, "learning_rate": 5.676037483266399e-05, "loss": 2.0741, "step": 536 }, { "epoch": 2.1566265060240966, "grad_norm": 2.705008029937744, "learning_rate": 5.649263721552879e-05, "loss": 2.3399, "step": 537 }, { "epoch": 2.1606425702811247, "grad_norm": 2.8036177158355713, "learning_rate": 5.6224899598393576e-05, "loss": 2.4336, "step": 538 }, { "epoch": 2.1646586345381524, "grad_norm": 2.8112568855285645, "learning_rate": 5.595716198125837e-05, "loss": 2.4039, "step": 539 }, { "epoch": 2.1686746987951806, "grad_norm": 2.932802438735962, "learning_rate": 5.568942436412317e-05, "loss": 2.4175, "step": 540 }, { "epoch": 2.1726907630522088, "grad_norm": 3.0952837467193604, "learning_rate": 5.5421686746987955e-05, "loss": 2.4552, "step": 541 }, { "epoch": 2.176706827309237, "grad_norm": 2.6719419956207275, "learning_rate": 5.515394912985275e-05, "loss": 2.0765, "step": 542 }, { "epoch": 2.180722891566265, "grad_norm": 3.0576534271240234, "learning_rate": 5.488621151271755e-05, "loss": 2.417, "step": 543 }, { "epoch": 2.1847389558232932, "grad_norm": 3.0612807273864746, "learning_rate": 5.461847389558233e-05, "loss": 2.9868, "step": 544 }, { "epoch": 2.1887550200803214, "grad_norm": 3.5036559104919434, "learning_rate": 5.4350736278447126e-05, "loss": 2.7975, "step": 545 }, { "epoch": 2.1927710843373496, "grad_norm": 3.5645198822021484, "learning_rate": 5.408299866131191e-05, "loss": 2.8446, "step": 546 }, { "epoch": 2.1967871485943773, "grad_norm": 2.72088360786438, "learning_rate": 5.381526104417671e-05, "loss": 2.3907, "step": 547 }, { "epoch": 2.2008032128514055, "grad_norm": 3.901146411895752, "learning_rate": 5.3547523427041504e-05, "loss": 3.4091, "step": 548 }, { "epoch": 2.2048192771084336, "grad_norm": 2.9762930870056152, "learning_rate": 5.327978580990629e-05, "loss": 2.2808, "step": 549 }, { "epoch": 2.208835341365462, "grad_norm": 3.1252336502075195, "learning_rate": 5.301204819277109e-05, "loss": 2.3206, "step": 550 }, { "epoch": 2.21285140562249, "grad_norm": 3.61395525932312, "learning_rate": 5.274431057563588e-05, "loss": 2.9899, "step": 551 }, { "epoch": 2.216867469879518, "grad_norm": 3.035787582397461, "learning_rate": 5.247657295850067e-05, "loss": 2.2514, "step": 552 }, { "epoch": 2.2208835341365463, "grad_norm": 3.0700008869171143, "learning_rate": 5.220883534136547e-05, "loss": 2.7965, "step": 553 }, { "epoch": 2.2248995983935744, "grad_norm": 3.380383253097534, "learning_rate": 5.194109772423026e-05, "loss": 2.7258, "step": 554 }, { "epoch": 2.2289156626506026, "grad_norm": 3.3445475101470947, "learning_rate": 5.167336010709505e-05, "loss": 3.0532, "step": 555 }, { "epoch": 2.2329317269076308, "grad_norm": 3.305169105529785, "learning_rate": 5.140562248995984e-05, "loss": 2.7851, "step": 556 }, { "epoch": 2.2369477911646585, "grad_norm": 3.3952481746673584, "learning_rate": 5.113788487282464e-05, "loss": 2.6845, "step": 557 }, { "epoch": 2.2409638554216866, "grad_norm": 2.7673559188842773, "learning_rate": 5.0870147255689426e-05, "loss": 2.6067, "step": 558 }, { "epoch": 2.244979919678715, "grad_norm": 3.3448803424835205, "learning_rate": 5.060240963855422e-05, "loss": 2.4804, "step": 559 }, { "epoch": 2.248995983935743, "grad_norm": 2.797827959060669, "learning_rate": 5.033467202141902e-05, "loss": 2.1237, "step": 560 }, { "epoch": 2.253012048192771, "grad_norm": 2.9383599758148193, "learning_rate": 5.0066934404283804e-05, "loss": 2.3107, "step": 561 }, { "epoch": 2.2570281124497993, "grad_norm": 3.0028162002563477, "learning_rate": 4.97991967871486e-05, "loss": 3.2211, "step": 562 }, { "epoch": 2.2610441767068274, "grad_norm": 2.928341865539551, "learning_rate": 4.953145917001339e-05, "loss": 2.5173, "step": 563 }, { "epoch": 2.2650602409638556, "grad_norm": 2.9720232486724854, "learning_rate": 4.926372155287818e-05, "loss": 2.3146, "step": 564 }, { "epoch": 2.2690763052208833, "grad_norm": 3.558094024658203, "learning_rate": 4.8995983935742975e-05, "loss": 3.1953, "step": 565 }, { "epoch": 2.2730923694779115, "grad_norm": 3.0352494716644287, "learning_rate": 4.872824631860777e-05, "loss": 2.4965, "step": 566 }, { "epoch": 2.2771084337349397, "grad_norm": 2.7428176403045654, "learning_rate": 4.8460508701472554e-05, "loss": 2.1514, "step": 567 }, { "epoch": 2.281124497991968, "grad_norm": 2.3594534397125244, "learning_rate": 4.8192771084337354e-05, "loss": 1.8075, "step": 568 }, { "epoch": 2.285140562248996, "grad_norm": 3.3449742794036865, "learning_rate": 4.792503346720215e-05, "loss": 2.5945, "step": 569 }, { "epoch": 2.289156626506024, "grad_norm": 3.104633331298828, "learning_rate": 4.765729585006693e-05, "loss": 2.9666, "step": 570 }, { "epoch": 2.2931726907630523, "grad_norm": 3.094238758087158, "learning_rate": 4.738955823293173e-05, "loss": 2.489, "step": 571 }, { "epoch": 2.2971887550200805, "grad_norm": 3.381775379180908, "learning_rate": 4.712182061579652e-05, "loss": 2.9042, "step": 572 }, { "epoch": 2.3012048192771086, "grad_norm": 3.2117156982421875, "learning_rate": 4.685408299866131e-05, "loss": 2.6925, "step": 573 }, { "epoch": 2.305220883534137, "grad_norm": 2.8267903327941895, "learning_rate": 4.658634538152611e-05, "loss": 2.3816, "step": 574 }, { "epoch": 2.3092369477911645, "grad_norm": 3.068437099456787, "learning_rate": 4.63186077643909e-05, "loss": 2.3124, "step": 575 }, { "epoch": 2.3132530120481927, "grad_norm": 2.832303762435913, "learning_rate": 4.605087014725569e-05, "loss": 2.5169, "step": 576 }, { "epoch": 2.317269076305221, "grad_norm": 2.8893704414367676, "learning_rate": 4.578313253012048e-05, "loss": 2.3119, "step": 577 }, { "epoch": 2.321285140562249, "grad_norm": 2.952976703643799, "learning_rate": 4.5515394912985275e-05, "loss": 2.3063, "step": 578 }, { "epoch": 2.325301204819277, "grad_norm": 2.7303566932678223, "learning_rate": 4.524765729585007e-05, "loss": 2.5834, "step": 579 }, { "epoch": 2.3293172690763053, "grad_norm": 2.9680216312408447, "learning_rate": 4.497991967871486e-05, "loss": 2.249, "step": 580 }, { "epoch": 2.3333333333333335, "grad_norm": 2.997044324874878, "learning_rate": 4.4712182061579654e-05, "loss": 2.5954, "step": 581 }, { "epoch": 2.337349397590361, "grad_norm": 3.4494729042053223, "learning_rate": 4.4444444444444447e-05, "loss": 3.1359, "step": 582 }, { "epoch": 2.3413654618473894, "grad_norm": 3.1353585720062256, "learning_rate": 4.417670682730924e-05, "loss": 2.4317, "step": 583 }, { "epoch": 2.3453815261044175, "grad_norm": 2.9816396236419678, "learning_rate": 4.390896921017403e-05, "loss": 2.8438, "step": 584 }, { "epoch": 2.3493975903614457, "grad_norm": 2.6249794960021973, "learning_rate": 4.3641231593038825e-05, "loss": 2.0497, "step": 585 }, { "epoch": 2.353413654618474, "grad_norm": 2.8994345664978027, "learning_rate": 4.337349397590362e-05, "loss": 2.149, "step": 586 }, { "epoch": 2.357429718875502, "grad_norm": 3.8927950859069824, "learning_rate": 4.3105756358768404e-05, "loss": 3.0218, "step": 587 }, { "epoch": 2.36144578313253, "grad_norm": 3.120274543762207, "learning_rate": 4.2838018741633203e-05, "loss": 2.1973, "step": 588 }, { "epoch": 2.3654618473895583, "grad_norm": 3.104851007461548, "learning_rate": 4.2570281124497996e-05, "loss": 2.3442, "step": 589 }, { "epoch": 2.3694779116465865, "grad_norm": 2.97161602973938, "learning_rate": 4.230254350736278e-05, "loss": 2.5706, "step": 590 }, { "epoch": 2.3734939759036147, "grad_norm": 2.6856470108032227, "learning_rate": 4.203480589022758e-05, "loss": 2.0781, "step": 591 }, { "epoch": 2.3775100401606424, "grad_norm": 2.9654481410980225, "learning_rate": 4.176706827309237e-05, "loss": 2.2495, "step": 592 }, { "epoch": 2.3815261044176705, "grad_norm": 2.861020088195801, "learning_rate": 4.149933065595716e-05, "loss": 1.9942, "step": 593 }, { "epoch": 2.3855421686746987, "grad_norm": 3.413158893585205, "learning_rate": 4.123159303882196e-05, "loss": 2.6585, "step": 594 }, { "epoch": 2.389558232931727, "grad_norm": 3.1313233375549316, "learning_rate": 4.0963855421686746e-05, "loss": 2.9493, "step": 595 }, { "epoch": 2.393574297188755, "grad_norm": 3.325638771057129, "learning_rate": 4.069611780455154e-05, "loss": 2.7101, "step": 596 }, { "epoch": 2.397590361445783, "grad_norm": 2.991661787033081, "learning_rate": 4.042838018741633e-05, "loss": 2.5683, "step": 597 }, { "epoch": 2.4016064257028114, "grad_norm": 3.0619921684265137, "learning_rate": 4.0160642570281125e-05, "loss": 2.5722, "step": 598 }, { "epoch": 2.4056224899598395, "grad_norm": 2.730375289916992, "learning_rate": 3.989290495314592e-05, "loss": 2.2107, "step": 599 }, { "epoch": 2.4096385542168672, "grad_norm": 2.5859103202819824, "learning_rate": 3.962516733601071e-05, "loss": 2.0576, "step": 600 }, { "epoch": 2.4136546184738954, "grad_norm": 2.8956499099731445, "learning_rate": 3.93574297188755e-05, "loss": 2.1889, "step": 601 }, { "epoch": 2.4176706827309236, "grad_norm": 2.575547933578491, "learning_rate": 3.9089692101740296e-05, "loss": 1.9322, "step": 602 }, { "epoch": 2.4216867469879517, "grad_norm": 3.3304378986358643, "learning_rate": 3.882195448460509e-05, "loss": 2.4677, "step": 603 }, { "epoch": 2.42570281124498, "grad_norm": 3.5554420948028564, "learning_rate": 3.855421686746988e-05, "loss": 2.6703, "step": 604 }, { "epoch": 2.429718875502008, "grad_norm": 3.415844440460205, "learning_rate": 3.8286479250334675e-05, "loss": 2.9157, "step": 605 }, { "epoch": 2.433734939759036, "grad_norm": 3.127218008041382, "learning_rate": 3.801874163319947e-05, "loss": 2.416, "step": 606 }, { "epoch": 2.4377510040160644, "grad_norm": 3.796701192855835, "learning_rate": 3.7751004016064253e-05, "loss": 2.3505, "step": 607 }, { "epoch": 2.4417670682730925, "grad_norm": 3.6044912338256836, "learning_rate": 3.748326639892905e-05, "loss": 2.8561, "step": 608 }, { "epoch": 2.4457831325301207, "grad_norm": 3.2551517486572266, "learning_rate": 3.7215528781793846e-05, "loss": 2.5376, "step": 609 }, { "epoch": 2.4497991967871484, "grad_norm": 2.890302896499634, "learning_rate": 3.694779116465863e-05, "loss": 2.2256, "step": 610 }, { "epoch": 2.4538152610441766, "grad_norm": 3.478085517883301, "learning_rate": 3.668005354752343e-05, "loss": 2.6602, "step": 611 }, { "epoch": 2.4578313253012047, "grad_norm": 3.682518720626831, "learning_rate": 3.641231593038822e-05, "loss": 2.8083, "step": 612 }, { "epoch": 2.461847389558233, "grad_norm": 2.841364860534668, "learning_rate": 3.614457831325301e-05, "loss": 2.0827, "step": 613 }, { "epoch": 2.465863453815261, "grad_norm": 2.784315347671509, "learning_rate": 3.587684069611781e-05, "loss": 3.9997, "step": 614 }, { "epoch": 2.4698795180722892, "grad_norm": 3.153395652770996, "learning_rate": 3.5609103078982596e-05, "loss": 2.3443, "step": 615 }, { "epoch": 2.4738955823293174, "grad_norm": 3.2817304134368896, "learning_rate": 3.534136546184739e-05, "loss": 2.6729, "step": 616 }, { "epoch": 2.4779116465863456, "grad_norm": 2.8291358947753906, "learning_rate": 3.507362784471218e-05, "loss": 2.1918, "step": 617 }, { "epoch": 2.4819277108433733, "grad_norm": 3.548492670059204, "learning_rate": 3.4805890227576974e-05, "loss": 3.5277, "step": 618 }, { "epoch": 2.4859437751004014, "grad_norm": 9.622389793395996, "learning_rate": 3.4538152610441774e-05, "loss": 3.3926, "step": 619 }, { "epoch": 2.4899598393574296, "grad_norm": 3.489105224609375, "learning_rate": 3.427041499330656e-05, "loss": 2.5828, "step": 620 }, { "epoch": 2.4939759036144578, "grad_norm": 2.7694857120513916, "learning_rate": 3.400267737617135e-05, "loss": 1.9917, "step": 621 }, { "epoch": 2.497991967871486, "grad_norm": 3.2993392944335938, "learning_rate": 3.3734939759036146e-05, "loss": 2.8177, "step": 622 }, { "epoch": 2.502008032128514, "grad_norm": 2.863051176071167, "learning_rate": 3.346720214190094e-05, "loss": 2.0999, "step": 623 }, { "epoch": 2.5060240963855422, "grad_norm": 3.025731086730957, "learning_rate": 3.319946452476573e-05, "loss": 2.555, "step": 624 }, { "epoch": 2.5100401606425704, "grad_norm": 3.236588716506958, "learning_rate": 3.2931726907630524e-05, "loss": 2.3746, "step": 625 }, { "epoch": 2.5140562248995986, "grad_norm": 3.071715831756592, "learning_rate": 3.266398929049532e-05, "loss": 2.1943, "step": 626 }, { "epoch": 2.5180722891566267, "grad_norm": 3.353304147720337, "learning_rate": 3.23962516733601e-05, "loss": 3.2267, "step": 627 }, { "epoch": 2.522088353413655, "grad_norm": 2.9166722297668457, "learning_rate": 3.21285140562249e-05, "loss": 2.5768, "step": 628 }, { "epoch": 2.5261044176706826, "grad_norm": 2.571737051010132, "learning_rate": 3.1860776439089695e-05, "loss": 2.4097, "step": 629 }, { "epoch": 2.5301204819277108, "grad_norm": 3.2051124572753906, "learning_rate": 3.159303882195448e-05, "loss": 2.6875, "step": 630 }, { "epoch": 2.534136546184739, "grad_norm": 3.414586067199707, "learning_rate": 3.132530120481928e-05, "loss": 2.467, "step": 631 }, { "epoch": 2.538152610441767, "grad_norm": 3.201895236968994, "learning_rate": 3.105756358768407e-05, "loss": 2.6332, "step": 632 }, { "epoch": 2.5421686746987953, "grad_norm": 3.2875518798828125, "learning_rate": 3.078982597054887e-05, "loss": 3.0367, "step": 633 }, { "epoch": 2.5461847389558234, "grad_norm": 2.6989524364471436, "learning_rate": 3.052208835341366e-05, "loss": 2.1665, "step": 634 }, { "epoch": 2.550200803212851, "grad_norm": 2.7747488021850586, "learning_rate": 3.025435073627845e-05, "loss": 2.1499, "step": 635 }, { "epoch": 2.5542168674698793, "grad_norm": 3.4082605838775635, "learning_rate": 2.9986613119143242e-05, "loss": 2.6462, "step": 636 }, { "epoch": 2.5582329317269075, "grad_norm": 2.713757276535034, "learning_rate": 2.971887550200803e-05, "loss": 2.09, "step": 637 }, { "epoch": 2.5622489959839356, "grad_norm": 3.2788338661193848, "learning_rate": 2.9451137884872827e-05, "loss": 2.3322, "step": 638 }, { "epoch": 2.566265060240964, "grad_norm": 2.6642184257507324, "learning_rate": 2.918340026773762e-05, "loss": 2.1751, "step": 639 }, { "epoch": 2.570281124497992, "grad_norm": 3.069793224334717, "learning_rate": 2.891566265060241e-05, "loss": 2.2499, "step": 640 }, { "epoch": 2.57429718875502, "grad_norm": 3.132709503173828, "learning_rate": 2.8647925033467206e-05, "loss": 2.585, "step": 641 }, { "epoch": 2.5783132530120483, "grad_norm": 3.27109432220459, "learning_rate": 2.8380187416331995e-05, "loss": 2.4458, "step": 642 }, { "epoch": 2.5823293172690764, "grad_norm": 3.5450148582458496, "learning_rate": 2.8112449799196788e-05, "loss": 3.8692, "step": 643 }, { "epoch": 2.5863453815261046, "grad_norm": 3.2768943309783936, "learning_rate": 2.7844712182061584e-05, "loss": 2.4152, "step": 644 }, { "epoch": 2.5903614457831328, "grad_norm": 3.1916306018829346, "learning_rate": 2.7576974564926374e-05, "loss": 2.5376, "step": 645 }, { "epoch": 2.5943775100401605, "grad_norm": 2.7519237995147705, "learning_rate": 2.7309236947791167e-05, "loss": 2.1762, "step": 646 }, { "epoch": 2.5983935742971886, "grad_norm": 3.649415969848633, "learning_rate": 2.7041499330655956e-05, "loss": 3.0767, "step": 647 }, { "epoch": 2.602409638554217, "grad_norm": 3.1575088500976562, "learning_rate": 2.6773761713520752e-05, "loss": 2.5746, "step": 648 }, { "epoch": 2.606425702811245, "grad_norm": 3.1661970615386963, "learning_rate": 2.6506024096385545e-05, "loss": 2.8486, "step": 649 }, { "epoch": 2.610441767068273, "grad_norm": 3.374446392059326, "learning_rate": 2.6238286479250334e-05, "loss": 3.0536, "step": 650 }, { "epoch": 2.6144578313253013, "grad_norm": 3.2961578369140625, "learning_rate": 2.597054886211513e-05, "loss": 2.403, "step": 651 }, { "epoch": 2.6184738955823295, "grad_norm": 3.078670024871826, "learning_rate": 2.570281124497992e-05, "loss": 2.0923, "step": 652 }, { "epoch": 2.622489959839357, "grad_norm": 3.625155448913574, "learning_rate": 2.5435073627844713e-05, "loss": 3.3948, "step": 653 }, { "epoch": 2.6265060240963853, "grad_norm": 3.2434301376342773, "learning_rate": 2.516733601070951e-05, "loss": 3.0131, "step": 654 }, { "epoch": 2.6305220883534135, "grad_norm": 3.321974515914917, "learning_rate": 2.48995983935743e-05, "loss": 2.5972, "step": 655 }, { "epoch": 2.6345381526104417, "grad_norm": 2.6846182346343994, "learning_rate": 2.463186077643909e-05, "loss": 2.2812, "step": 656 }, { "epoch": 2.63855421686747, "grad_norm": 2.814183235168457, "learning_rate": 2.4364123159303884e-05, "loss": 2.1195, "step": 657 }, { "epoch": 2.642570281124498, "grad_norm": 2.640397310256958, "learning_rate": 2.4096385542168677e-05, "loss": 2.1728, "step": 658 }, { "epoch": 2.646586345381526, "grad_norm": 3.7056844234466553, "learning_rate": 2.3828647925033466e-05, "loss": 2.8224, "step": 659 }, { "epoch": 2.6506024096385543, "grad_norm": 2.740823268890381, "learning_rate": 2.356091030789826e-05, "loss": 2.3886, "step": 660 }, { "epoch": 2.6546184738955825, "grad_norm": 2.689279079437256, "learning_rate": 2.3293172690763055e-05, "loss": 2.3151, "step": 661 }, { "epoch": 2.6586345381526106, "grad_norm": 3.4579248428344727, "learning_rate": 2.3025435073627845e-05, "loss": 2.7812, "step": 662 }, { "epoch": 2.662650602409639, "grad_norm": 3.293381690979004, "learning_rate": 2.2757697456492638e-05, "loss": 2.9381, "step": 663 }, { "epoch": 2.6666666666666665, "grad_norm": 3.3860654830932617, "learning_rate": 2.248995983935743e-05, "loss": 2.4111, "step": 664 }, { "epoch": 2.6706827309236947, "grad_norm": 3.3504996299743652, "learning_rate": 2.2222222222222223e-05, "loss": 2.4411, "step": 665 }, { "epoch": 2.674698795180723, "grad_norm": 3.2323498725891113, "learning_rate": 2.1954484605087016e-05, "loss": 2.6294, "step": 666 }, { "epoch": 2.678714859437751, "grad_norm": 2.935426950454712, "learning_rate": 2.168674698795181e-05, "loss": 2.5489, "step": 667 }, { "epoch": 2.682730923694779, "grad_norm": 3.483436346054077, "learning_rate": 2.1419009370816602e-05, "loss": 2.7512, "step": 668 }, { "epoch": 2.6867469879518073, "grad_norm": 3.4001944065093994, "learning_rate": 2.115127175368139e-05, "loss": 2.4015, "step": 669 }, { "epoch": 2.6907630522088355, "grad_norm": 3.6413683891296387, "learning_rate": 2.0883534136546184e-05, "loss": 3.5122, "step": 670 }, { "epoch": 2.694779116465863, "grad_norm": 2.5411088466644287, "learning_rate": 2.061579651941098e-05, "loss": 2.0925, "step": 671 }, { "epoch": 2.6987951807228914, "grad_norm": 3.1367125511169434, "learning_rate": 2.034805890227577e-05, "loss": 2.5457, "step": 672 }, { "epoch": 2.7028112449799195, "grad_norm": 3.300114393234253, "learning_rate": 2.0080321285140562e-05, "loss": 3.0402, "step": 673 }, { "epoch": 2.7068273092369477, "grad_norm": 2.744513750076294, "learning_rate": 1.9812583668005355e-05, "loss": 2.2273, "step": 674 }, { "epoch": 2.710843373493976, "grad_norm": 3.0049889087677, "learning_rate": 1.9544846050870148e-05, "loss": 2.4656, "step": 675 }, { "epoch": 2.714859437751004, "grad_norm": 2.9064860343933105, "learning_rate": 1.927710843373494e-05, "loss": 2.3855, "step": 676 }, { "epoch": 2.718875502008032, "grad_norm": 3.317073106765747, "learning_rate": 1.9009370816599734e-05, "loss": 2.7036, "step": 677 }, { "epoch": 2.7228915662650603, "grad_norm": 3.580209732055664, "learning_rate": 1.8741633199464527e-05, "loss": 2.4416, "step": 678 }, { "epoch": 2.7269076305220885, "grad_norm": 3.0195388793945312, "learning_rate": 1.8473895582329316e-05, "loss": 2.0284, "step": 679 }, { "epoch": 2.7309236947791167, "grad_norm": 3.5155584812164307, "learning_rate": 1.820615796519411e-05, "loss": 3.6898, "step": 680 }, { "epoch": 2.734939759036145, "grad_norm": 3.3643851280212402, "learning_rate": 1.7938420348058905e-05, "loss": 2.7534, "step": 681 }, { "epoch": 2.7389558232931726, "grad_norm": 3.949350595474243, "learning_rate": 1.7670682730923694e-05, "loss": 3.6933, "step": 682 }, { "epoch": 2.7429718875502007, "grad_norm": 2.7811617851257324, "learning_rate": 1.7402945113788487e-05, "loss": 2.0857, "step": 683 }, { "epoch": 2.746987951807229, "grad_norm": 3.3071796894073486, "learning_rate": 1.713520749665328e-05, "loss": 2.9454, "step": 684 }, { "epoch": 2.751004016064257, "grad_norm": 3.181541919708252, "learning_rate": 1.6867469879518073e-05, "loss": 2.4977, "step": 685 }, { "epoch": 2.755020080321285, "grad_norm": 2.8570432662963867, "learning_rate": 1.6599732262382866e-05, "loss": 2.2448, "step": 686 }, { "epoch": 2.7590361445783134, "grad_norm": 2.8519392013549805, "learning_rate": 1.633199464524766e-05, "loss": 2.0659, "step": 687 }, { "epoch": 2.7630522088353415, "grad_norm": 3.0057828426361084, "learning_rate": 1.606425702811245e-05, "loss": 2.711, "step": 688 }, { "epoch": 2.7670682730923692, "grad_norm": 3.7644693851470947, "learning_rate": 1.579651941097724e-05, "loss": 2.7368, "step": 689 }, { "epoch": 2.7710843373493974, "grad_norm": 3.339076519012451, "learning_rate": 1.5528781793842034e-05, "loss": 2.4372, "step": 690 }, { "epoch": 2.7751004016064256, "grad_norm": 3.3303468227386475, "learning_rate": 1.526104417670683e-05, "loss": 2.1496, "step": 691 }, { "epoch": 2.7791164658634537, "grad_norm": 3.007516384124756, "learning_rate": 1.4993306559571621e-05, "loss": 2.0637, "step": 692 }, { "epoch": 2.783132530120482, "grad_norm": 3.2054901123046875, "learning_rate": 1.4725568942436414e-05, "loss": 2.6325, "step": 693 }, { "epoch": 2.78714859437751, "grad_norm": 3.089660882949829, "learning_rate": 1.4457831325301205e-05, "loss": 2.6186, "step": 694 }, { "epoch": 2.791164658634538, "grad_norm": 3.6075477600097656, "learning_rate": 1.4190093708165998e-05, "loss": 3.04, "step": 695 }, { "epoch": 2.7951807228915664, "grad_norm": 2.9559810161590576, "learning_rate": 1.3922356091030792e-05, "loss": 2.1752, "step": 696 }, { "epoch": 2.7991967871485945, "grad_norm": 3.062072992324829, "learning_rate": 1.3654618473895583e-05, "loss": 2.0509, "step": 697 }, { "epoch": 2.8032128514056227, "grad_norm": 4.112563610076904, "learning_rate": 1.3386880856760376e-05, "loss": 2.937, "step": 698 }, { "epoch": 2.807228915662651, "grad_norm": 3.2194480895996094, "learning_rate": 1.3119143239625167e-05, "loss": 2.2974, "step": 699 }, { "epoch": 2.8112449799196786, "grad_norm": 3.2111270427703857, "learning_rate": 1.285140562248996e-05, "loss": 2.3903, "step": 700 }, { "epoch": 2.8152610441767068, "grad_norm": 3.1619982719421387, "learning_rate": 1.2583668005354755e-05, "loss": 2.154, "step": 701 }, { "epoch": 2.819277108433735, "grad_norm": 3.0533196926116943, "learning_rate": 1.2315930388219546e-05, "loss": 2.8862, "step": 702 }, { "epoch": 2.823293172690763, "grad_norm": 2.838397264480591, "learning_rate": 1.2048192771084338e-05, "loss": 2.1974, "step": 703 }, { "epoch": 2.8273092369477912, "grad_norm": 2.960359573364258, "learning_rate": 1.178045515394913e-05, "loss": 2.2714, "step": 704 }, { "epoch": 2.8313253012048194, "grad_norm": 3.3387844562530518, "learning_rate": 1.1512717536813922e-05, "loss": 2.5617, "step": 705 }, { "epoch": 2.835341365461847, "grad_norm": 3.802029609680176, "learning_rate": 1.1244979919678715e-05, "loss": 2.6791, "step": 706 }, { "epoch": 2.8393574297188753, "grad_norm": 3.0797119140625, "learning_rate": 1.0977242302543508e-05, "loss": 2.008, "step": 707 }, { "epoch": 2.8433734939759034, "grad_norm": 3.6929612159729004, "learning_rate": 1.0709504685408301e-05, "loss": 3.0253, "step": 708 }, { "epoch": 2.8473895582329316, "grad_norm": 3.409666061401367, "learning_rate": 1.0441767068273092e-05, "loss": 2.488, "step": 709 }, { "epoch": 2.8514056224899598, "grad_norm": 3.4419896602630615, "learning_rate": 1.0174029451137885e-05, "loss": 2.5107, "step": 710 }, { "epoch": 2.855421686746988, "grad_norm": 2.9970462322235107, "learning_rate": 9.906291834002678e-06, "loss": 2.4561, "step": 711 }, { "epoch": 2.859437751004016, "grad_norm": 2.9567370414733887, "learning_rate": 9.63855421686747e-06, "loss": 2.0972, "step": 712 }, { "epoch": 2.8634538152610443, "grad_norm": 3.134462356567383, "learning_rate": 9.370816599732263e-06, "loss": 2.4256, "step": 713 }, { "epoch": 2.8674698795180724, "grad_norm": 3.376096487045288, "learning_rate": 9.103078982597054e-06, "loss": 2.221, "step": 714 }, { "epoch": 2.8714859437751006, "grad_norm": 3.569254159927368, "learning_rate": 8.835341365461847e-06, "loss": 2.379, "step": 715 }, { "epoch": 2.8755020080321287, "grad_norm": 3.4028611183166504, "learning_rate": 8.56760374832664e-06, "loss": 2.3297, "step": 716 }, { "epoch": 2.8795180722891565, "grad_norm": 3.772540807723999, "learning_rate": 8.299866131191433e-06, "loss": 2.9839, "step": 717 }, { "epoch": 2.8835341365461846, "grad_norm": 3.2679340839385986, "learning_rate": 8.032128514056226e-06, "loss": 2.3875, "step": 718 }, { "epoch": 2.887550200803213, "grad_norm": 3.6074769496917725, "learning_rate": 7.764390896921017e-06, "loss": 2.9021, "step": 719 }, { "epoch": 2.891566265060241, "grad_norm": 3.7479116916656494, "learning_rate": 7.4966532797858104e-06, "loss": 2.5803, "step": 720 }, { "epoch": 2.895582329317269, "grad_norm": 3.051452875137329, "learning_rate": 7.228915662650602e-06, "loss": 2.9504, "step": 721 }, { "epoch": 2.8995983935742973, "grad_norm": 3.341724157333374, "learning_rate": 6.961178045515396e-06, "loss": 2.8643, "step": 722 }, { "epoch": 2.9036144578313254, "grad_norm": 2.8065922260284424, "learning_rate": 6.693440428380188e-06, "loss": 2.6456, "step": 723 }, { "epoch": 2.907630522088353, "grad_norm": 3.295828342437744, "learning_rate": 6.42570281124498e-06, "loss": 3.2691, "step": 724 }, { "epoch": 2.9116465863453813, "grad_norm": 3.15494966506958, "learning_rate": 6.157965194109773e-06, "loss": 2.3256, "step": 725 }, { "epoch": 2.9156626506024095, "grad_norm": 3.146188259124756, "learning_rate": 5.890227576974565e-06, "loss": 2.5247, "step": 726 }, { "epoch": 2.9196787148594376, "grad_norm": 3.042181968688965, "learning_rate": 5.622489959839358e-06, "loss": 2.3458, "step": 727 }, { "epoch": 2.923694779116466, "grad_norm": 2.8072509765625, "learning_rate": 5.3547523427041504e-06, "loss": 2.2129, "step": 728 }, { "epoch": 2.927710843373494, "grad_norm": 3.1902520656585693, "learning_rate": 5.087014725568942e-06, "loss": 2.1905, "step": 729 }, { "epoch": 2.931726907630522, "grad_norm": 3.706218719482422, "learning_rate": 4.819277108433735e-06, "loss": 2.8587, "step": 730 }, { "epoch": 2.9357429718875503, "grad_norm": 3.516908645629883, "learning_rate": 4.551539491298527e-06, "loss": 3.0003, "step": 731 }, { "epoch": 2.9397590361445785, "grad_norm": 3.9051806926727295, "learning_rate": 4.28380187416332e-06, "loss": 2.6986, "step": 732 }, { "epoch": 2.9437751004016066, "grad_norm": 2.434493064880371, "learning_rate": 4.016064257028113e-06, "loss": 2.0143, "step": 733 }, { "epoch": 2.9477911646586348, "grad_norm": 3.514988899230957, "learning_rate": 3.7483266398929052e-06, "loss": 2.5539, "step": 734 }, { "epoch": 2.9518072289156625, "grad_norm": 3.145475387573242, "learning_rate": 3.480589022757698e-06, "loss": 2.3991, "step": 735 }, { "epoch": 2.9558232931726907, "grad_norm": 3.0328280925750732, "learning_rate": 3.21285140562249e-06, "loss": 2.4384, "step": 736 }, { "epoch": 2.959839357429719, "grad_norm": 3.584406614303589, "learning_rate": 2.9451137884872824e-06, "loss": 2.219, "step": 737 }, { "epoch": 2.963855421686747, "grad_norm": 2.8902695178985596, "learning_rate": 2.6773761713520752e-06, "loss": 2.0701, "step": 738 }, { "epoch": 2.967871485943775, "grad_norm": 2.714848518371582, "learning_rate": 2.4096385542168676e-06, "loss": 2.3578, "step": 739 }, { "epoch": 2.9718875502008033, "grad_norm": 3.4589223861694336, "learning_rate": 2.14190093708166e-06, "loss": 2.4076, "step": 740 }, { "epoch": 2.9759036144578315, "grad_norm": 2.8250577449798584, "learning_rate": 1.8741633199464526e-06, "loss": 2.2688, "step": 741 }, { "epoch": 2.979919678714859, "grad_norm": 3.090301752090454, "learning_rate": 1.606425702811245e-06, "loss": 2.0527, "step": 742 }, { "epoch": 2.9839357429718874, "grad_norm": 3.82488751411438, "learning_rate": 1.3386880856760376e-06, "loss": 2.9784, "step": 743 }, { "epoch": 2.9879518072289155, "grad_norm": 3.046949863433838, "learning_rate": 1.07095046854083e-06, "loss": 2.987, "step": 744 }, { "epoch": 2.9919678714859437, "grad_norm": 3.08667254447937, "learning_rate": 8.032128514056225e-07, "loss": 2.3121, "step": 745 }, { "epoch": 2.995983935742972, "grad_norm": 3.114004611968994, "learning_rate": 5.35475234270415e-07, "loss": 2.4549, "step": 746 }, { "epoch": 3.0, "grad_norm": 3.1294381618499756, "learning_rate": 2.677376171352075e-07, "loss": 2.2527, "step": 747 }, { "epoch": 3.0, "eval_loss": 0.8732815980911255, "eval_runtime": 201.6297, "eval_samples_per_second": 2.475, "eval_steps_per_second": 1.24, "step": 747 } ], "logging_steps": 1, "max_steps": 747, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0605631120002253e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }