{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 747,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004016064257028112,
      "grad_norm": 6.783391952514648,
      "learning_rate": 0.0002,
      "loss": 17.0626,
      "step": 1
    },
    {
      "epoch": 0.008032128514056224,
      "grad_norm": 6.832010269165039,
      "learning_rate": 0.0001997322623828648,
      "loss": 16.3736,
      "step": 2
    },
    {
      "epoch": 0.012048192771084338,
      "grad_norm": 4.412657260894775,
      "learning_rate": 0.0001994645247657296,
      "loss": 13.7202,
      "step": 3
    },
    {
      "epoch": 0.01606425702811245,
      "grad_norm": 4.6994500160217285,
      "learning_rate": 0.0001991967871485944,
      "loss": 12.1103,
      "step": 4
    },
    {
      "epoch": 0.020080321285140562,
      "grad_norm": 5.078355312347412,
      "learning_rate": 0.00019892904953145918,
      "loss": 11.9491,
      "step": 5
    },
    {
      "epoch": 0.024096385542168676,
      "grad_norm": 5.82587194442749,
      "learning_rate": 0.00019866131191432397,
      "loss": 10.24,
      "step": 6
    },
    {
      "epoch": 0.028112449799196786,
      "grad_norm": 5.521396160125732,
      "learning_rate": 0.00019839357429718877,
      "loss": 9.7617,
      "step": 7
    },
    {
      "epoch": 0.0321285140562249,
      "grad_norm": 5.55628776550293,
      "learning_rate": 0.00019812583668005356,
      "loss": 8.9588,
      "step": 8
    },
    {
      "epoch": 0.03614457831325301,
      "grad_norm": 4.77673864364624,
      "learning_rate": 0.00019785809906291835,
      "loss": 7.413,
      "step": 9
    },
    {
      "epoch": 0.040160642570281124,
      "grad_norm": 3.045475482940674,
      "learning_rate": 0.00019759036144578314,
      "loss": 8.4555,
      "step": 10
    },
    {
      "epoch": 0.04417670682730924,
      "grad_norm": 2.4188013076782227,
      "learning_rate": 0.0001973226238286479,
      "loss": 6.3816,
      "step": 11
    },
    {
      "epoch": 0.04819277108433735,
      "grad_norm": 2.483142852783203,
      "learning_rate": 0.00019705488621151273,
      "loss": 6.0486,
      "step": 12
    },
    {
      "epoch": 0.05220883534136546,
      "grad_norm": 2.7488200664520264,
      "learning_rate": 0.00019678714859437752,
      "loss": 6.0559,
      "step": 13
    },
    {
      "epoch": 0.05622489959839357,
      "grad_norm": 3.509127140045166,
      "learning_rate": 0.00019651941097724232,
      "loss": 6.5013,
      "step": 14
    },
    {
      "epoch": 0.060240963855421686,
      "grad_norm": 4.097210884094238,
      "learning_rate": 0.0001962516733601071,
      "loss": 6.6959,
      "step": 15
    },
    {
      "epoch": 0.0642570281124498,
      "grad_norm": 5.211580753326416,
      "learning_rate": 0.0001959839357429719,
      "loss": 7.4451,
      "step": 16
    },
    {
      "epoch": 0.06827309236947791,
      "grad_norm": 4.360202312469482,
      "learning_rate": 0.00019571619812583667,
      "loss": 7.5475,
      "step": 17
    },
    {
      "epoch": 0.07228915662650602,
      "grad_norm": 4.646812915802002,
      "learning_rate": 0.0001954484605087015,
      "loss": 5.9117,
      "step": 18
    },
    {
      "epoch": 0.07630522088353414,
      "grad_norm": 4.076641082763672,
      "learning_rate": 0.00019518072289156628,
      "loss": 6.5152,
      "step": 19
    },
    {
      "epoch": 0.08032128514056225,
      "grad_norm": 4.571013450622559,
      "learning_rate": 0.00019491298527443107,
      "loss": 7.7192,
      "step": 20
    },
    {
      "epoch": 0.08433734939759036,
      "grad_norm": 3.786604881286621,
      "learning_rate": 0.00019464524765729587,
      "loss": 6.0262,
      "step": 21
    },
    {
      "epoch": 0.08835341365461848,
      "grad_norm": 3.7632923126220703,
      "learning_rate": 0.00019437751004016066,
      "loss": 5.515,
      "step": 22
    },
    {
      "epoch": 0.09236947791164658,
      "grad_norm": 3.142625093460083,
      "learning_rate": 0.00019410977242302542,
      "loss": 5.5428,
      "step": 23
    },
    {
      "epoch": 0.0963855421686747,
      "grad_norm": 4.195131778717041,
      "learning_rate": 0.00019384203480589022,
      "loss": 5.0073,
      "step": 24
    },
    {
      "epoch": 0.10040160642570281,
      "grad_norm": 7.452038764953613,
      "learning_rate": 0.00019357429718875504,
      "loss": 5.6765,
      "step": 25
    },
    {
      "epoch": 0.10441767068273092,
      "grad_norm": 9.708063125610352,
      "learning_rate": 0.00019330655957161983,
      "loss": 5.6149,
      "step": 26
    },
    {
      "epoch": 0.10843373493975904,
      "grad_norm": 19.072011947631836,
      "learning_rate": 0.00019303882195448462,
      "loss": 5.4365,
      "step": 27
    },
    {
      "epoch": 0.11244979919678715,
      "grad_norm": 6.726373195648193,
      "learning_rate": 0.00019277108433734942,
      "loss": 4.858,
      "step": 28
    },
    {
      "epoch": 0.11646586345381527,
      "grad_norm": 3.187056064605713,
      "learning_rate": 0.0001925033467202142,
      "loss": 5.3406,
      "step": 29
    },
    {
      "epoch": 0.12048192771084337,
      "grad_norm": 3.364069700241089,
      "learning_rate": 0.00019223560910307897,
      "loss": 5.5143,
      "step": 30
    },
    {
      "epoch": 0.12449799196787148,
      "grad_norm": 2.4620518684387207,
      "learning_rate": 0.00019196787148594377,
      "loss": 4.638,
      "step": 31
    },
    {
      "epoch": 0.1285140562248996,
      "grad_norm": 3.9363696575164795,
      "learning_rate": 0.0001917001338688086,
      "loss": 4.6009,
      "step": 32
    },
    {
      "epoch": 0.13253012048192772,
      "grad_norm": 3.230189561843872,
      "learning_rate": 0.00019143239625167338,
      "loss": 4.7928,
      "step": 33
    },
    {
      "epoch": 0.13654618473895583,
      "grad_norm": 2.873898983001709,
      "learning_rate": 0.00019116465863453817,
      "loss": 3.7444,
      "step": 34
    },
    {
      "epoch": 0.14056224899598393,
      "grad_norm": 3.2136387825012207,
      "learning_rate": 0.00019089692101740297,
      "loss": 4.452,
      "step": 35
    },
    {
      "epoch": 0.14457831325301204,
      "grad_norm": 2.8411664962768555,
      "learning_rate": 0.00019062918340026773,
      "loss": 4.483,
      "step": 36
    },
    {
      "epoch": 0.14859437751004015,
      "grad_norm": 2.68854022026062,
      "learning_rate": 0.00019036144578313252,
      "loss": 3.92,
      "step": 37
    },
    {
      "epoch": 0.15261044176706828,
      "grad_norm": 3.324504852294922,
      "learning_rate": 0.00019009370816599734,
      "loss": 4.4238,
      "step": 38
    },
    {
      "epoch": 0.1566265060240964,
      "grad_norm": 3.0757510662078857,
      "learning_rate": 0.00018982597054886214,
      "loss": 4.0354,
      "step": 39
    },
    {
      "epoch": 0.1606425702811245,
      "grad_norm": 3.1478559970855713,
      "learning_rate": 0.00018955823293172693,
      "loss": 4.7587,
      "step": 40
    },
    {
      "epoch": 0.1646586345381526,
      "grad_norm": 2.923387050628662,
      "learning_rate": 0.00018929049531459172,
      "loss": 4.1713,
      "step": 41
    },
    {
      "epoch": 0.1686746987951807,
      "grad_norm": 3.3262710571289062,
      "learning_rate": 0.0001890227576974565,
      "loss": 5.7246,
      "step": 42
    },
    {
      "epoch": 0.17269076305220885,
      "grad_norm": 2.9940414428710938,
      "learning_rate": 0.00018875502008032128,
      "loss": 3.9502,
      "step": 43
    },
    {
      "epoch": 0.17670682730923695,
      "grad_norm": 2.4215221405029297,
      "learning_rate": 0.00018848728246318607,
      "loss": 3.3469,
      "step": 44
    },
    {
      "epoch": 0.18072289156626506,
      "grad_norm": 4.08881139755249,
      "learning_rate": 0.0001882195448460509,
      "loss": 3.6203,
      "step": 45
    },
    {
      "epoch": 0.18473895582329317,
      "grad_norm": 2.550448417663574,
      "learning_rate": 0.00018795180722891569,
      "loss": 3.9986,
      "step": 46
    },
    {
      "epoch": 0.18875502008032127,
      "grad_norm": 2.3286774158477783,
      "learning_rate": 0.00018768406961178048,
      "loss": 3.3749,
      "step": 47
    },
    {
      "epoch": 0.1927710843373494,
      "grad_norm": 2.724431276321411,
      "learning_rate": 0.00018741633199464524,
      "loss": 3.4734,
      "step": 48
    },
    {
      "epoch": 0.19678714859437751,
      "grad_norm": 2.961087226867676,
      "learning_rate": 0.00018714859437751004,
      "loss": 4.242,
      "step": 49
    },
    {
      "epoch": 0.20080321285140562,
      "grad_norm": 2.4245645999908447,
      "learning_rate": 0.00018688085676037483,
      "loss": 3.7956,
      "step": 50
    },
    {
      "epoch": 0.20481927710843373,
      "grad_norm": 2.141226053237915,
      "learning_rate": 0.00018661311914323962,
      "loss": 3.0041,
      "step": 51
    },
    {
      "epoch": 0.20883534136546184,
      "grad_norm": 2.7774155139923096,
      "learning_rate": 0.00018634538152610444,
      "loss": 3.5062,
      "step": 52
    },
    {
      "epoch": 0.21285140562248997,
      "grad_norm": 2.6332597732543945,
      "learning_rate": 0.00018607764390896924,
      "loss": 3.9305,
      "step": 53
    },
    {
      "epoch": 0.21686746987951808,
      "grad_norm": 3.4417197704315186,
      "learning_rate": 0.000185809906291834,
      "loss": 5.1481,
      "step": 54
    },
    {
      "epoch": 0.22088353413654618,
      "grad_norm": 2.576704978942871,
      "learning_rate": 0.0001855421686746988,
      "loss": 3.6137,
      "step": 55
    },
    {
      "epoch": 0.2248995983935743,
      "grad_norm": 2.816452980041504,
      "learning_rate": 0.0001852744310575636,
      "loss": 3.5015,
      "step": 56
    },
    {
      "epoch": 0.2289156626506024,
      "grad_norm": 3.5300023555755615,
      "learning_rate": 0.00018500669344042838,
      "loss": 4.7758,
      "step": 57
    },
    {
      "epoch": 0.23293172690763053,
      "grad_norm": 2.594787120819092,
      "learning_rate": 0.0001847389558232932,
      "loss": 4.0104,
      "step": 58
    },
    {
      "epoch": 0.23694779116465864,
      "grad_norm": 3.472842216491699,
      "learning_rate": 0.000184471218206158,
      "loss": 4.2051,
      "step": 59
    },
    {
      "epoch": 0.24096385542168675,
      "grad_norm": 2.195838212966919,
      "learning_rate": 0.00018420348058902276,
      "loss": 3.4561,
      "step": 60
    },
    {
      "epoch": 0.24497991967871485,
      "grad_norm": 2.6737020015716553,
      "learning_rate": 0.00018393574297188755,
      "loss": 5.4281,
      "step": 61
    },
    {
      "epoch": 0.24899598393574296,
      "grad_norm": 3.128307342529297,
      "learning_rate": 0.00018366800535475234,
      "loss": 4.835,
      "step": 62
    },
    {
      "epoch": 0.25301204819277107,
      "grad_norm": 2.8915627002716064,
      "learning_rate": 0.00018340026773761714,
      "loss": 5.6513,
      "step": 63
    },
    {
      "epoch": 0.2570281124497992,
      "grad_norm": 2.4325616359710693,
      "learning_rate": 0.00018313253012048193,
      "loss": 3.8769,
      "step": 64
    },
    {
      "epoch": 0.26104417670682734,
      "grad_norm": 2.717306613922119,
      "learning_rate": 0.00018286479250334675,
      "loss": 4.7258,
      "step": 65
    },
    {
      "epoch": 0.26506024096385544,
      "grad_norm": 2.6178746223449707,
      "learning_rate": 0.00018259705488621152,
      "loss": 4.0424,
      "step": 66
    },
    {
      "epoch": 0.26907630522088355,
      "grad_norm": 2.382551431655884,
      "learning_rate": 0.0001823293172690763,
      "loss": 3.547,
      "step": 67
    },
    {
      "epoch": 0.27309236947791166,
      "grad_norm": 2.546783685684204,
      "learning_rate": 0.0001820615796519411,
      "loss": 4.2495,
      "step": 68
    },
    {
      "epoch": 0.27710843373493976,
      "grad_norm": 2.4738221168518066,
      "learning_rate": 0.0001817938420348059,
      "loss": 3.69,
      "step": 69
    },
    {
      "epoch": 0.28112449799196787,
      "grad_norm": 2.2191786766052246,
      "learning_rate": 0.0001815261044176707,
      "loss": 3.1576,
      "step": 70
    },
    {
      "epoch": 0.285140562248996,
      "grad_norm": 2.4891932010650635,
      "learning_rate": 0.00018125836680053548,
      "loss": 3.7767,
      "step": 71
    },
    {
      "epoch": 0.2891566265060241,
      "grad_norm": 2.0602684020996094,
      "learning_rate": 0.00018099062918340027,
      "loss": 3.1497,
      "step": 72
    },
    {
      "epoch": 0.2931726907630522,
      "grad_norm": 2.435455560684204,
      "learning_rate": 0.00018072289156626507,
      "loss": 4.3061,
      "step": 73
    },
    {
      "epoch": 0.2971887550200803,
      "grad_norm": 2.7304036617279053,
      "learning_rate": 0.00018045515394912986,
      "loss": 3.6995,
      "step": 74
    },
    {
      "epoch": 0.30120481927710846,
      "grad_norm": 2.6375226974487305,
      "learning_rate": 0.00018018741633199465,
      "loss": 3.3922,
      "step": 75
    },
    {
      "epoch": 0.30522088353413657,
      "grad_norm": 2.097759246826172,
      "learning_rate": 0.00017991967871485944,
      "loss": 3.1887,
      "step": 76
    },
    {
      "epoch": 0.3092369477911647,
      "grad_norm": 2.600724458694458,
      "learning_rate": 0.00017965194109772424,
      "loss": 3.8532,
      "step": 77
    },
    {
      "epoch": 0.3132530120481928,
      "grad_norm": 3.0356369018554688,
      "learning_rate": 0.00017938420348058903,
      "loss": 4.6221,
      "step": 78
    },
    {
      "epoch": 0.3172690763052209,
      "grad_norm": 2.1509416103363037,
      "learning_rate": 0.00017911646586345382,
      "loss": 3.5473,
      "step": 79
    },
    {
      "epoch": 0.321285140562249,
      "grad_norm": 2.7542128562927246,
      "learning_rate": 0.00017884872824631862,
      "loss": 4.3206,
      "step": 80
    },
    {
      "epoch": 0.3253012048192771,
      "grad_norm": 2.7480881214141846,
      "learning_rate": 0.0001785809906291834,
      "loss": 3.4596,
      "step": 81
    },
    {
      "epoch": 0.3293172690763052,
      "grad_norm": 2.8787624835968018,
      "learning_rate": 0.0001783132530120482,
      "loss": 4.0409,
      "step": 82
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 2.234320878982544,
      "learning_rate": 0.000178045515394913,
      "loss": 3.6684,
      "step": 83
    },
    {
      "epoch": 0.3373493975903614,
      "grad_norm": 2.174452781677246,
      "learning_rate": 0.00017777777777777779,
      "loss": 3.8964,
      "step": 84
    },
    {
      "epoch": 0.3413654618473896,
      "grad_norm": 2.25730299949646,
      "learning_rate": 0.00017751004016064258,
      "loss": 3.3793,
      "step": 85
    },
    {
      "epoch": 0.3453815261044177,
      "grad_norm": 2.3120176792144775,
      "learning_rate": 0.00017724230254350737,
      "loss": 3.9183,
      "step": 86
    },
    {
      "epoch": 0.3493975903614458,
      "grad_norm": 2.696288824081421,
      "learning_rate": 0.00017697456492637216,
      "loss": 4.1063,
      "step": 87
    },
    {
      "epoch": 0.3534136546184739,
      "grad_norm": 3.9386634826660156,
      "learning_rate": 0.00017670682730923696,
      "loss": 4.599,
      "step": 88
    },
    {
      "epoch": 0.357429718875502,
      "grad_norm": 2.7136473655700684,
      "learning_rate": 0.00017643908969210175,
      "loss": 4.1535,
      "step": 89
    },
    {
      "epoch": 0.3614457831325301,
      "grad_norm": 2.4276645183563232,
      "learning_rate": 0.00017617135207496654,
      "loss": 4.4834,
      "step": 90
    },
    {
      "epoch": 0.3654618473895582,
      "grad_norm": 2.6002511978149414,
      "learning_rate": 0.00017590361445783134,
      "loss": 4.0748,
      "step": 91
    },
    {
      "epoch": 0.36947791164658633,
      "grad_norm": 2.682366132736206,
      "learning_rate": 0.00017563587684069613,
      "loss": 4.4142,
      "step": 92
    },
    {
      "epoch": 0.37349397590361444,
      "grad_norm": 2.108722686767578,
      "learning_rate": 0.00017536813922356092,
      "loss": 4.4304,
      "step": 93
    },
    {
      "epoch": 0.37751004016064255,
      "grad_norm": 2.0732803344726562,
      "learning_rate": 0.00017510040160642571,
      "loss": 3.2521,
      "step": 94
    },
    {
      "epoch": 0.3815261044176707,
      "grad_norm": 2.3038790225982666,
      "learning_rate": 0.0001748326639892905,
      "loss": 4.3167,
      "step": 95
    },
    {
      "epoch": 0.3855421686746988,
      "grad_norm": 2.623572587966919,
      "learning_rate": 0.0001745649263721553,
      "loss": 5.3465,
      "step": 96
    },
    {
      "epoch": 0.3895582329317269,
      "grad_norm": 2.4543046951293945,
      "learning_rate": 0.0001742971887550201,
      "loss": 3.4479,
      "step": 97
    },
    {
      "epoch": 0.39357429718875503,
      "grad_norm": 2.291369915008545,
      "learning_rate": 0.00017402945113788489,
      "loss": 4.0893,
      "step": 98
    },
    {
      "epoch": 0.39759036144578314,
      "grad_norm": 2.4371914863586426,
      "learning_rate": 0.00017376171352074968,
      "loss": 3.7132,
      "step": 99
    },
    {
      "epoch": 0.40160642570281124,
      "grad_norm": 2.1401989459991455,
      "learning_rate": 0.00017349397590361447,
      "loss": 2.9892,
      "step": 100
    },
    {
      "epoch": 0.40562248995983935,
      "grad_norm": 2.1574857234954834,
      "learning_rate": 0.00017322623828647926,
      "loss": 3.3145,
      "step": 101
    },
    {
      "epoch": 0.40963855421686746,
      "grad_norm": 2.7298076152801514,
      "learning_rate": 0.00017295850066934406,
      "loss": 4.2365,
      "step": 102
    },
    {
      "epoch": 0.41365461847389556,
      "grad_norm": 2.5634846687316895,
      "learning_rate": 0.00017269076305220885,
      "loss": 3.4466,
      "step": 103
    },
    {
      "epoch": 0.41767068273092367,
      "grad_norm": 2.573195695877075,
      "learning_rate": 0.00017242302543507362,
      "loss": 3.3283,
      "step": 104
    },
    {
      "epoch": 0.42168674698795183,
      "grad_norm": 2.205293655395508,
      "learning_rate": 0.00017215528781793844,
      "loss": 3.7288,
      "step": 105
    },
    {
      "epoch": 0.42570281124497994,
      "grad_norm": 3.3177073001861572,
      "learning_rate": 0.00017188755020080323,
      "loss": 3.9341,
      "step": 106
    },
    {
      "epoch": 0.42971887550200805,
      "grad_norm": 2.601710557937622,
      "learning_rate": 0.00017161981258366802,
      "loss": 4.3724,
      "step": 107
    },
    {
      "epoch": 0.43373493975903615,
      "grad_norm": 2.490556478500366,
      "learning_rate": 0.00017135207496653281,
      "loss": 3.0784,
      "step": 108
    },
    {
      "epoch": 0.43775100401606426,
      "grad_norm": 2.7771122455596924,
      "learning_rate": 0.0001710843373493976,
      "loss": 3.7125,
      "step": 109
    },
    {
      "epoch": 0.44176706827309237,
      "grad_norm": 2.9865031242370605,
      "learning_rate": 0.00017081659973226237,
      "loss": 4.9747,
      "step": 110
    },
    {
      "epoch": 0.4457831325301205,
      "grad_norm": 3.2922353744506836,
      "learning_rate": 0.00017054886211512717,
      "loss": 4.229,
      "step": 111
    },
    {
      "epoch": 0.4497991967871486,
      "grad_norm": 2.2360899448394775,
      "learning_rate": 0.00017028112449799199,
      "loss": 3.1859,
      "step": 112
    },
    {
      "epoch": 0.4538152610441767,
      "grad_norm": 2.4282941818237305,
      "learning_rate": 0.00017001338688085678,
      "loss": 4.4577,
      "step": 113
    },
    {
      "epoch": 0.4578313253012048,
      "grad_norm": 2.2384181022644043,
      "learning_rate": 0.00016974564926372157,
      "loss": 3.435,
      "step": 114
    },
    {
      "epoch": 0.46184738955823296,
      "grad_norm": 2.586678981781006,
      "learning_rate": 0.00016947791164658636,
      "loss": 3.7974,
      "step": 115
    },
    {
      "epoch": 0.46586345381526106,
      "grad_norm": 2.2473366260528564,
      "learning_rate": 0.00016921017402945113,
      "loss": 3.2193,
      "step": 116
    },
    {
      "epoch": 0.46987951807228917,
      "grad_norm": 2.2137515544891357,
      "learning_rate": 0.00016894243641231592,
      "loss": 3.2774,
      "step": 117
    },
    {
      "epoch": 0.4738955823293173,
      "grad_norm": 2.6827173233032227,
      "learning_rate": 0.00016867469879518074,
      "loss": 3.843,
      "step": 118
    },
    {
      "epoch": 0.4779116465863454,
      "grad_norm": 2.499166250228882,
      "learning_rate": 0.00016840696117804553,
      "loss": 3.1818,
      "step": 119
    },
    {
      "epoch": 0.4819277108433735,
      "grad_norm": 2.609964609146118,
      "learning_rate": 0.00016813922356091033,
      "loss": 3.6292,
      "step": 120
    },
    {
      "epoch": 0.4859437751004016,
      "grad_norm": 2.697786808013916,
      "learning_rate": 0.00016787148594377512,
      "loss": 3.7501,
      "step": 121
    },
    {
      "epoch": 0.4899598393574297,
      "grad_norm": 2.834494113922119,
      "learning_rate": 0.00016760374832663989,
      "loss": 3.9265,
      "step": 122
    },
    {
      "epoch": 0.4939759036144578,
      "grad_norm": 2.3431777954101562,
      "learning_rate": 0.00016733601070950468,
      "loss": 3.7916,
      "step": 123
    },
    {
      "epoch": 0.4979919678714859,
      "grad_norm": 2.434953212738037,
      "learning_rate": 0.00016706827309236947,
      "loss": 3.4279,
      "step": 124
    },
    {
      "epoch": 0.5020080321285141,
      "grad_norm": 2.3629250526428223,
      "learning_rate": 0.0001668005354752343,
      "loss": 3.4382,
      "step": 125
    },
    {
      "epoch": 0.5060240963855421,
      "grad_norm": 2.7543423175811768,
      "learning_rate": 0.00016653279785809908,
      "loss": 4.8146,
      "step": 126
    },
    {
      "epoch": 0.5100401606425703,
      "grad_norm": 3.149775981903076,
      "learning_rate": 0.00016626506024096388,
      "loss": 5.365,
      "step": 127
    },
    {
      "epoch": 0.5140562248995983,
      "grad_norm": 2.640326499938965,
      "learning_rate": 0.00016599732262382864,
      "loss": 4.2036,
      "step": 128
    },
    {
      "epoch": 0.5180722891566265,
      "grad_norm": 2.6297357082366943,
      "learning_rate": 0.00016572958500669344,
      "loss": 3.7331,
      "step": 129
    },
    {
      "epoch": 0.5220883534136547,
      "grad_norm": 2.9165263175964355,
      "learning_rate": 0.00016546184738955823,
      "loss": 4.2224,
      "step": 130
    },
    {
      "epoch": 0.5261044176706827,
      "grad_norm": 2.003908634185791,
      "learning_rate": 0.00016519410977242302,
      "loss": 3.5818,
      "step": 131
    },
    {
      "epoch": 0.5301204819277109,
      "grad_norm": 2.3137078285217285,
      "learning_rate": 0.00016492637215528784,
      "loss": 3.4726,
      "step": 132
    },
    {
      "epoch": 0.5341365461847389,
      "grad_norm": 2.69950795173645,
      "learning_rate": 0.00016465863453815263,
      "loss": 4.0059,
      "step": 133
    },
    {
      "epoch": 0.5381526104417671,
      "grad_norm": 2.1858394145965576,
      "learning_rate": 0.0001643908969210174,
      "loss": 3.6957,
      "step": 134
    },
    {
      "epoch": 0.5421686746987951,
      "grad_norm": 2.423802137374878,
      "learning_rate": 0.0001641231593038822,
      "loss": 4.1535,
      "step": 135
    },
    {
      "epoch": 0.5461847389558233,
      "grad_norm": 2.244253158569336,
      "learning_rate": 0.00016385542168674699,
      "loss": 3.3276,
      "step": 136
    },
    {
      "epoch": 0.5502008032128514,
      "grad_norm": 2.2932465076446533,
      "learning_rate": 0.00016358768406961178,
      "loss": 3.6498,
      "step": 137
    },
    {
      "epoch": 0.5542168674698795,
      "grad_norm": 2.0782933235168457,
      "learning_rate": 0.0001633199464524766,
      "loss": 4.007,
      "step": 138
    },
    {
      "epoch": 0.5582329317269076,
      "grad_norm": 2.778797149658203,
      "learning_rate": 0.0001630522088353414,
      "loss": 3.8436,
      "step": 139
    },
    {
      "epoch": 0.5622489959839357,
      "grad_norm": 2.7823002338409424,
      "learning_rate": 0.00016278447121820616,
      "loss": 5.5985,
      "step": 140
    },
    {
      "epoch": 0.5662650602409639,
      "grad_norm": 3.124753475189209,
      "learning_rate": 0.00016251673360107095,
      "loss": 3.8402,
      "step": 141
    },
    {
      "epoch": 0.570281124497992,
      "grad_norm": 2.999889612197876,
      "learning_rate": 0.00016224899598393574,
      "loss": 4.8463,
      "step": 142
    },
    {
      "epoch": 0.5742971887550201,
      "grad_norm": 2.2176406383514404,
      "learning_rate": 0.00016198125836680054,
      "loss": 3.6488,
      "step": 143
    },
    {
      "epoch": 0.5783132530120482,
      "grad_norm": 2.334336757659912,
      "learning_rate": 0.00016171352074966533,
      "loss": 3.4351,
      "step": 144
    },
    {
      "epoch": 0.5823293172690763,
      "grad_norm": 2.1625120639801025,
      "learning_rate": 0.00016144578313253015,
      "loss": 3.4423,
      "step": 145
    },
    {
      "epoch": 0.5863453815261044,
      "grad_norm": 2.3950042724609375,
      "learning_rate": 0.00016117804551539491,
      "loss": 3.4302,
      "step": 146
    },
    {
      "epoch": 0.5903614457831325,
      "grad_norm": 1.968996524810791,
      "learning_rate": 0.0001609103078982597,
      "loss": 3.3924,
      "step": 147
    },
    {
      "epoch": 0.5943775100401606,
      "grad_norm": 2.259298801422119,
      "learning_rate": 0.0001606425702811245,
      "loss": 3.4544,
      "step": 148
    },
    {
      "epoch": 0.5983935742971888,
      "grad_norm": 2.5227410793304443,
      "learning_rate": 0.0001603748326639893,
      "loss": 3.6276,
      "step": 149
    },
    {
      "epoch": 0.6024096385542169,
      "grad_norm": 2.4112424850463867,
      "learning_rate": 0.00016010709504685409,
      "loss": 3.8806,
      "step": 150
    },
    {
      "epoch": 0.606425702811245,
      "grad_norm": 2.5478017330169678,
      "learning_rate": 0.00015983935742971888,
      "loss": 4.1461,
      "step": 151
    },
    {
      "epoch": 0.6104417670682731,
      "grad_norm": 2.832744836807251,
      "learning_rate": 0.00015957161981258367,
      "loss": 5.0162,
      "step": 152
    },
    {
      "epoch": 0.6144578313253012,
      "grad_norm": 2.7249608039855957,
      "learning_rate": 0.00015930388219544846,
      "loss": 3.2521,
      "step": 153
    },
    {
      "epoch": 0.6184738955823293,
      "grad_norm": 2.579235315322876,
      "learning_rate": 0.00015903614457831326,
      "loss": 4.0444,
      "step": 154
    },
    {
      "epoch": 0.6224899598393574,
      "grad_norm": 2.719031572341919,
      "learning_rate": 0.00015876840696117805,
      "loss": 3.8091,
      "step": 155
    },
    {
      "epoch": 0.6265060240963856,
      "grad_norm": 2.9060187339782715,
      "learning_rate": 0.00015850066934404284,
      "loss": 3.574,
      "step": 156
    },
    {
      "epoch": 0.6305220883534136,
      "grad_norm": 2.3890836238861084,
      "learning_rate": 0.00015823293172690763,
      "loss": 3.0126,
      "step": 157
    },
    {
      "epoch": 0.6345381526104418,
      "grad_norm": 2.4875965118408203,
      "learning_rate": 0.00015796519410977243,
      "loss": 3.8722,
      "step": 158
    },
    {
      "epoch": 0.6385542168674698,
      "grad_norm": 2.452133893966675,
      "learning_rate": 0.00015769745649263722,
      "loss": 3.1996,
      "step": 159
    },
    {
      "epoch": 0.642570281124498,
      "grad_norm": 2.644927740097046,
      "learning_rate": 0.000157429718875502,
      "loss": 4.5955,
      "step": 160
    },
    {
      "epoch": 0.6465863453815262,
      "grad_norm": 2.4523508548736572,
      "learning_rate": 0.0001571619812583668,
      "loss": 3.3654,
      "step": 161
    },
    {
      "epoch": 0.6506024096385542,
      "grad_norm": 2.5598349571228027,
      "learning_rate": 0.0001568942436412316,
      "loss": 3.0078,
      "step": 162
    },
    {
      "epoch": 0.6546184738955824,
      "grad_norm": 3.0518641471862793,
      "learning_rate": 0.0001566265060240964,
      "loss": 4.5464,
      "step": 163
    },
    {
      "epoch": 0.6586345381526104,
      "grad_norm": 2.8101203441619873,
      "learning_rate": 0.00015635876840696118,
      "loss": 3.4404,
      "step": 164
    },
    {
      "epoch": 0.6626506024096386,
      "grad_norm": 2.7174525260925293,
      "learning_rate": 0.00015609103078982598,
      "loss": 3.6615,
      "step": 165
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 2.620638608932495,
      "learning_rate": 0.00015582329317269077,
      "loss": 3.448,
      "step": 166
    },
    {
      "epoch": 0.6706827309236948,
      "grad_norm": 2.9395246505737305,
      "learning_rate": 0.00015555555555555556,
      "loss": 3.6454,
      "step": 167
    },
    {
      "epoch": 0.6746987951807228,
      "grad_norm": 3.050710916519165,
      "learning_rate": 0.00015528781793842036,
      "loss": 4.0765,
      "step": 168
    },
    {
      "epoch": 0.678714859437751,
      "grad_norm": 2.2552433013916016,
      "learning_rate": 0.00015502008032128515,
      "loss": 3.1558,
      "step": 169
    },
    {
      "epoch": 0.6827309236947792,
      "grad_norm": 2.1489574909210205,
      "learning_rate": 0.00015475234270414994,
      "loss": 4.2047,
      "step": 170
    },
    {
      "epoch": 0.6867469879518072,
      "grad_norm": 2.172776937484741,
      "learning_rate": 0.00015448460508701473,
      "loss": 3.4285,
      "step": 171
    },
    {
      "epoch": 0.6907630522088354,
      "grad_norm": 2.1401731967926025,
      "learning_rate": 0.00015421686746987953,
      "loss": 3.2497,
      "step": 172
    },
    {
      "epoch": 0.6947791164658634,
      "grad_norm": 2.7701947689056396,
      "learning_rate": 0.00015394912985274432,
      "loss": 3.9331,
      "step": 173
    },
    {
      "epoch": 0.6987951807228916,
      "grad_norm": 2.319415330886841,
      "learning_rate": 0.0001536813922356091,
      "loss": 3.176,
      "step": 174
    },
    {
      "epoch": 0.7028112449799196,
      "grad_norm": 2.428131341934204,
      "learning_rate": 0.0001534136546184739,
      "loss": 3.1192,
      "step": 175
    },
    {
      "epoch": 0.7068273092369478,
      "grad_norm": 2.135892868041992,
      "learning_rate": 0.0001531459170013387,
      "loss": 3.0222,
      "step": 176
    },
    {
      "epoch": 0.7108433734939759,
      "grad_norm": 2.7550647258758545,
      "learning_rate": 0.0001528781793842035,
      "loss": 4.6775,
      "step": 177
    },
    {
      "epoch": 0.714859437751004,
      "grad_norm": 2.2021191120147705,
      "learning_rate": 0.00015261044176706828,
      "loss": 2.7476,
      "step": 178
    },
    {
      "epoch": 0.7188755020080321,
      "grad_norm": 2.686431407928467,
      "learning_rate": 0.00015234270414993308,
      "loss": 4.1621,
      "step": 179
    },
    {
      "epoch": 0.7228915662650602,
      "grad_norm": 2.827143669128418,
      "learning_rate": 0.00015207496653279787,
      "loss": 4.4613,
      "step": 180
    },
    {
      "epoch": 0.7269076305220884,
      "grad_norm": 3.090308904647827,
      "learning_rate": 0.00015180722891566266,
      "loss": 4.6863,
      "step": 181
    },
    {
      "epoch": 0.7309236947791165,
      "grad_norm": 2.492013454437256,
      "learning_rate": 0.00015153949129852746,
      "loss": 3.2319,
      "step": 182
    },
    {
      "epoch": 0.7349397590361446,
      "grad_norm": 2.6304264068603516,
      "learning_rate": 0.00015127175368139225,
      "loss": 3.3099,
      "step": 183
    },
    {
      "epoch": 0.7389558232931727,
      "grad_norm": 2.270024299621582,
      "learning_rate": 0.00015100401606425701,
      "loss": 3.8332,
      "step": 184
    },
    {
      "epoch": 0.7429718875502008,
      "grad_norm": 2.2107675075531006,
      "learning_rate": 0.00015073627844712183,
      "loss": 3.4966,
      "step": 185
    },
    {
      "epoch": 0.7469879518072289,
      "grad_norm": 1.804654598236084,
      "learning_rate": 0.00015046854082998663,
      "loss": 2.7441,
      "step": 186
    },
    {
      "epoch": 0.751004016064257,
      "grad_norm": 2.8919899463653564,
      "learning_rate": 0.00015020080321285142,
      "loss": 3.7274,
      "step": 187
    },
    {
      "epoch": 0.7550200803212851,
      "grad_norm": 2.4757237434387207,
      "learning_rate": 0.0001499330655957162,
      "loss": 3.6959,
      "step": 188
    },
    {
      "epoch": 0.7590361445783133,
      "grad_norm": 2.037745952606201,
      "learning_rate": 0.000149665327978581,
      "loss": 3.0673,
      "step": 189
    },
    {
      "epoch": 0.7630522088353414,
      "grad_norm": 2.479806423187256,
      "learning_rate": 0.00014939759036144577,
      "loss": 3.5497,
      "step": 190
    },
    {
      "epoch": 0.7670682730923695,
      "grad_norm": 2.532616138458252,
      "learning_rate": 0.00014912985274431056,
      "loss": 4.4538,
      "step": 191
    },
    {
      "epoch": 0.7710843373493976,
      "grad_norm": 2.2965128421783447,
      "learning_rate": 0.00014886211512717538,
      "loss": 3.8924,
      "step": 192
    },
    {
      "epoch": 0.7751004016064257,
      "grad_norm": 2.569096088409424,
      "learning_rate": 0.00014859437751004018,
      "loss": 4.3112,
      "step": 193
    },
    {
      "epoch": 0.7791164658634538,
      "grad_norm": 2.3299782276153564,
      "learning_rate": 0.00014832663989290497,
      "loss": 3.4171,
      "step": 194
    },
    {
      "epoch": 0.7831325301204819,
      "grad_norm": 2.4750306606292725,
      "learning_rate": 0.00014805890227576976,
      "loss": 4.2418,
      "step": 195
    },
    {
      "epoch": 0.7871485943775101,
      "grad_norm": 2.34830904006958,
      "learning_rate": 0.00014779116465863453,
      "loss": 4.7654,
      "step": 196
    },
    {
      "epoch": 0.7911646586345381,
      "grad_norm": 2.3084421157836914,
      "learning_rate": 0.00014752342704149932,
      "loss": 3.5955,
      "step": 197
    },
    {
      "epoch": 0.7951807228915663,
      "grad_norm": 2.088836431503296,
      "learning_rate": 0.00014725568942436414,
      "loss": 3.4426,
      "step": 198
    },
    {
      "epoch": 0.7991967871485943,
      "grad_norm": 2.387511968612671,
      "learning_rate": 0.00014698795180722893,
      "loss": 3.4799,
      "step": 199
    },
    {
      "epoch": 0.8032128514056225,
      "grad_norm": 2.173638343811035,
      "learning_rate": 0.00014672021419009373,
      "loss": 3.1073,
      "step": 200
    },
    {
      "epoch": 0.8072289156626506,
      "grad_norm": 2.4268410205841064,
      "learning_rate": 0.00014645247657295852,
      "loss": 3.895,
      "step": 201
    },
    {
      "epoch": 0.8112449799196787,
      "grad_norm": 2.298238515853882,
      "learning_rate": 0.00014618473895582328,
      "loss": 3.1374,
      "step": 202
    },
    {
      "epoch": 0.8152610441767069,
      "grad_norm": 2.5447280406951904,
      "learning_rate": 0.00014591700133868808,
      "loss": 4.201,
      "step": 203
    },
    {
      "epoch": 0.8192771084337349,
      "grad_norm": 2.2700531482696533,
      "learning_rate": 0.00014564926372155287,
      "loss": 3.3756,
      "step": 204
    },
    {
      "epoch": 0.8232931726907631,
      "grad_norm": 2.2147793769836426,
      "learning_rate": 0.0001453815261044177,
      "loss": 2.8677,
      "step": 205
    },
    {
      "epoch": 0.8273092369477911,
      "grad_norm": 2.820615768432617,
      "learning_rate": 0.00014511378848728248,
      "loss": 3.8278,
      "step": 206
    },
    {
      "epoch": 0.8313253012048193,
      "grad_norm": 2.214066743850708,
      "learning_rate": 0.00014484605087014728,
      "loss": 2.8015,
      "step": 207
    },
    {
      "epoch": 0.8353413654618473,
      "grad_norm": 2.7223362922668457,
      "learning_rate": 0.00014457831325301204,
      "loss": 4.5482,
      "step": 208
    },
    {
      "epoch": 0.8393574297188755,
      "grad_norm": 2.6131458282470703,
      "learning_rate": 0.00014431057563587683,
      "loss": 3.258,
      "step": 209
    },
    {
      "epoch": 0.8433734939759037,
      "grad_norm": 2.378821611404419,
      "learning_rate": 0.00014404283801874163,
      "loss": 3.4395,
      "step": 210
    },
    {
      "epoch": 0.8473895582329317,
      "grad_norm": 2.5394039154052734,
      "learning_rate": 0.00014377510040160642,
      "loss": 3.5583,
      "step": 211
    },
    {
      "epoch": 0.8514056224899599,
      "grad_norm": 2.8768603801727295,
      "learning_rate": 0.00014350736278447124,
      "loss": 4.1826,
      "step": 212
    },
    {
      "epoch": 0.8554216867469879,
      "grad_norm": 2.325242757797241,
      "learning_rate": 0.00014323962516733603,
      "loss": 3.2996,
      "step": 213
    },
    {
      "epoch": 0.8594377510040161,
      "grad_norm": 2.847722053527832,
      "learning_rate": 0.0001429718875502008,
      "loss": 3.7535,
      "step": 214
    },
    {
      "epoch": 0.8634538152610441,
      "grad_norm": 2.3787224292755127,
      "learning_rate": 0.0001427041499330656,
      "loss": 2.989,
      "step": 215
    },
    {
      "epoch": 0.8674698795180723,
      "grad_norm": 2.3759453296661377,
      "learning_rate": 0.00014243641231593038,
      "loss": 3.2181,
      "step": 216
    },
    {
      "epoch": 0.8714859437751004,
      "grad_norm": 2.48319411277771,
      "learning_rate": 0.00014216867469879518,
      "loss": 4.0624,
      "step": 217
    },
    {
      "epoch": 0.8755020080321285,
      "grad_norm": 2.75231671333313,
      "learning_rate": 0.00014190093708166,
      "loss": 4.2616,
      "step": 218
    },
    {
      "epoch": 0.8795180722891566,
      "grad_norm": 2.165195941925049,
      "learning_rate": 0.0001416331994645248,
      "loss": 2.773,
      "step": 219
    },
    {
      "epoch": 0.8835341365461847,
      "grad_norm": 2.9390523433685303,
      "learning_rate": 0.00014136546184738956,
      "loss": 5.3133,
      "step": 220
    },
    {
      "epoch": 0.8875502008032129,
      "grad_norm": 2.4109458923339844,
      "learning_rate": 0.00014109772423025435,
      "loss": 3.8292,
      "step": 221
    },
    {
      "epoch": 0.891566265060241,
      "grad_norm": 2.5037901401519775,
      "learning_rate": 0.00014082998661311914,
      "loss": 4.0122,
      "step": 222
    },
    {
      "epoch": 0.8955823293172691,
      "grad_norm": 2.985944986343384,
      "learning_rate": 0.00014056224899598393,
      "loss": 3.7539,
      "step": 223
    },
    {
      "epoch": 0.8995983935742972,
      "grad_norm": 2.2456915378570557,
      "learning_rate": 0.00014029451137884873,
      "loss": 3.4707,
      "step": 224
    },
    {
      "epoch": 0.9036144578313253,
      "grad_norm": 2.0935449600219727,
      "learning_rate": 0.00014002677376171355,
      "loss": 2.7515,
      "step": 225
    },
    {
      "epoch": 0.9076305220883534,
      "grad_norm": 2.4609766006469727,
      "learning_rate": 0.00013975903614457834,
      "loss": 3.8227,
      "step": 226
    },
    {
      "epoch": 0.9116465863453815,
      "grad_norm": 2.2097980976104736,
      "learning_rate": 0.0001394912985274431,
      "loss": 3.2733,
      "step": 227
    },
    {
      "epoch": 0.9156626506024096,
      "grad_norm": 2.0642688274383545,
      "learning_rate": 0.0001392235609103079,
      "loss": 3.0938,
      "step": 228
    },
    {
      "epoch": 0.9196787148594378,
      "grad_norm": 2.3710100650787354,
      "learning_rate": 0.0001389558232931727,
      "loss": 4.2002,
      "step": 229
    },
    {
      "epoch": 0.9236947791164659,
      "grad_norm": 2.6360647678375244,
      "learning_rate": 0.00013868808567603748,
      "loss": 3.8326,
      "step": 230
    },
    {
      "epoch": 0.927710843373494,
      "grad_norm": 2.2522687911987305,
      "learning_rate": 0.00013842034805890228,
      "loss": 4.0576,
      "step": 231
    },
    {
      "epoch": 0.9317269076305221,
      "grad_norm": 2.3965373039245605,
      "learning_rate": 0.0001381526104417671,
      "loss": 2.551,
      "step": 232
    },
    {
      "epoch": 0.9357429718875502,
      "grad_norm": 2.160850763320923,
      "learning_rate": 0.00013788487282463186,
      "loss": 3.0346,
      "step": 233
    },
    {
      "epoch": 0.9397590361445783,
      "grad_norm": 2.7340362071990967,
      "learning_rate": 0.00013761713520749665,
      "loss": 3.8792,
      "step": 234
    },
    {
      "epoch": 0.9437751004016064,
      "grad_norm": 2.373431921005249,
      "learning_rate": 0.00013734939759036145,
      "loss": 3.4563,
      "step": 235
    },
    {
      "epoch": 0.9477911646586346,
      "grad_norm": 2.887669801712036,
      "learning_rate": 0.00013708165997322624,
      "loss": 3.4205,
      "step": 236
    },
    {
      "epoch": 0.9518072289156626,
      "grad_norm": 2.47088360786438,
      "learning_rate": 0.00013681392235609103,
      "loss": 3.7738,
      "step": 237
    },
    {
      "epoch": 0.9558232931726908,
      "grad_norm": 2.7040438652038574,
      "learning_rate": 0.00013654618473895585,
      "loss": 3.5389,
      "step": 238
    },
    {
      "epoch": 0.9598393574297188,
      "grad_norm": 2.2656071186065674,
      "learning_rate": 0.00013627844712182062,
      "loss": 2.5192,
      "step": 239
    },
    {
      "epoch": 0.963855421686747,
      "grad_norm": 2.0689640045166016,
      "learning_rate": 0.0001360107095046854,
      "loss": 3.2038,
      "step": 240
    },
    {
      "epoch": 0.9678714859437751,
      "grad_norm": 2.456049680709839,
      "learning_rate": 0.0001357429718875502,
      "loss": 3.3779,
      "step": 241
    },
    {
      "epoch": 0.9718875502008032,
      "grad_norm": 3.6520512104034424,
      "learning_rate": 0.000135475234270415,
      "loss": 6.3828,
      "step": 242
    },
    {
      "epoch": 0.9759036144578314,
      "grad_norm": 2.9019930362701416,
      "learning_rate": 0.0001352074966532798,
      "loss": 4.4033,
      "step": 243
    },
    {
      "epoch": 0.9799196787148594,
      "grad_norm": 2.688805103302002,
      "learning_rate": 0.00013493975903614458,
      "loss": 3.7718,
      "step": 244
    },
    {
      "epoch": 0.9839357429718876,
      "grad_norm": 2.3583173751831055,
      "learning_rate": 0.00013467202141900938,
      "loss": 2.8558,
      "step": 245
    },
    {
      "epoch": 0.9879518072289156,
      "grad_norm": 2.2991857528686523,
      "learning_rate": 0.00013440428380187417,
      "loss": 3.3544,
      "step": 246
    },
    {
      "epoch": 0.9919678714859438,
      "grad_norm": 2.3462352752685547,
      "learning_rate": 0.00013413654618473896,
      "loss": 3.4804,
      "step": 247
    },
    {
      "epoch": 0.9959839357429718,
      "grad_norm": 2.375304698944092,
      "learning_rate": 0.00013386880856760375,
      "loss": 3.9284,
      "step": 248
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.3574721813201904,
      "learning_rate": 0.00013360107095046855,
      "loss": 3.5948,
      "step": 249
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.906198263168335,
      "eval_runtime": 202.0311,
      "eval_samples_per_second": 2.47,
      "eval_steps_per_second": 1.237,
      "step": 249
    },
    {
      "epoch": 1.0040160642570282,
      "grad_norm": 2.329230546951294,
      "learning_rate": 0.00013333333333333334,
      "loss": 3.8794,
      "step": 250
    },
    {
      "epoch": 1.0080321285140563,
      "grad_norm": 2.304131507873535,
      "learning_rate": 0.00013306559571619813,
      "loss": 2.618,
      "step": 251
    },
    {
      "epoch": 1.0120481927710843,
      "grad_norm": 2.258854389190674,
      "learning_rate": 0.00013279785809906293,
      "loss": 4.5112,
      "step": 252
    },
    {
      "epoch": 1.0160642570281124,
      "grad_norm": 1.9307198524475098,
      "learning_rate": 0.00013253012048192772,
      "loss": 2.8023,
      "step": 253
    },
    {
      "epoch": 1.0200803212851406,
      "grad_norm": 2.070939540863037,
      "learning_rate": 0.0001322623828647925,
      "loss": 2.9067,
      "step": 254
    },
    {
      "epoch": 1.0240963855421688,
      "grad_norm": 2.1403632164001465,
      "learning_rate": 0.0001319946452476573,
      "loss": 3.0498,
      "step": 255
    },
    {
      "epoch": 1.0281124497991967,
      "grad_norm": 1.9982527494430542,
      "learning_rate": 0.0001317269076305221,
      "loss": 2.7652,
      "step": 256
    },
    {
      "epoch": 1.0321285140562249,
      "grad_norm": 2.3440232276916504,
      "learning_rate": 0.0001314591700133869,
      "loss": 3.8854,
      "step": 257
    },
    {
      "epoch": 1.036144578313253,
      "grad_norm": 2.3406286239624023,
      "learning_rate": 0.00013119143239625168,
      "loss": 2.9114,
      "step": 258
    },
    {
      "epoch": 1.0401606425702812,
      "grad_norm": 2.673793077468872,
      "learning_rate": 0.00013092369477911648,
      "loss": 3.0531,
      "step": 259
    },
    {
      "epoch": 1.0441767068273093,
      "grad_norm": 2.2808480262756348,
      "learning_rate": 0.00013065595716198127,
      "loss": 2.9484,
      "step": 260
    },
    {
      "epoch": 1.0481927710843373,
      "grad_norm": 2.513705253601074,
      "learning_rate": 0.00013038821954484606,
      "loss": 2.6625,
      "step": 261
    },
    {
      "epoch": 1.0522088353413654,
      "grad_norm": 2.7780377864837646,
      "learning_rate": 0.00013012048192771085,
      "loss": 3.1793,
      "step": 262
    },
    {
      "epoch": 1.0562248995983936,
      "grad_norm": 2.522724151611328,
      "learning_rate": 0.00012985274431057565,
      "loss": 3.1926,
      "step": 263
    },
    {
      "epoch": 1.0602409638554218,
      "grad_norm": 3.2487499713897705,
      "learning_rate": 0.0001295850066934404,
      "loss": 3.9779,
      "step": 264
    },
    {
      "epoch": 1.0642570281124497,
      "grad_norm": 2.4341378211975098,
      "learning_rate": 0.00012931726907630523,
      "loss": 2.9064,
      "step": 265
    },
    {
      "epoch": 1.0682730923694779,
      "grad_norm": 2.5539276599884033,
      "learning_rate": 0.00012904953145917002,
      "loss": 3.4219,
      "step": 266
    },
    {
      "epoch": 1.072289156626506,
      "grad_norm": 2.0425596237182617,
      "learning_rate": 0.00012878179384203482,
      "loss": 2.5395,
      "step": 267
    },
    {
      "epoch": 1.0763052208835342,
      "grad_norm": 2.3625378608703613,
      "learning_rate": 0.0001285140562248996,
      "loss": 2.757,
      "step": 268
    },
    {
      "epoch": 1.0803212851405624,
      "grad_norm": 2.0414483547210693,
      "learning_rate": 0.0001282463186077644,
      "loss": 2.7764,
      "step": 269
    },
    {
      "epoch": 1.0843373493975903,
      "grad_norm": 3.544743061065674,
      "learning_rate": 0.00012797858099062917,
      "loss": 3.6176,
      "step": 270
    },
    {
      "epoch": 1.0883534136546185,
      "grad_norm": 2.4814655780792236,
      "learning_rate": 0.00012771084337349396,
      "loss": 3.2284,
      "step": 271
    },
    {
      "epoch": 1.0923694779116466,
      "grad_norm": 2.364025592803955,
      "learning_rate": 0.00012744310575635878,
      "loss": 3.6178,
      "step": 272
    },
    {
      "epoch": 1.0963855421686748,
      "grad_norm": 1.989912748336792,
      "learning_rate": 0.00012717536813922357,
      "loss": 2.5839,
      "step": 273
    },
    {
      "epoch": 1.1004016064257027,
      "grad_norm": 2.413421154022217,
      "learning_rate": 0.00012690763052208837,
      "loss": 3.5416,
      "step": 274
    },
    {
      "epoch": 1.104417670682731,
      "grad_norm": 2.679314613342285,
      "learning_rate": 0.00012663989290495316,
      "loss": 3.0015,
      "step": 275
    },
    {
      "epoch": 1.108433734939759,
      "grad_norm": 2.2354209423065186,
      "learning_rate": 0.00012637215528781793,
      "loss": 3.3867,
      "step": 276
    },
    {
      "epoch": 1.1124497991967872,
      "grad_norm": 2.4003982543945312,
      "learning_rate": 0.00012610441767068272,
      "loss": 3.0927,
      "step": 277
    },
    {
      "epoch": 1.1164658634538154,
      "grad_norm": 2.2922661304473877,
      "learning_rate": 0.00012583668005354754,
      "loss": 2.835,
      "step": 278
    },
    {
      "epoch": 1.1204819277108433,
      "grad_norm": 2.1880528926849365,
      "learning_rate": 0.00012556894243641233,
      "loss": 2.9581,
      "step": 279
    },
    {
      "epoch": 1.1244979919678715,
      "grad_norm": 2.5255534648895264,
      "learning_rate": 0.00012530120481927712,
      "loss": 2.7931,
      "step": 280
    },
    {
      "epoch": 1.1285140562248996,
      "grad_norm": 2.2529118061065674,
      "learning_rate": 0.00012503346720214192,
      "loss": 2.6831,
      "step": 281
    },
    {
      "epoch": 1.1325301204819278,
      "grad_norm": 2.2123444080352783,
      "learning_rate": 0.0001247657295850067,
      "loss": 2.8091,
      "step": 282
    },
    {
      "epoch": 1.1365461847389557,
      "grad_norm": 2.538160800933838,
      "learning_rate": 0.00012449799196787148,
      "loss": 3.0089,
      "step": 283
    },
    {
      "epoch": 1.140562248995984,
      "grad_norm": 3.0052592754364014,
      "learning_rate": 0.00012423025435073627,
      "loss": 3.9042,
      "step": 284
    },
    {
      "epoch": 1.144578313253012,
      "grad_norm": 2.691096067428589,
      "learning_rate": 0.0001239625167336011,
      "loss": 3.9491,
      "step": 285
    },
    {
      "epoch": 1.1485943775100402,
      "grad_norm": 2.6101088523864746,
      "learning_rate": 0.00012369477911646588,
      "loss": 2.9432,
      "step": 286
    },
    {
      "epoch": 1.1526104417670684,
      "grad_norm": 2.368319511413574,
      "learning_rate": 0.00012342704149933067,
      "loss": 2.966,
      "step": 287
    },
    {
      "epoch": 1.1566265060240963,
      "grad_norm": 2.4615232944488525,
      "learning_rate": 0.00012315930388219547,
      "loss": 3.4359,
      "step": 288
    },
    {
      "epoch": 1.1606425702811245,
      "grad_norm": 2.3296902179718018,
      "learning_rate": 0.00012289156626506023,
      "loss": 3.0168,
      "step": 289
    },
    {
      "epoch": 1.1646586345381527,
      "grad_norm": 2.7844183444976807,
      "learning_rate": 0.00012262382864792503,
      "loss": 3.1574,
      "step": 290
    },
    {
      "epoch": 1.1686746987951806,
      "grad_norm": 2.486553430557251,
      "learning_rate": 0.00012235609103078982,
      "loss": 3.1044,
      "step": 291
    },
    {
      "epoch": 1.1726907630522088,
      "grad_norm": 2.4482836723327637,
      "learning_rate": 0.00012208835341365464,
      "loss": 3.2606,
      "step": 292
    },
    {
      "epoch": 1.176706827309237,
      "grad_norm": 2.393049955368042,
      "learning_rate": 0.00012182061579651942,
      "loss": 2.9026,
      "step": 293
    },
    {
      "epoch": 1.180722891566265,
      "grad_norm": 2.8396050930023193,
      "learning_rate": 0.00012155287817938421,
      "loss": 2.9787,
      "step": 294
    },
    {
      "epoch": 1.1847389558232932,
      "grad_norm": 2.447458028793335,
      "learning_rate": 0.000121285140562249,
      "loss": 2.6885,
      "step": 295
    },
    {
      "epoch": 1.1887550200803212,
      "grad_norm": 2.3094258308410645,
      "learning_rate": 0.0001210174029451138,
      "loss": 2.9401,
      "step": 296
    },
    {
      "epoch": 1.1927710843373494,
      "grad_norm": 2.5315654277801514,
      "learning_rate": 0.00012074966532797858,
      "loss": 3.2829,
      "step": 297
    },
    {
      "epoch": 1.1967871485943775,
      "grad_norm": 2.4781811237335205,
      "learning_rate": 0.0001204819277108434,
      "loss": 2.9542,
      "step": 298
    },
    {
      "epoch": 1.2008032128514057,
      "grad_norm": 2.759524345397949,
      "learning_rate": 0.00012021419009370817,
      "loss": 3.5029,
      "step": 299
    },
    {
      "epoch": 1.2048192771084336,
      "grad_norm": 2.388485908508301,
      "learning_rate": 0.00011994645247657297,
      "loss": 2.6706,
      "step": 300
    },
    {
      "epoch": 1.2088353413654618,
      "grad_norm": 2.5414671897888184,
      "learning_rate": 0.00011967871485943776,
      "loss": 2.7898,
      "step": 301
    },
    {
      "epoch": 1.21285140562249,
      "grad_norm": 3.36741042137146,
      "learning_rate": 0.00011941097724230255,
      "loss": 2.7475,
      "step": 302
    },
    {
      "epoch": 1.216867469879518,
      "grad_norm": 2.7749950885772705,
      "learning_rate": 0.00011914323962516733,
      "loss": 2.9617,
      "step": 303
    },
    {
      "epoch": 1.2208835341365463,
      "grad_norm": 2.685976505279541,
      "learning_rate": 0.00011887550200803212,
      "loss": 3.2493,
      "step": 304
    },
    {
      "epoch": 1.2248995983935742,
      "grad_norm": 2.7357215881347656,
      "learning_rate": 0.00011860776439089693,
      "loss": 2.7249,
      "step": 305
    },
    {
      "epoch": 1.2289156626506024,
      "grad_norm": 2.962019443511963,
      "learning_rate": 0.00011834002677376172,
      "loss": 3.4647,
      "step": 306
    },
    {
      "epoch": 1.2329317269076305,
      "grad_norm": 2.891343832015991,
      "learning_rate": 0.00011807228915662652,
      "loss": 3.5527,
      "step": 307
    },
    {
      "epoch": 1.2369477911646587,
      "grad_norm": 2.7382125854492188,
      "learning_rate": 0.00011780455153949131,
      "loss": 3.1955,
      "step": 308
    },
    {
      "epoch": 1.2409638554216866,
      "grad_norm": 2.385486602783203,
      "learning_rate": 0.00011753681392235609,
      "loss": 3.022,
      "step": 309
    },
    {
      "epoch": 1.2449799196787148,
      "grad_norm": 2.553295612335205,
      "learning_rate": 0.00011726907630522088,
      "loss": 2.801,
      "step": 310
    },
    {
      "epoch": 1.248995983935743,
      "grad_norm": 2.9965014457702637,
      "learning_rate": 0.00011700133868808567,
      "loss": 2.4453,
      "step": 311
    },
    {
      "epoch": 1.2530120481927711,
      "grad_norm": 2.327629566192627,
      "learning_rate": 0.00011673360107095048,
      "loss": 2.2897,
      "step": 312
    },
    {
      "epoch": 1.2570281124497993,
      "grad_norm": 2.7544825077056885,
      "learning_rate": 0.00011646586345381527,
      "loss": 3.2796,
      "step": 313
    },
    {
      "epoch": 1.2610441767068274,
      "grad_norm": 2.590733051300049,
      "learning_rate": 0.00011619812583668007,
      "loss": 2.9126,
      "step": 314
    },
    {
      "epoch": 1.2650602409638554,
      "grad_norm": 3.3064663410186768,
      "learning_rate": 0.00011593038821954485,
      "loss": 3.6784,
      "step": 315
    },
    {
      "epoch": 1.2690763052208835,
      "grad_norm": 3.3928616046905518,
      "learning_rate": 0.00011566265060240964,
      "loss": 3.3292,
      "step": 316
    },
    {
      "epoch": 1.2730923694779117,
      "grad_norm": 2.6576473712921143,
      "learning_rate": 0.00011539491298527443,
      "loss": 3.0617,
      "step": 317
    },
    {
      "epoch": 1.2771084337349397,
      "grad_norm": 2.5956337451934814,
      "learning_rate": 0.00011512717536813924,
      "loss": 2.9754,
      "step": 318
    },
    {
      "epoch": 1.2811244979919678,
      "grad_norm": 2.8080995082855225,
      "learning_rate": 0.00011485943775100403,
      "loss": 3.1712,
      "step": 319
    },
    {
      "epoch": 1.285140562248996,
      "grad_norm": 2.4304864406585693,
      "learning_rate": 0.00011459170013386882,
      "loss": 3.0387,
      "step": 320
    },
    {
      "epoch": 1.2891566265060241,
      "grad_norm": 2.2777411937713623,
      "learning_rate": 0.0001143239625167336,
      "loss": 2.8357,
      "step": 321
    },
    {
      "epoch": 1.2931726907630523,
      "grad_norm": 2.370192289352417,
      "learning_rate": 0.0001140562248995984,
      "loss": 2.5937,
      "step": 322
    },
    {
      "epoch": 1.2971887550200802,
      "grad_norm": 3.0521585941314697,
      "learning_rate": 0.00011378848728246319,
      "loss": 4.4271,
      "step": 323
    },
    {
      "epoch": 1.3012048192771084,
      "grad_norm": 2.4153242111206055,
      "learning_rate": 0.00011352074966532798,
      "loss": 2.7952,
      "step": 324
    },
    {
      "epoch": 1.3052208835341366,
      "grad_norm": 2.629312038421631,
      "learning_rate": 0.00011325301204819279,
      "loss": 3.6324,
      "step": 325
    },
    {
      "epoch": 1.3092369477911647,
      "grad_norm": 2.0146517753601074,
      "learning_rate": 0.00011298527443105758,
      "loss": 2.3154,
      "step": 326
    },
    {
      "epoch": 1.3132530120481927,
      "grad_norm": 2.3414394855499268,
      "learning_rate": 0.00011271753681392236,
      "loss": 2.809,
      "step": 327
    },
    {
      "epoch": 1.3172690763052208,
      "grad_norm": 2.366577386856079,
      "learning_rate": 0.00011244979919678715,
      "loss": 3.7852,
      "step": 328
    },
    {
      "epoch": 1.321285140562249,
      "grad_norm": 2.661543130874634,
      "learning_rate": 0.00011218206157965195,
      "loss": 2.818,
      "step": 329
    },
    {
      "epoch": 1.3253012048192772,
      "grad_norm": 2.51835036277771,
      "learning_rate": 0.00011191432396251674,
      "loss": 2.8359,
      "step": 330
    },
    {
      "epoch": 1.3293172690763053,
      "grad_norm": 2.473179817199707,
      "learning_rate": 0.00011164658634538152,
      "loss": 2.8498,
      "step": 331
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 2.9637928009033203,
      "learning_rate": 0.00011137884872824634,
      "loss": 4.164,
      "step": 332
    },
    {
      "epoch": 1.3373493975903614,
      "grad_norm": 2.5028486251831055,
      "learning_rate": 0.00011111111111111112,
      "loss": 3.6701,
      "step": 333
    },
    {
      "epoch": 1.3413654618473896,
      "grad_norm": 3.149928092956543,
      "learning_rate": 0.00011084337349397591,
      "loss": 3.7949,
      "step": 334
    },
    {
      "epoch": 1.3453815261044177,
      "grad_norm": 2.7405877113342285,
      "learning_rate": 0.0001105756358768407,
      "loss": 3.2064,
      "step": 335
    },
    {
      "epoch": 1.3493975903614457,
      "grad_norm": 2.830744743347168,
      "learning_rate": 0.0001103078982597055,
      "loss": 2.8919,
      "step": 336
    },
    {
      "epoch": 1.3534136546184738,
      "grad_norm": 2.9335427284240723,
      "learning_rate": 0.00011004016064257027,
      "loss": 3.1013,
      "step": 337
    },
    {
      "epoch": 1.357429718875502,
      "grad_norm": 2.505171537399292,
      "learning_rate": 0.0001097724230254351,
      "loss": 3.206,
      "step": 338
    },
    {
      "epoch": 1.3614457831325302,
      "grad_norm": 3.127634286880493,
      "learning_rate": 0.00010950468540829987,
      "loss": 3.2454,
      "step": 339
    },
    {
      "epoch": 1.3654618473895583,
      "grad_norm": 2.7009451389312744,
      "learning_rate": 0.00010923694779116467,
      "loss": 3.0679,
      "step": 340
    },
    {
      "epoch": 1.3694779116465863,
      "grad_norm": 2.3906707763671875,
      "learning_rate": 0.00010896921017402946,
      "loss": 3.7267,
      "step": 341
    },
    {
      "epoch": 1.3734939759036144,
      "grad_norm": 2.4884233474731445,
      "learning_rate": 0.00010870147255689425,
      "loss": 3.2707,
      "step": 342
    },
    {
      "epoch": 1.3775100401606426,
      "grad_norm": 2.514148712158203,
      "learning_rate": 0.00010843373493975903,
      "loss": 3.0734,
      "step": 343
    },
    {
      "epoch": 1.3815261044176708,
      "grad_norm": 2.450438976287842,
      "learning_rate": 0.00010816599732262382,
      "loss": 2.7529,
      "step": 344
    },
    {
      "epoch": 1.3855421686746987,
      "grad_norm": 2.5931103229522705,
      "learning_rate": 0.00010789825970548863,
      "loss": 3.8578,
      "step": 345
    },
    {
      "epoch": 1.3895582329317269,
      "grad_norm": 2.386543035507202,
      "learning_rate": 0.00010763052208835342,
      "loss": 3.2145,
      "step": 346
    },
    {
      "epoch": 1.393574297188755,
      "grad_norm": 2.643378973007202,
      "learning_rate": 0.00010736278447121822,
      "loss": 2.7853,
      "step": 347
    },
    {
      "epoch": 1.3975903614457832,
      "grad_norm": 1.9885903596878052,
      "learning_rate": 0.00010709504685408301,
      "loss": 2.2022,
      "step": 348
    },
    {
      "epoch": 1.4016064257028114,
      "grad_norm": 2.6465091705322266,
      "learning_rate": 0.00010682730923694779,
      "loss": 3.5565,
      "step": 349
    },
    {
      "epoch": 1.4056224899598393,
      "grad_norm": 2.6052937507629395,
      "learning_rate": 0.00010655957161981258,
      "loss": 2.9741,
      "step": 350
    },
    {
      "epoch": 1.4096385542168675,
      "grad_norm": 2.7112314701080322,
      "learning_rate": 0.00010629183400267737,
      "loss": 4.0259,
      "step": 351
    },
    {
      "epoch": 1.4136546184738956,
      "grad_norm": 2.5356833934783936,
      "learning_rate": 0.00010602409638554218,
      "loss": 2.6879,
      "step": 352
    },
    {
      "epoch": 1.4176706827309236,
      "grad_norm": 2.745176315307617,
      "learning_rate": 0.00010575635876840697,
      "loss": 4.0105,
      "step": 353
    },
    {
      "epoch": 1.4216867469879517,
      "grad_norm": 2.5344765186309814,
      "learning_rate": 0.00010548862115127177,
      "loss": 2.9797,
      "step": 354
    },
    {
      "epoch": 1.4257028112449799,
      "grad_norm": 2.680912733078003,
      "learning_rate": 0.00010522088353413654,
      "loss": 3.3971,
      "step": 355
    },
    {
      "epoch": 1.429718875502008,
      "grad_norm": 3.498023271560669,
      "learning_rate": 0.00010495314591700134,
      "loss": 3.6706,
      "step": 356
    },
    {
      "epoch": 1.4337349397590362,
      "grad_norm": 2.4419398307800293,
      "learning_rate": 0.00010468540829986613,
      "loss": 2.6477,
      "step": 357
    },
    {
      "epoch": 1.4377510040160644,
      "grad_norm": 3.2264997959136963,
      "learning_rate": 0.00010441767068273094,
      "loss": 4.5181,
      "step": 358
    },
    {
      "epoch": 1.4417670682730923,
      "grad_norm": 2.5578315258026123,
      "learning_rate": 0.00010414993306559573,
      "loss": 2.6282,
      "step": 359
    },
    {
      "epoch": 1.4457831325301205,
      "grad_norm": 2.539045572280884,
      "learning_rate": 0.00010388219544846052,
      "loss": 2.6435,
      "step": 360
    },
    {
      "epoch": 1.4497991967871486,
      "grad_norm": 2.9697344303131104,
      "learning_rate": 0.0001036144578313253,
      "loss": 2.6676,
      "step": 361
    },
    {
      "epoch": 1.4538152610441766,
      "grad_norm": 2.606131076812744,
      "learning_rate": 0.0001033467202141901,
      "loss": 2.9316,
      "step": 362
    },
    {
      "epoch": 1.4578313253012047,
      "grad_norm": 3.290837049484253,
      "learning_rate": 0.00010307898259705489,
      "loss": 3.0869,
      "step": 363
    },
    {
      "epoch": 1.461847389558233,
      "grad_norm": 2.331320285797119,
      "learning_rate": 0.00010281124497991968,
      "loss": 2.555,
      "step": 364
    },
    {
      "epoch": 1.465863453815261,
      "grad_norm": 2.8447391986846924,
      "learning_rate": 0.00010254350736278449,
      "loss": 2.6998,
      "step": 365
    },
    {
      "epoch": 1.4698795180722892,
      "grad_norm": 2.6170618534088135,
      "learning_rate": 0.00010227576974564928,
      "loss": 2.7688,
      "step": 366
    },
    {
      "epoch": 1.4738955823293174,
      "grad_norm": 2.933560609817505,
      "learning_rate": 0.00010200803212851406,
      "loss": 3.0291,
      "step": 367
    },
    {
      "epoch": 1.4779116465863453,
      "grad_norm": 2.6285972595214844,
      "learning_rate": 0.00010174029451137885,
      "loss": 2.8629,
      "step": 368
    },
    {
|
"epoch": 1.4819277108433735, |
|
"grad_norm": 3.2716546058654785, |
|
"learning_rate": 0.00010147255689424364, |
|
"loss": 3.1994, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.4859437751004017, |
|
"grad_norm": 2.758296489715576, |
|
"learning_rate": 0.00010120481927710844, |
|
"loss": 2.6734, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.4899598393574296, |
|
"grad_norm": 2.3439807891845703, |
|
"learning_rate": 0.00010093708165997322, |
|
"loss": 2.8747, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.4939759036144578, |
|
"grad_norm": 2.4199349880218506, |
|
"learning_rate": 0.00010066934404283804, |
|
"loss": 2.7135, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.497991967871486, |
|
"grad_norm": 2.8863987922668457, |
|
"learning_rate": 0.00010040160642570282, |
|
"loss": 3.3239, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.502008032128514, |
|
"grad_norm": 2.5620765686035156, |
|
"learning_rate": 0.00010013386880856761, |
|
"loss": 2.5748, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.5060240963855422, |
|
"grad_norm": 2.5705456733703613, |
|
"learning_rate": 9.98661311914324e-05, |
|
"loss": 3.4645, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.5100401606425704, |
|
"grad_norm": 2.75276780128479, |
|
"learning_rate": 9.95983935742972e-05, |
|
"loss": 2.7345, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.5140562248995983, |
|
"grad_norm": 2.5206143856048584, |
|
"learning_rate": 9.933065595716199e-05, |
|
"loss": 2.8325, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.5180722891566265, |
|
"grad_norm": 2.3054890632629395, |
|
"learning_rate": 9.906291834002678e-05, |
|
"loss": 2.884, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.5220883534136547, |
|
"grad_norm": 2.563084125518799, |
|
"learning_rate": 9.879518072289157e-05, |
|
"loss": 3.0262, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.5261044176706826, |
|
"grad_norm": 2.575040817260742, |
|
"learning_rate": 9.852744310575637e-05, |
|
"loss": 3.096, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.5301204819277108, |
|
"grad_norm": 2.3715319633483887, |
|
"learning_rate": 9.825970548862116e-05, |
|
"loss": 2.7163, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.534136546184739, |
|
"grad_norm": 2.7323389053344727, |
|
"learning_rate": 9.799196787148595e-05, |
|
"loss": 2.792, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.538152610441767, |
|
"grad_norm": 2.523524522781372, |
|
"learning_rate": 9.772423025435074e-05, |
|
"loss": 3.2821, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.5421686746987953, |
|
"grad_norm": 2.533090114593506, |
|
"learning_rate": 9.745649263721554e-05, |
|
"loss": 2.7672, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.5461847389558234, |
|
"grad_norm": 2.644031286239624, |
|
"learning_rate": 9.718875502008033e-05, |
|
"loss": 3.0318, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.5502008032128514, |
|
"grad_norm": 3.1442739963531494, |
|
"learning_rate": 9.692101740294511e-05, |
|
"loss": 3.6628, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.5542168674698795, |
|
"grad_norm": 2.403552532196045, |
|
"learning_rate": 9.665327978580992e-05, |
|
"loss": 2.4332, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.5582329317269075, |
|
"grad_norm": 2.478534698486328, |
|
"learning_rate": 9.638554216867471e-05, |
|
"loss": 2.4746, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.5622489959839356, |
|
"grad_norm": 2.7873339653015137, |
|
"learning_rate": 9.611780455153949e-05, |
|
"loss": 2.8514, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.5662650602409638, |
|
"grad_norm": 2.751532793045044, |
|
"learning_rate": 9.58500669344043e-05, |
|
"loss": 2.9365, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.570281124497992, |
|
"grad_norm": 2.8862998485565186, |
|
"learning_rate": 9.558232931726909e-05, |
|
"loss": 3.2632, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.5742971887550201, |
|
"grad_norm": 2.5372817516326904, |
|
"learning_rate": 9.531459170013387e-05, |
|
"loss": 2.8649, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.5783132530120483, |
|
"grad_norm": 2.428025007247925, |
|
"learning_rate": 9.504685408299867e-05, |
|
"loss": 2.6417, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.5823293172690764, |
|
"grad_norm": 3.284771680831909, |
|
"learning_rate": 9.477911646586346e-05, |
|
"loss": 3.4804, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.5863453815261044, |
|
"grad_norm": 2.8651950359344482, |
|
"learning_rate": 9.451137884872824e-05, |
|
"loss": 3.1454, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.5903614457831325, |
|
"grad_norm": 3.078660011291504, |
|
"learning_rate": 9.424364123159304e-05, |
|
"loss": 3.5961, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.5943775100401605, |
|
"grad_norm": 2.2207376956939697, |
|
"learning_rate": 9.397590361445784e-05, |
|
"loss": 2.3121, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.5983935742971886, |
|
"grad_norm": 2.4094178676605225, |
|
"learning_rate": 9.370816599732262e-05, |
|
"loss": 2.7138, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.6024096385542168, |
|
"grad_norm": 2.759876251220703, |
|
"learning_rate": 9.344042838018742e-05, |
|
"loss": 3.5605, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.606425702811245, |
|
"grad_norm": 2.189237117767334, |
|
"learning_rate": 9.317269076305222e-05, |
|
"loss": 2.6023, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.6104417670682731, |
|
"grad_norm": 2.585479736328125, |
|
"learning_rate": 9.2904953145917e-05, |
|
"loss": 3.2234, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.6144578313253013, |
|
"grad_norm": 2.565342664718628, |
|
"learning_rate": 9.26372155287818e-05, |
|
"loss": 3.0341, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.6184738955823295, |
|
"grad_norm": 2.4045302867889404, |
|
"learning_rate": 9.23694779116466e-05, |
|
"loss": 2.7032, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.6224899598393574, |
|
"grad_norm": 3.0136139392852783, |
|
"learning_rate": 9.210174029451138e-05, |
|
"loss": 3.1651, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.6265060240963856, |
|
"grad_norm": 2.253669261932373, |
|
"learning_rate": 9.183400267737617e-05, |
|
"loss": 2.2507, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.6305220883534135, |
|
"grad_norm": 2.734966993331909, |
|
"learning_rate": 9.156626506024096e-05, |
|
"loss": 3.0798, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.6345381526104417, |
|
"grad_norm": 2.955502986907959, |
|
"learning_rate": 9.129852744310576e-05, |
|
"loss": 3.086, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.6385542168674698, |
|
"grad_norm": 3.2345542907714844, |
|
"learning_rate": 9.103078982597055e-05, |
|
"loss": 3.3553, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.642570281124498, |
|
"grad_norm": 2.7762720584869385, |
|
"learning_rate": 9.076305220883534e-05, |
|
"loss": 3.4238, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.6465863453815262, |
|
"grad_norm": 2.824641466140747, |
|
"learning_rate": 9.049531459170014e-05, |
|
"loss": 2.8925, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.6506024096385543, |
|
"grad_norm": 2.754810094833374, |
|
"learning_rate": 9.022757697456493e-05, |
|
"loss": 2.9022, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.6546184738955825, |
|
"grad_norm": 2.5305283069610596, |
|
"learning_rate": 8.995983935742972e-05, |
|
"loss": 2.927, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.6586345381526104, |
|
"grad_norm": 2.796165943145752, |
|
"learning_rate": 8.969210174029451e-05, |
|
"loss": 2.9185, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.6626506024096386, |
|
"grad_norm": 2.9504239559173584, |
|
"learning_rate": 8.942436412315931e-05, |
|
"loss": 3.3915, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 2.8904786109924316, |
|
"learning_rate": 8.91566265060241e-05, |
|
"loss": 2.8841, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.6706827309236947, |
|
"grad_norm": 2.184354305267334, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 2.3859, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.6746987951807228, |
|
"grad_norm": 3.1552340984344482, |
|
"learning_rate": 8.862115127175369e-05, |
|
"loss": 3.22, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.678714859437751, |
|
"grad_norm": 3.2323250770568848, |
|
"learning_rate": 8.835341365461848e-05, |
|
"loss": 2.8859, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.6827309236947792, |
|
"grad_norm": 2.726513147354126, |
|
"learning_rate": 8.808567603748327e-05, |
|
"loss": 3.0969, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.6867469879518073, |
|
"grad_norm": 2.7404675483703613, |
|
"learning_rate": 8.781793842034806e-05, |
|
"loss": 2.76, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.6907630522088355, |
|
"grad_norm": 3.433872699737549, |
|
"learning_rate": 8.755020080321286e-05, |
|
"loss": 3.1852, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.6947791164658634, |
|
"grad_norm": 3.4727306365966797, |
|
"learning_rate": 8.728246318607765e-05, |
|
"loss": 3.6413, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.6987951807228916, |
|
"grad_norm": 2.968161106109619, |
|
"learning_rate": 8.701472556894244e-05, |
|
"loss": 3.15, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.7028112449799195, |
|
"grad_norm": 2.8164682388305664, |
|
"learning_rate": 8.674698795180724e-05, |
|
"loss": 3.0286, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.7068273092369477, |
|
"grad_norm": 2.7942745685577393, |
|
"learning_rate": 8.647925033467203e-05, |
|
"loss": 3.2501, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.7108433734939759, |
|
"grad_norm": 3.2419016361236572, |
|
"learning_rate": 8.621151271753681e-05, |
|
"loss": 4.3181, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.714859437751004, |
|
"grad_norm": 3.3823928833007812, |
|
"learning_rate": 8.594377510040161e-05, |
|
"loss": 3.2917, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.7188755020080322, |
|
"grad_norm": 2.8482446670532227, |
|
"learning_rate": 8.567603748326641e-05, |
|
"loss": 3.0338, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.7228915662650603, |
|
"grad_norm": 2.435845375061035, |
|
"learning_rate": 8.540829986613119e-05, |
|
"loss": 2.5519, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.7269076305220885, |
|
"grad_norm": 2.9163546562194824, |
|
"learning_rate": 8.514056224899599e-05, |
|
"loss": 3.72, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.7309236947791165, |
|
"grad_norm": 2.3660037517547607, |
|
"learning_rate": 8.487282463186079e-05, |
|
"loss": 2.3941, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.7349397590361446, |
|
"grad_norm": 2.527449131011963, |
|
"learning_rate": 8.460508701472556e-05, |
|
"loss": 2.9851, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.7389558232931726, |
|
"grad_norm": 2.2324576377868652, |
|
"learning_rate": 8.433734939759037e-05, |
|
"loss": 2.6241, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.7429718875502007, |
|
"grad_norm": 2.7165253162384033, |
|
"learning_rate": 8.406961178045516e-05, |
|
"loss": 2.7749, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.7469879518072289, |
|
"grad_norm": 2.7401411533355713, |
|
"learning_rate": 8.380187416331994e-05, |
|
"loss": 2.9022, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.751004016064257, |
|
"grad_norm": 2.518826961517334, |
|
"learning_rate": 8.353413654618474e-05, |
|
"loss": 2.7587, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.7550200803212852, |
|
"grad_norm": 2.493936061859131, |
|
"learning_rate": 8.326639892904954e-05, |
|
"loss": 3.1417, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.7590361445783134, |
|
"grad_norm": 2.747951030731201, |
|
"learning_rate": 8.299866131191432e-05, |
|
"loss": 2.6913, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.7630522088353415, |
|
"grad_norm": 2.8907039165496826, |
|
"learning_rate": 8.273092369477911e-05, |
|
"loss": 2.4416, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.7670682730923695, |
|
"grad_norm": 3.6564669609069824, |
|
"learning_rate": 8.246318607764392e-05, |
|
"loss": 3.9361, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.7710843373493976, |
|
"grad_norm": 2.4362285137176514, |
|
"learning_rate": 8.21954484605087e-05, |
|
"loss": 2.461, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.7751004016064256, |
|
"grad_norm": 3.2182202339172363, |
|
"learning_rate": 8.192771084337349e-05, |
|
"loss": 3.2511, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.7791164658634537, |
|
"grad_norm": 3.2106211185455322, |
|
"learning_rate": 8.16599732262383e-05, |
|
"loss": 4.4307, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.783132530120482, |
|
"grad_norm": 3.4369003772735596, |
|
"learning_rate": 8.139223560910308e-05, |
|
"loss": 4.08, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.78714859437751, |
|
"grad_norm": 2.2681970596313477, |
|
"learning_rate": 8.112449799196787e-05, |
|
"loss": 2.3631, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.7911646586345382, |
|
"grad_norm": 2.691133975982666, |
|
"learning_rate": 8.085676037483266e-05, |
|
"loss": 2.6157, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.7951807228915664, |
|
"grad_norm": 2.9200479984283447, |
|
"learning_rate": 8.058902275769746e-05, |
|
"loss": 2.6649, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.7991967871485943, |
|
"grad_norm": 2.787264108657837, |
|
"learning_rate": 8.032128514056225e-05, |
|
"loss": 2.763, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.8032128514056225, |
|
"grad_norm": 2.940075635910034, |
|
"learning_rate": 8.005354752342704e-05, |
|
"loss": 2.9436, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.8072289156626506, |
|
"grad_norm": 3.1111507415771484, |
|
"learning_rate": 7.978580990629184e-05, |
|
"loss": 3.1194, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.8112449799196786, |
|
"grad_norm": 2.695709228515625, |
|
"learning_rate": 7.951807228915663e-05, |
|
"loss": 2.7517, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.8152610441767068, |
|
"grad_norm": 2.939112663269043, |
|
"learning_rate": 7.925033467202142e-05, |
|
"loss": 3.7794, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.819277108433735, |
|
"grad_norm": 2.583163022994995, |
|
"learning_rate": 7.898259705488621e-05, |
|
"loss": 3.0265, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.823293172690763, |
|
"grad_norm": 2.496131181716919, |
|
"learning_rate": 7.8714859437751e-05, |
|
"loss": 2.5762, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.8273092369477912, |
|
"grad_norm": 2.4272570610046387, |
|
"learning_rate": 7.84471218206158e-05, |
|
"loss": 2.758, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.8313253012048194, |
|
"grad_norm": 2.4154021739959717, |
|
"learning_rate": 7.817938420348059e-05, |
|
"loss": 2.7325, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.8353413654618473, |
|
"grad_norm": 2.5219106674194336, |
|
"learning_rate": 7.791164658634539e-05, |
|
"loss": 2.779, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.8393574297188755, |
|
"grad_norm": 2.3390161991119385, |
|
"learning_rate": 7.764390896921018e-05, |
|
"loss": 2.2922, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.8433734939759037, |
|
"grad_norm": 2.7101354598999023, |
|
"learning_rate": 7.737617135207497e-05, |
|
"loss": 2.9825, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.8473895582329316, |
|
"grad_norm": 2.8510243892669678, |
|
"learning_rate": 7.710843373493976e-05, |
|
"loss": 2.8628, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.8514056224899598, |
|
"grad_norm": 2.6924989223480225, |
|
"learning_rate": 7.684069611780456e-05, |
|
"loss": 2.6543, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.855421686746988, |
|
"grad_norm": 2.6552584171295166, |
|
"learning_rate": 7.657295850066935e-05, |
|
"loss": 3.0625, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.859437751004016, |
|
"grad_norm": 3.2962827682495117, |
|
"learning_rate": 7.630522088353414e-05, |
|
"loss": 3.308, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.8634538152610443, |
|
"grad_norm": 3.0845699310302734, |
|
"learning_rate": 7.603748326639893e-05, |
|
"loss": 3.5178, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.8674698795180724, |
|
"grad_norm": 2.768254518508911, |
|
"learning_rate": 7.576974564926373e-05, |
|
"loss": 3.6667, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.8714859437751004, |
|
"grad_norm": 2.5801167488098145, |
|
"learning_rate": 7.550200803212851e-05, |
|
"loss": 2.7686, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.8755020080321285, |
|
"grad_norm": 2.2853081226348877, |
|
"learning_rate": 7.523427041499331e-05, |
|
"loss": 2.2115, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.8795180722891565, |
|
"grad_norm": 2.9309747219085693, |
|
"learning_rate": 7.49665327978581e-05, |
|
"loss": 2.9426, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.8835341365461846, |
|
"grad_norm": 3.146700143814087, |
|
"learning_rate": 7.469879518072289e-05, |
|
"loss": 3.3903, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.8875502008032128, |
|
"grad_norm": 3.3652424812316895, |
|
"learning_rate": 7.443105756358769e-05, |
|
"loss": 3.0085, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.891566265060241, |
|
"grad_norm": 2.424377918243408, |
|
"learning_rate": 7.416331994645248e-05, |
|
"loss": 2.5145, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.895582329317269, |
|
"grad_norm": 2.5642752647399902, |
|
"learning_rate": 7.389558232931726e-05, |
|
"loss": 3.1927, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.8995983935742973, |
|
"grad_norm": 2.7574706077575684, |
|
"learning_rate": 7.362784471218207e-05, |
|
"loss": 2.6753, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.9036144578313254, |
|
"grad_norm": 2.6844048500061035, |
|
"learning_rate": 7.336010709504686e-05, |
|
"loss": 2.7126, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.9076305220883534, |
|
"grad_norm": 2.3251895904541016, |
|
"learning_rate": 7.309236947791164e-05, |
|
"loss": 2.5947, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.9116465863453815, |
|
"grad_norm": 2.1562206745147705, |
|
"learning_rate": 7.282463186077644e-05, |
|
"loss": 2.2137, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.9156626506024095, |
|
"grad_norm": 2.400747776031494, |
|
"learning_rate": 7.255689424364124e-05, |
|
"loss": 2.8869, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.9196787148594376, |
|
"grad_norm": 3.1380369663238525, |
|
"learning_rate": 7.228915662650602e-05, |
|
"loss": 3.4202, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.9236947791164658, |
|
"grad_norm": 2.9858291149139404, |
|
"learning_rate": 7.202141900937081e-05, |
|
"loss": 3.1519, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.927710843373494, |
|
"grad_norm": 2.6354973316192627, |
|
"learning_rate": 7.175368139223562e-05, |
|
"loss": 2.8662, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.9317269076305221, |
|
"grad_norm": 2.7349445819854736, |
|
"learning_rate": 7.14859437751004e-05, |
|
"loss": 4.2679, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.9357429718875503, |
|
"grad_norm": 3.0139505863189697, |
|
"learning_rate": 7.121820615796519e-05, |
|
"loss": 2.9382, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.9397590361445785, |
|
"grad_norm": 3.1879093647003174, |
|
"learning_rate": 7.095046854083e-05, |
|
"loss": 3.168, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.9437751004016064, |
|
"grad_norm": 3.2778398990631104, |
|
"learning_rate": 7.068273092369478e-05, |
|
"loss": 3.4373, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.9477911646586346, |
|
"grad_norm": 3.024111747741699, |
|
"learning_rate": 7.041499330655957e-05, |
|
"loss": 3.7807, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.9518072289156625, |
|
"grad_norm": 2.750593423843384, |
|
"learning_rate": 7.014725568942436e-05, |
|
"loss": 3.4546, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.9558232931726907, |
|
"grad_norm": 2.9757187366485596, |
|
"learning_rate": 6.987951807228917e-05, |
|
"loss": 3.0145, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.9598393574297188, |
|
"grad_norm": 2.867292881011963, |
|
"learning_rate": 6.961178045515395e-05, |
|
"loss": 2.5524, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.963855421686747, |
|
"grad_norm": 2.563595771789551, |
|
"learning_rate": 6.934404283801874e-05, |
|
"loss": 2.7503, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.9678714859437751, |
|
"grad_norm": 2.52006459236145, |
|
"learning_rate": 6.907630522088355e-05, |
|
"loss": 3.0431, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.9718875502008033, |
|
"grad_norm": 3.0700199604034424, |
|
"learning_rate": 6.880856760374833e-05, |
|
"loss": 3.7242, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.9759036144578315, |
|
"grad_norm": 2.7504234313964844, |
|
"learning_rate": 6.854082998661312e-05, |
|
"loss": 2.6293, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.9799196787148594, |
|
"grad_norm": 2.919828414916992, |
|
"learning_rate": 6.827309236947793e-05, |
|
"loss": 2.6278, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.9839357429718876, |
|
"grad_norm": 2.453157663345337, |
|
"learning_rate": 6.80053547523427e-05, |
|
"loss": 2.2764, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.9879518072289155, |
|
"grad_norm": 2.635430335998535, |
|
"learning_rate": 6.77376171352075e-05, |
|
"loss": 2.9467, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.9919678714859437, |
|
"grad_norm": 2.7158102989196777, |
|
"learning_rate": 6.746987951807229e-05, |
|
"loss": 2.7886, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.9959839357429718, |
|
"grad_norm": 2.3272292613983154, |
|
"learning_rate": 6.720214190093708e-05, |
|
"loss": 2.6445, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.2954020500183105, |
|
"learning_rate": 6.693440428380188e-05, |
|
"loss": 2.5719, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.8565791249275208, |
|
"eval_runtime": 200.8505, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 1.245, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 2.004016064257028, |
|
"grad_norm": 2.3647961616516113, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 2.5357, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 2.0080321285140563, |
|
"grad_norm": 2.052393674850464, |
|
"learning_rate": 6.639892904953146e-05, |
|
"loss": 2.1653, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0120481927710845, |
|
"grad_norm": 2.6393344402313232, |
|
"learning_rate": 6.613119143239626e-05, |
|
"loss": 2.2634, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 2.0160642570281126, |
|
"grad_norm": 2.4461183547973633, |
|
"learning_rate": 6.586345381526105e-05, |
|
"loss": 2.7017, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 2.0200803212851404, |
|
"grad_norm": 3.1604115962982178, |
|
"learning_rate": 6.559571619812584e-05, |
|
"loss": 3.6735, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 2.0240963855421685, |
|
"grad_norm": 3.0627472400665283, |
|
"learning_rate": 6.532797858099063e-05, |
|
"loss": 2.9889, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 2.0281124497991967, |
|
"grad_norm": 2.568150520324707, |
|
"learning_rate": 6.506024096385543e-05, |
|
"loss": 2.492, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.032128514056225, |
|
"grad_norm": 2.2594618797302246, |
|
"learning_rate": 6.47925033467202e-05, |
|
"loss": 1.8152, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 2.036144578313253, |
|
"grad_norm": 2.544188976287842, |
|
"learning_rate": 6.452476572958501e-05, |
|
"loss": 3.7016, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 2.040160642570281, |
|
"grad_norm": 2.418565511703491, |
|
"learning_rate": 6.42570281124498e-05, |
|
"loss": 2.3062, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 2.0441767068273093, |
|
"grad_norm": 2.3617923259735107, |
|
"learning_rate": 6.398929049531458e-05, |
|
"loss": 2.2887, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 2.0481927710843375, |
|
"grad_norm": 2.4115524291992188, |
|
"learning_rate": 6.372155287817939e-05, |
|
"loss": 2.4596, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.0522088353413657, |
|
"grad_norm": 2.763218402862549, |
|
"learning_rate": 6.345381526104418e-05, |
|
"loss": 2.7423, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 2.0562248995983934, |
|
"grad_norm": 2.515378713607788, |
|
"learning_rate": 6.318607764390896e-05, |
|
"loss": 2.4356, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 2.0602409638554215, |
|
"grad_norm": 2.809786796569824, |
|
"learning_rate": 6.291834002677377e-05, |
|
"loss": 3.3361, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 2.0642570281124497, |
|
"grad_norm": 2.3717005252838135, |
|
"learning_rate": 6.265060240963856e-05, |
|
"loss": 3.0205, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 2.068273092369478, |
|
"grad_norm": 2.7689290046691895, |
|
"learning_rate": 6.238286479250335e-05, |
|
"loss": 2.9104, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.072289156626506, |
|
"grad_norm": 2.573058843612671, |
|
"learning_rate": 6.211512717536813e-05, |
|
"loss": 2.2966, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 2.076305220883534, |
|
"grad_norm": 2.5662682056427, |
|
"learning_rate": 6.184738955823294e-05, |
|
"loss": 2.4407, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 2.0803212851405624, |
|
"grad_norm": 2.475853681564331, |
|
"learning_rate": 6.157965194109773e-05, |
|
"loss": 2.2512, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 2.0843373493975905, |
|
"grad_norm": 2.426939010620117, |
|
"learning_rate": 6.131191432396251e-05, |
|
"loss": 2.2575, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 2.0883534136546187, |
|
"grad_norm": 2.709951877593994, |
|
"learning_rate": 6.104417670682732e-05, |
|
"loss": 2.2289, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.0923694779116464, |
|
"grad_norm": 2.620199680328369, |
|
"learning_rate": 6.0776439089692105e-05, |
|
"loss": 2.6856, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 2.0963855421686746, |
|
"grad_norm": 2.236469030380249, |
|
"learning_rate": 6.05087014725569e-05, |
|
"loss": 2.1652, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 2.1004016064257027, |
|
"grad_norm": 2.4781830310821533, |
|
"learning_rate": 6.02409638554217e-05, |
|
"loss": 2.0519, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 2.104417670682731, |
|
"grad_norm": 2.9179675579071045, |
|
"learning_rate": 5.9973226238286484e-05, |
|
"loss": 2.3534, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 2.108433734939759, |
|
"grad_norm": 2.7088980674743652, |
|
"learning_rate": 5.9705488621151276e-05, |
|
"loss": 2.3717, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.112449799196787, |
|
"grad_norm": 2.784228801727295, |
|
"learning_rate": 5.943775100401606e-05, |
|
"loss": 2.7936, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 2.1164658634538154, |
|
"grad_norm": 3.1045587062835693, |
|
"learning_rate": 5.917001338688086e-05, |
|
"loss": 2.1785, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 2.1204819277108435, |
|
"grad_norm": 2.7609670162200928, |
|
"learning_rate": 5.8902275769745655e-05, |
|
"loss": 2.4232, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 2.1244979919678713, |
|
"grad_norm": 2.9791460037231445, |
|
"learning_rate": 5.863453815261044e-05, |
|
"loss": 2.6127, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 2.1285140562248994, |
|
"grad_norm": 2.917396306991577, |
|
"learning_rate": 5.836680053547524e-05, |
|
"loss": 2.5008, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.1325301204819276, |
|
"grad_norm": 3.066033124923706, |
|
"learning_rate": 5.809906291834003e-05, |
|
"loss": 2.8997, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 2.1365461847389557, |
|
"grad_norm": 2.570894241333008, |
|
"learning_rate": 5.783132530120482e-05, |
|
"loss": 2.2987, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 2.140562248995984, |
|
"grad_norm": 2.4431967735290527, |
|
"learning_rate": 5.756358768406962e-05, |
|
"loss": 2.1485, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 2.144578313253012, |
|
"grad_norm": 2.789560079574585, |
|
"learning_rate": 5.729585006693441e-05, |
|
"loss": 2.3678, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 2.1485943775100402, |
|
"grad_norm": 2.691913366317749, |
|
"learning_rate": 5.70281124497992e-05, |
|
"loss": 2.3469, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.1526104417670684, |
|
"grad_norm": 2.472721815109253, |
|
"learning_rate": 5.676037483266399e-05, |
|
"loss": 2.0741, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 2.1566265060240966, |
|
"grad_norm": 2.705008029937744, |
|
"learning_rate": 5.649263721552879e-05, |
|
"loss": 2.3399, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 2.1606425702811247, |
|
"grad_norm": 2.8036177158355713, |
|
"learning_rate": 5.6224899598393576e-05, |
|
"loss": 2.4336, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 2.1646586345381524, |
|
"grad_norm": 2.8112568855285645, |
|
"learning_rate": 5.595716198125837e-05, |
|
"loss": 2.4039, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 2.1686746987951806, |
|
"grad_norm": 2.932802438735962, |
|
"learning_rate": 5.568942436412317e-05, |
|
"loss": 2.4175, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.1726907630522088, |
|
"grad_norm": 3.0952837467193604, |
|
"learning_rate": 5.5421686746987955e-05, |
|
"loss": 2.4552, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 2.176706827309237, |
|
"grad_norm": 2.6719419956207275, |
|
"learning_rate": 5.515394912985275e-05, |
|
"loss": 2.0765, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 2.180722891566265, |
|
"grad_norm": 3.0576534271240234, |
|
"learning_rate": 5.488621151271755e-05, |
|
"loss": 2.417, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 2.1847389558232932, |
|
"grad_norm": 3.0612807273864746, |
|
"learning_rate": 5.461847389558233e-05, |
|
"loss": 2.9868, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 2.1887550200803214, |
|
"grad_norm": 3.5036559104919434, |
|
"learning_rate": 5.4350736278447126e-05, |
|
"loss": 2.7975, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.1927710843373496, |
|
"grad_norm": 3.5645198822021484, |
|
"learning_rate": 5.408299866131191e-05, |
|
"loss": 2.8446, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 2.1967871485943773, |
|
"grad_norm": 2.72088360786438, |
|
"learning_rate": 5.381526104417671e-05, |
|
"loss": 2.3907, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 2.2008032128514055, |
|
"grad_norm": 3.901146411895752, |
|
"learning_rate": 5.3547523427041504e-05, |
|
"loss": 3.4091, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 2.2048192771084336, |
|
"grad_norm": 2.9762930870056152, |
|
"learning_rate": 5.327978580990629e-05, |
|
"loss": 2.2808, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 2.208835341365462, |
|
"grad_norm": 3.1252336502075195, |
|
"learning_rate": 5.301204819277109e-05, |
|
"loss": 2.3206, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.21285140562249, |
|
"grad_norm": 3.61395525932312, |
|
"learning_rate": 5.274431057563588e-05, |
|
"loss": 2.9899, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 2.216867469879518, |
|
"grad_norm": 3.035787582397461, |
|
"learning_rate": 5.247657295850067e-05, |
|
"loss": 2.2514, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 2.2208835341365463, |
|
"grad_norm": 3.0700008869171143, |
|
"learning_rate": 5.220883534136547e-05, |
|
"loss": 2.7965, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 2.2248995983935744, |
|
"grad_norm": 3.380383253097534, |
|
"learning_rate": 5.194109772423026e-05, |
|
"loss": 2.7258, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 2.2289156626506026, |
|
"grad_norm": 3.3445475101470947, |
|
"learning_rate": 5.167336010709505e-05, |
|
"loss": 3.0532, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.2329317269076308, |
|
"grad_norm": 3.305169105529785, |
|
"learning_rate": 5.140562248995984e-05, |
|
"loss": 2.7851, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 2.2369477911646585, |
|
"grad_norm": 3.3952481746673584, |
|
"learning_rate": 5.113788487282464e-05, |
|
"loss": 2.6845, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 2.2409638554216866, |
|
"grad_norm": 2.7673559188842773, |
|
"learning_rate": 5.0870147255689426e-05, |
|
"loss": 2.6067, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 2.244979919678715, |
|
"grad_norm": 3.3448803424835205, |
|
"learning_rate": 5.060240963855422e-05, |
|
"loss": 2.4804, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 2.248995983935743, |
|
"grad_norm": 2.797827959060669, |
|
"learning_rate": 5.033467202141902e-05, |
|
"loss": 2.1237, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.253012048192771, |
|
"grad_norm": 2.9383599758148193, |
|
"learning_rate": 5.0066934404283804e-05, |
|
"loss": 2.3107, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 2.2570281124497993, |
|
"grad_norm": 3.0028162002563477, |
|
"learning_rate": 4.97991967871486e-05, |
|
"loss": 3.2211, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 2.2610441767068274, |
|
"grad_norm": 2.928341865539551, |
|
"learning_rate": 4.953145917001339e-05, |
|
"loss": 2.5173, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 2.2650602409638556, |
|
"grad_norm": 2.9720232486724854, |
|
"learning_rate": 4.926372155287818e-05, |
|
"loss": 2.3146, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 2.2690763052208833, |
|
"grad_norm": 3.558094024658203, |
|
"learning_rate": 4.8995983935742975e-05, |
|
"loss": 3.1953, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.2730923694779115, |
|
"grad_norm": 3.0352494716644287, |
|
"learning_rate": 4.872824631860777e-05, |
|
"loss": 2.4965, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 2.2771084337349397, |
|
"grad_norm": 2.7428176403045654, |
|
"learning_rate": 4.8460508701472554e-05, |
|
"loss": 2.1514, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 2.281124497991968, |
|
"grad_norm": 2.3594534397125244, |
|
"learning_rate": 4.8192771084337354e-05, |
|
"loss": 1.8075, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 2.285140562248996, |
|
"grad_norm": 3.3449742794036865, |
|
"learning_rate": 4.792503346720215e-05, |
|
"loss": 2.5945, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 2.289156626506024, |
|
"grad_norm": 3.104633331298828, |
|
"learning_rate": 4.765729585006693e-05, |
|
"loss": 2.9666, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.2931726907630523, |
|
"grad_norm": 3.094238758087158, |
|
"learning_rate": 4.738955823293173e-05, |
|
"loss": 2.489, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 2.2971887550200805, |
|
"grad_norm": 3.381775379180908, |
|
"learning_rate": 4.712182061579652e-05, |
|
"loss": 2.9042, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 2.3012048192771086, |
|
"grad_norm": 3.2117156982421875, |
|
"learning_rate": 4.685408299866131e-05, |
|
"loss": 2.6925, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 2.305220883534137, |
|
"grad_norm": 2.8267903327941895, |
|
"learning_rate": 4.658634538152611e-05, |
|
"loss": 2.3816, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 2.3092369477911645, |
|
"grad_norm": 3.068437099456787, |
|
"learning_rate": 4.63186077643909e-05, |
|
"loss": 2.3124, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.3132530120481927, |
|
"grad_norm": 2.832303762435913, |
|
"learning_rate": 4.605087014725569e-05, |
|
"loss": 2.5169, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 2.317269076305221, |
|
"grad_norm": 2.8893704414367676, |
|
"learning_rate": 4.578313253012048e-05, |
|
"loss": 2.3119, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 2.321285140562249, |
|
"grad_norm": 2.952976703643799, |
|
"learning_rate": 4.5515394912985275e-05, |
|
"loss": 2.3063, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 2.325301204819277, |
|
"grad_norm": 2.7303566932678223, |
|
"learning_rate": 4.524765729585007e-05, |
|
"loss": 2.5834, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 2.3293172690763053, |
|
"grad_norm": 2.9680216312408447, |
|
"learning_rate": 4.497991967871486e-05, |
|
"loss": 2.249, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.3333333333333335, |
|
"grad_norm": 2.997044324874878, |
|
"learning_rate": 4.4712182061579654e-05, |
|
"loss": 2.5954, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 2.337349397590361, |
|
"grad_norm": 3.4494729042053223, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 3.1359, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 2.3413654618473894, |
|
"grad_norm": 3.1353585720062256, |
|
"learning_rate": 4.417670682730924e-05, |
|
"loss": 2.4317, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 2.3453815261044175, |
|
"grad_norm": 2.9816396236419678, |
|
"learning_rate": 4.390896921017403e-05, |
|
"loss": 2.8438, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 2.3493975903614457, |
|
"grad_norm": 2.6249794960021973, |
|
"learning_rate": 4.3641231593038825e-05, |
|
"loss": 2.0497, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.353413654618474, |
|
"grad_norm": 2.8994345664978027, |
|
"learning_rate": 4.337349397590362e-05, |
|
"loss": 2.149, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 2.357429718875502, |
|
"grad_norm": 3.8927950859069824, |
|
"learning_rate": 4.3105756358768404e-05, |
|
"loss": 3.0218, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 2.36144578313253, |
|
"grad_norm": 3.120274543762207, |
|
"learning_rate": 4.2838018741633203e-05, |
|
"loss": 2.1973, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 2.3654618473895583, |
|
"grad_norm": 3.104851007461548, |
|
"learning_rate": 4.2570281124497996e-05, |
|
"loss": 2.3442, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 2.3694779116465865, |
|
"grad_norm": 2.97161602973938, |
|
"learning_rate": 4.230254350736278e-05, |
|
"loss": 2.5706, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.3734939759036147, |
|
"grad_norm": 2.6856470108032227, |
|
"learning_rate": 4.203480589022758e-05, |
|
"loss": 2.0781, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 2.3775100401606424, |
|
"grad_norm": 2.9654481410980225, |
|
"learning_rate": 4.176706827309237e-05, |
|
"loss": 2.2495, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 2.3815261044176705, |
|
"grad_norm": 2.861020088195801, |
|
"learning_rate": 4.149933065595716e-05, |
|
"loss": 1.9942, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 2.3855421686746987, |
|
"grad_norm": 3.413158893585205, |
|
"learning_rate": 4.123159303882196e-05, |
|
"loss": 2.6585, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 2.389558232931727, |
|
"grad_norm": 3.1313233375549316, |
|
"learning_rate": 4.0963855421686746e-05, |
|
"loss": 2.9493, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.393574297188755, |
|
"grad_norm": 3.325638771057129, |
|
"learning_rate": 4.069611780455154e-05, |
|
"loss": 2.7101, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 2.397590361445783, |
|
"grad_norm": 2.991661787033081, |
|
"learning_rate": 4.042838018741633e-05, |
|
"loss": 2.5683, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 2.4016064257028114, |
|
"grad_norm": 3.0619921684265137, |
|
"learning_rate": 4.0160642570281125e-05, |
|
"loss": 2.5722, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 2.4056224899598395, |
|
"grad_norm": 2.730375289916992, |
|
"learning_rate": 3.989290495314592e-05, |
|
"loss": 2.2107, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 2.4096385542168672, |
|
"grad_norm": 2.5859103202819824, |
|
"learning_rate": 3.962516733601071e-05, |
|
"loss": 2.0576, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.4136546184738954, |
|
"grad_norm": 2.8956499099731445, |
|
"learning_rate": 3.93574297188755e-05, |
|
"loss": 2.1889, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 2.4176706827309236, |
|
"grad_norm": 2.575547933578491, |
|
"learning_rate": 3.9089692101740296e-05, |
|
"loss": 1.9322, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 2.4216867469879517, |
|
"grad_norm": 3.3304378986358643, |
|
"learning_rate": 3.882195448460509e-05, |
|
"loss": 2.4677, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 2.42570281124498, |
|
"grad_norm": 3.5554420948028564, |
|
"learning_rate": 3.855421686746988e-05, |
|
"loss": 2.6703, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 2.429718875502008, |
|
"grad_norm": 3.415844440460205, |
|
"learning_rate": 3.8286479250334675e-05, |
|
"loss": 2.9157, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.433734939759036, |
|
"grad_norm": 3.127218008041382, |
|
"learning_rate": 3.801874163319947e-05, |
|
"loss": 2.416, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 2.4377510040160644, |
|
"grad_norm": 3.796701192855835, |
|
"learning_rate": 3.7751004016064253e-05, |
|
"loss": 2.3505, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 2.4417670682730925, |
|
"grad_norm": 3.6044912338256836, |
|
"learning_rate": 3.748326639892905e-05, |
|
"loss": 2.8561, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 2.4457831325301207, |
|
"grad_norm": 3.2551517486572266, |
|
"learning_rate": 3.7215528781793846e-05, |
|
"loss": 2.5376, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 2.4497991967871484, |
|
"grad_norm": 2.890302896499634, |
|
"learning_rate": 3.694779116465863e-05, |
|
"loss": 2.2256, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.4538152610441766, |
|
"grad_norm": 3.478085517883301, |
|
"learning_rate": 3.668005354752343e-05, |
|
"loss": 2.6602, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 2.4578313253012047, |
|
"grad_norm": 3.682518720626831, |
|
"learning_rate": 3.641231593038822e-05, |
|
"loss": 2.8083, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 2.461847389558233, |
|
"grad_norm": 2.841364860534668, |
|
"learning_rate": 3.614457831325301e-05, |
|
"loss": 2.0827, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 2.465863453815261, |
|
"grad_norm": 2.784315347671509, |
|
"learning_rate": 3.587684069611781e-05, |
|
"loss": 3.9997, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 2.4698795180722892, |
|
"grad_norm": 3.153395652770996, |
|
"learning_rate": 3.5609103078982596e-05, |
|
"loss": 2.3443, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.4738955823293174, |
|
"grad_norm": 3.2817304134368896, |
|
"learning_rate": 3.534136546184739e-05, |
|
"loss": 2.6729, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 2.4779116465863456, |
|
"grad_norm": 2.8291358947753906, |
|
"learning_rate": 3.507362784471218e-05, |
|
"loss": 2.1918, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 2.4819277108433733, |
|
"grad_norm": 3.548492670059204, |
|
"learning_rate": 3.4805890227576974e-05, |
|
"loss": 3.5277, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 2.4859437751004014, |
|
"grad_norm": 9.622389793395996, |
|
"learning_rate": 3.4538152610441774e-05, |
|
"loss": 3.3926, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 2.4899598393574296, |
|
"grad_norm": 3.489105224609375, |
|
"learning_rate": 3.427041499330656e-05, |
|
"loss": 2.5828, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.4939759036144578, |
|
"grad_norm": 2.7694857120513916, |
|
"learning_rate": 3.400267737617135e-05, |
|
"loss": 1.9917, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 2.497991967871486, |
|
"grad_norm": 3.2993392944335938, |
|
"learning_rate": 3.3734939759036146e-05, |
|
"loss": 2.8177, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 2.502008032128514, |
|
"grad_norm": 2.863051176071167, |
|
"learning_rate": 3.346720214190094e-05, |
|
"loss": 2.0999, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 2.5060240963855422, |
|
"grad_norm": 3.025731086730957, |
|
"learning_rate": 3.319946452476573e-05, |
|
"loss": 2.555, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 2.5100401606425704, |
|
"grad_norm": 3.236588716506958, |
|
"learning_rate": 3.2931726907630524e-05, |
|
"loss": 2.3746, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.5140562248995986, |
|
"grad_norm": 3.071715831756592, |
|
"learning_rate": 3.266398929049532e-05, |
|
"loss": 2.1943, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 2.5180722891566267, |
|
"grad_norm": 3.353304147720337, |
|
"learning_rate": 3.23962516733601e-05, |
|
"loss": 3.2267, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 2.522088353413655, |
|
"grad_norm": 2.9166722297668457, |
|
"learning_rate": 3.21285140562249e-05, |
|
"loss": 2.5768, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 2.5261044176706826, |
|
"grad_norm": 2.571737051010132, |
|
"learning_rate": 3.1860776439089695e-05, |
|
"loss": 2.4097, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 2.5301204819277108, |
|
"grad_norm": 3.2051124572753906, |
|
"learning_rate": 3.159303882195448e-05, |
|
"loss": 2.6875, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.534136546184739, |
|
"grad_norm": 3.414586067199707, |
|
"learning_rate": 3.132530120481928e-05, |
|
"loss": 2.467, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 2.538152610441767, |
|
"grad_norm": 3.201895236968994, |
|
"learning_rate": 3.105756358768407e-05, |
|
"loss": 2.6332, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 2.5421686746987953, |
|
"grad_norm": 3.2875518798828125, |
|
"learning_rate": 3.078982597054887e-05, |
|
"loss": 3.0367, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 2.5461847389558234, |
|
"grad_norm": 2.6989524364471436, |
|
"learning_rate": 3.052208835341366e-05, |
|
"loss": 2.1665, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 2.550200803212851, |
|
"grad_norm": 2.7747488021850586, |
|
"learning_rate": 3.025435073627845e-05, |
|
"loss": 2.1499, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.5542168674698793, |
|
"grad_norm": 3.4082605838775635, |
|
"learning_rate": 2.9986613119143242e-05, |
|
"loss": 2.6462, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 2.5582329317269075, |
|
"grad_norm": 2.713757276535034, |
|
"learning_rate": 2.971887550200803e-05, |
|
"loss": 2.09, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 2.5622489959839356, |
|
"grad_norm": 3.2788338661193848, |
|
"learning_rate": 2.9451137884872827e-05, |
|
"loss": 2.3322, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 2.566265060240964, |
|
"grad_norm": 2.6642184257507324, |
|
"learning_rate": 2.918340026773762e-05, |
|
"loss": 2.1751, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 2.570281124497992, |
|
"grad_norm": 3.069793224334717, |
|
"learning_rate": 2.891566265060241e-05, |
|
"loss": 2.2499, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.57429718875502, |
|
"grad_norm": 3.132709503173828, |
|
"learning_rate": 2.8647925033467206e-05, |
|
"loss": 2.585, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 2.5783132530120483, |
|
"grad_norm": 3.27109432220459, |
|
"learning_rate": 2.8380187416331995e-05, |
|
"loss": 2.4458, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 2.5823293172690764, |
|
"grad_norm": 3.5450148582458496, |
|
"learning_rate": 2.8112449799196788e-05, |
|
"loss": 3.8692, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 2.5863453815261046, |
|
"grad_norm": 3.2768943309783936, |
|
"learning_rate": 2.7844712182061584e-05, |
|
"loss": 2.4152, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 2.5903614457831328, |
|
"grad_norm": 3.1916306018829346, |
|
"learning_rate": 2.7576974564926374e-05, |
|
"loss": 2.5376, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.5943775100401605, |
|
"grad_norm": 2.7519237995147705, |
|
"learning_rate": 2.7309236947791167e-05, |
|
"loss": 2.1762, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 2.5983935742971886, |
|
"grad_norm": 3.649415969848633, |
|
"learning_rate": 2.7041499330655956e-05, |
|
"loss": 3.0767, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 2.602409638554217, |
|
"grad_norm": 3.1575088500976562, |
|
"learning_rate": 2.6773761713520752e-05, |
|
"loss": 2.5746, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 2.606425702811245, |
|
"grad_norm": 3.1661970615386963, |
|
"learning_rate": 2.6506024096385545e-05, |
|
"loss": 2.8486, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 2.610441767068273, |
|
"grad_norm": 3.374446392059326, |
|
"learning_rate": 2.6238286479250334e-05, |
|
"loss": 3.0536, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.6144578313253013, |
|
"grad_norm": 3.2961578369140625, |
|
"learning_rate": 2.597054886211513e-05, |
|
"loss": 2.403, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 2.6184738955823295, |
|
"grad_norm": 3.078670024871826, |
|
"learning_rate": 2.570281124497992e-05, |
|
"loss": 2.0923, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 2.622489959839357, |
|
"grad_norm": 3.625155448913574, |
|
"learning_rate": 2.5435073627844713e-05, |
|
"loss": 3.3948, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 2.6265060240963853, |
|
"grad_norm": 3.2434301376342773, |
|
"learning_rate": 2.516733601070951e-05, |
|
"loss": 3.0131, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 2.6305220883534135, |
|
"grad_norm": 3.321974515914917, |
|
"learning_rate": 2.48995983935743e-05, |
|
"loss": 2.5972, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.6345381526104417, |
|
"grad_norm": 2.6846182346343994, |
|
"learning_rate": 2.463186077643909e-05, |
|
"loss": 2.2812, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 2.63855421686747, |
|
"grad_norm": 2.814183235168457, |
|
"learning_rate": 2.4364123159303884e-05, |
|
"loss": 2.1195, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 2.642570281124498, |
|
"grad_norm": 2.640397310256958, |
|
"learning_rate": 2.4096385542168677e-05, |
|
"loss": 2.1728, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 2.646586345381526, |
|
"grad_norm": 3.7056844234466553, |
|
"learning_rate": 2.3828647925033466e-05, |
|
"loss": 2.8224, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 2.6506024096385543, |
|
"grad_norm": 2.740823268890381, |
|
"learning_rate": 2.356091030789826e-05, |
|
"loss": 2.3886, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.6546184738955825, |
|
"grad_norm": 2.689279079437256, |
|
"learning_rate": 2.3293172690763055e-05, |
|
"loss": 2.3151, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 2.6586345381526106, |
|
"grad_norm": 3.4579248428344727, |
|
"learning_rate": 2.3025435073627845e-05, |
|
"loss": 2.7812, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 2.662650602409639, |
|
"grad_norm": 3.293381690979004, |
|
"learning_rate": 2.2757697456492638e-05, |
|
"loss": 2.9381, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 3.3860654830932617, |
|
"learning_rate": 2.248995983935743e-05, |
|
"loss": 2.4111, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 2.6706827309236947, |
|
"grad_norm": 3.3504996299743652, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 2.4411, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.674698795180723, |
|
"grad_norm": 3.2323498725891113, |
|
"learning_rate": 2.1954484605087016e-05, |
|
"loss": 2.6294, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 2.678714859437751, |
|
"grad_norm": 2.935426950454712, |
|
"learning_rate": 2.168674698795181e-05, |
|
"loss": 2.5489, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 2.682730923694779, |
|
"grad_norm": 3.483436346054077, |
|
"learning_rate": 2.1419009370816602e-05, |
|
"loss": 2.7512, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 2.6867469879518073, |
|
"grad_norm": 3.4001944065093994, |
|
"learning_rate": 2.115127175368139e-05, |
|
"loss": 2.4015, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 2.6907630522088355, |
|
"grad_norm": 3.6413683891296387, |
|
"learning_rate": 2.0883534136546184e-05, |
|
"loss": 3.5122, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.694779116465863, |
|
"grad_norm": 2.5411088466644287, |
|
"learning_rate": 2.061579651941098e-05, |
|
"loss": 2.0925, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 2.6987951807228914, |
|
"grad_norm": 3.1367125511169434, |
|
"learning_rate": 2.034805890227577e-05, |
|
"loss": 2.5457, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 2.7028112449799195, |
|
"grad_norm": 3.300114393234253, |
|
"learning_rate": 2.0080321285140562e-05, |
|
"loss": 3.0402, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 2.7068273092369477, |
|
"grad_norm": 2.744513750076294, |
|
"learning_rate": 1.9812583668005355e-05, |
|
"loss": 2.2273, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 2.710843373493976, |
|
"grad_norm": 3.0049889087677, |
|
"learning_rate": 1.9544846050870148e-05, |
|
"loss": 2.4656, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.714859437751004, |
|
"grad_norm": 2.9064860343933105, |
|
"learning_rate": 1.927710843373494e-05, |
|
"loss": 2.3855, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 2.718875502008032, |
|
"grad_norm": 3.317073106765747, |
|
"learning_rate": 1.9009370816599734e-05, |
|
"loss": 2.7036, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 2.7228915662650603, |
|
"grad_norm": 3.580209732055664, |
|
"learning_rate": 1.8741633199464527e-05, |
|
"loss": 2.4416, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 2.7269076305220885, |
|
"grad_norm": 3.0195388793945312, |
|
"learning_rate": 1.8473895582329316e-05, |
|
"loss": 2.0284, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 2.7309236947791167, |
|
"grad_norm": 3.5155584812164307, |
|
"learning_rate": 1.820615796519411e-05, |
|
"loss": 3.6898, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.734939759036145, |
|
"grad_norm": 3.3643851280212402, |
|
"learning_rate": 1.7938420348058905e-05, |
|
"loss": 2.7534, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 2.7389558232931726, |
|
"grad_norm": 3.949350595474243, |
|
"learning_rate": 1.7670682730923694e-05, |
|
"loss": 3.6933, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 2.7429718875502007, |
|
"grad_norm": 2.7811617851257324, |
|
"learning_rate": 1.7402945113788487e-05, |
|
"loss": 2.0857, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 2.746987951807229, |
|
"grad_norm": 3.3071796894073486, |
|
"learning_rate": 1.713520749665328e-05, |
|
"loss": 2.9454, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 2.751004016064257, |
|
"grad_norm": 3.181541919708252, |
|
"learning_rate": 1.6867469879518073e-05, |
|
"loss": 2.4977, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.755020080321285, |
|
"grad_norm": 2.8570432662963867, |
|
"learning_rate": 1.6599732262382866e-05, |
|
"loss": 2.2448, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 2.7590361445783134, |
|
"grad_norm": 2.8519392013549805, |
|
"learning_rate": 1.633199464524766e-05, |
|
"loss": 2.0659, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 2.7630522088353415, |
|
"grad_norm": 3.0057828426361084, |
|
"learning_rate": 1.606425702811245e-05, |
|
"loss": 2.711, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 2.7670682730923692, |
|
"grad_norm": 3.7644693851470947, |
|
"learning_rate": 1.579651941097724e-05, |
|
"loss": 2.7368, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 2.7710843373493974, |
|
"grad_norm": 3.339076519012451, |
|
"learning_rate": 1.5528781793842034e-05, |
|
"loss": 2.4372, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.7751004016064256, |
|
"grad_norm": 3.3303468227386475, |
|
"learning_rate": 1.526104417670683e-05, |
|
"loss": 2.1496, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 2.7791164658634537, |
|
"grad_norm": 3.007516384124756, |
|
"learning_rate": 1.4993306559571621e-05, |
|
"loss": 2.0637, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 2.783132530120482, |
|
"grad_norm": 3.2054901123046875, |
|
"learning_rate": 1.4725568942436414e-05, |
|
"loss": 2.6325, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 2.78714859437751, |
|
"grad_norm": 3.089660882949829, |
|
"learning_rate": 1.4457831325301205e-05, |
|
"loss": 2.6186, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 2.791164658634538, |
|
"grad_norm": 3.6075477600097656, |
|
"learning_rate": 1.4190093708165998e-05, |
|
"loss": 3.04, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.7951807228915664, |
|
"grad_norm": 2.9559810161590576, |
|
"learning_rate": 1.3922356091030792e-05, |
|
"loss": 2.1752, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 2.7991967871485945, |
|
"grad_norm": 3.062072992324829, |
|
"learning_rate": 1.3654618473895583e-05, |
|
"loss": 2.0509, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 2.8032128514056227, |
|
"grad_norm": 4.112563610076904, |
|
"learning_rate": 1.3386880856760376e-05, |
|
"loss": 2.937, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 2.807228915662651, |
|
"grad_norm": 3.2194480895996094, |
|
"learning_rate": 1.3119143239625167e-05, |
|
"loss": 2.2974, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 2.8112449799196786, |
|
"grad_norm": 3.2111270427703857, |
|
"learning_rate": 1.285140562248996e-05, |
|
"loss": 2.3903, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.8152610441767068, |
|
"grad_norm": 3.1619982719421387, |
|
"learning_rate": 1.2583668005354755e-05, |
|
"loss": 2.154, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 2.819277108433735, |
|
"grad_norm": 3.0533196926116943, |
|
"learning_rate": 1.2315930388219546e-05, |
|
"loss": 2.8862, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 2.823293172690763, |
|
"grad_norm": 2.838397264480591, |
|
"learning_rate": 1.2048192771084338e-05, |
|
"loss": 2.1974, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 2.8273092369477912, |
|
"grad_norm": 2.960359573364258, |
|
"learning_rate": 1.178045515394913e-05, |
|
"loss": 2.2714, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 2.8313253012048194, |
|
"grad_norm": 3.3387844562530518, |
|
"learning_rate": 1.1512717536813922e-05, |
|
"loss": 2.5617, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.835341365461847, |
|
"grad_norm": 3.802029609680176, |
|
"learning_rate": 1.1244979919678715e-05, |
|
"loss": 2.6791, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 2.8393574297188753, |
|
"grad_norm": 3.0797119140625, |
|
"learning_rate": 1.0977242302543508e-05, |
|
"loss": 2.008, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 2.8433734939759034, |
|
"grad_norm": 3.6929612159729004, |
|
"learning_rate": 1.0709504685408301e-05, |
|
"loss": 3.0253, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 2.8473895582329316, |
|
"grad_norm": 3.409666061401367, |
|
"learning_rate": 1.0441767068273092e-05, |
|
"loss": 2.488, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 2.8514056224899598, |
|
"grad_norm": 3.4419896602630615, |
|
"learning_rate": 1.0174029451137885e-05, |
|
"loss": 2.5107, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.855421686746988, |
|
"grad_norm": 2.9970462322235107, |
|
"learning_rate": 9.906291834002678e-06, |
|
"loss": 2.4561, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 2.859437751004016, |
|
"grad_norm": 2.9567370414733887, |
|
"learning_rate": 9.63855421686747e-06, |
|
"loss": 2.0972, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 2.8634538152610443, |
|
"grad_norm": 3.134462356567383, |
|
"learning_rate": 9.370816599732263e-06, |
|
"loss": 2.4256, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 2.8674698795180724, |
|
"grad_norm": 3.376096487045288, |
|
"learning_rate": 9.103078982597054e-06, |
|
"loss": 2.221, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 2.8714859437751006, |
|
"grad_norm": 3.569254159927368, |
|
"learning_rate": 8.835341365461847e-06, |
|
"loss": 2.379, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.8755020080321287, |
|
"grad_norm": 3.4028611183166504, |
|
"learning_rate": 8.56760374832664e-06, |
|
"loss": 2.3297, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 2.8795180722891565, |
|
"grad_norm": 3.772540807723999, |
|
"learning_rate": 8.299866131191433e-06, |
|
"loss": 2.9839, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 2.8835341365461846, |
|
"grad_norm": 3.2679340839385986, |
|
"learning_rate": 8.032128514056226e-06, |
|
"loss": 2.3875, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 2.887550200803213, |
|
"grad_norm": 3.6074769496917725, |
|
"learning_rate": 7.764390896921017e-06, |
|
"loss": 2.9021, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 2.891566265060241, |
|
"grad_norm": 3.7479116916656494, |
|
"learning_rate": 7.4966532797858104e-06, |
|
"loss": 2.5803, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.895582329317269, |
|
"grad_norm": 3.051452875137329, |
|
"learning_rate": 7.228915662650602e-06, |
|
"loss": 2.9504, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 2.8995983935742973, |
|
"grad_norm": 3.341724157333374, |
|
"learning_rate": 6.961178045515396e-06, |
|
"loss": 2.8643, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 2.9036144578313254, |
|
"grad_norm": 2.8065922260284424, |
|
"learning_rate": 6.693440428380188e-06, |
|
"loss": 2.6456, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 2.907630522088353, |
|
"grad_norm": 3.295828342437744, |
|
"learning_rate": 6.42570281124498e-06, |
|
"loss": 3.2691, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 2.9116465863453813, |
|
"grad_norm": 3.15494966506958, |
|
"learning_rate": 6.157965194109773e-06, |
|
"loss": 2.3256, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.9156626506024095, |
|
"grad_norm": 3.146188259124756, |
|
"learning_rate": 5.890227576974565e-06, |
|
"loss": 2.5247, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 2.9196787148594376, |
|
"grad_norm": 3.042181968688965, |
|
"learning_rate": 5.622489959839358e-06, |
|
"loss": 2.3458, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 2.923694779116466, |
|
"grad_norm": 2.8072509765625, |
|
"learning_rate": 5.3547523427041504e-06, |
|
"loss": 2.2129, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 2.927710843373494, |
|
"grad_norm": 3.1902520656585693, |
|
"learning_rate": 5.087014725568942e-06, |
|
"loss": 2.1905, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 2.931726907630522, |
|
"grad_norm": 3.706218719482422, |
|
"learning_rate": 4.819277108433735e-06, |
|
"loss": 2.8587, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.9357429718875503, |
|
"grad_norm": 3.516908645629883, |
|
"learning_rate": 4.551539491298527e-06, |
|
"loss": 3.0003, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 2.9397590361445785, |
|
"grad_norm": 3.9051806926727295, |
|
"learning_rate": 4.28380187416332e-06, |
|
"loss": 2.6986, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 2.9437751004016066, |
|
"grad_norm": 2.434493064880371, |
|
"learning_rate": 4.016064257028113e-06, |
|
"loss": 2.0143, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 2.9477911646586348, |
|
"grad_norm": 3.514988899230957, |
|
"learning_rate": 3.7483266398929052e-06, |
|
"loss": 2.5539, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 2.9518072289156625, |
|
"grad_norm": 3.145475387573242, |
|
"learning_rate": 3.480589022757698e-06, |
|
"loss": 2.3991, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.9558232931726907, |
|
"grad_norm": 3.0328280925750732, |
|
"learning_rate": 3.21285140562249e-06, |
|
"loss": 2.4384, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 2.959839357429719, |
|
"grad_norm": 3.584406614303589, |
|
"learning_rate": 2.9451137884872824e-06, |
|
"loss": 2.219, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 2.963855421686747, |
|
"grad_norm": 2.8902695178985596, |
|
"learning_rate": 2.6773761713520752e-06, |
|
"loss": 2.0701, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 2.967871485943775, |
|
"grad_norm": 2.714848518371582, |
|
"learning_rate": 2.4096385542168676e-06, |
|
"loss": 2.3578, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 2.9718875502008033, |
|
"grad_norm": 3.4589223861694336, |
|
"learning_rate": 2.14190093708166e-06, |
|
"loss": 2.4076, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.9759036144578315, |
|
"grad_norm": 2.8250577449798584, |
|
"learning_rate": 1.8741633199464526e-06, |
|
"loss": 2.2688, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 2.979919678714859, |
|
"grad_norm": 3.090301752090454, |
|
"learning_rate": 1.606425702811245e-06, |
|
"loss": 2.0527, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 2.9839357429718874, |
|
"grad_norm": 3.82488751411438, |
|
"learning_rate": 1.3386880856760376e-06, |
|
"loss": 2.9784, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 2.9879518072289155, |
|
"grad_norm": 3.046949863433838, |
|
"learning_rate": 1.07095046854083e-06, |
|
"loss": 2.987, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 2.9919678714859437, |
|
"grad_norm": 3.08667254447937, |
|
"learning_rate": 8.032128514056225e-07, |
|
"loss": 2.3121, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.995983935742972, |
|
"grad_norm": 3.114004611968994, |
|
"learning_rate": 5.35475234270415e-07, |
|
"loss": 2.4549, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 3.1294381618499756, |
|
"learning_rate": 2.677376171352075e-07, |
|
"loss": 2.2527, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.8732815980911255, |
|
"eval_runtime": 201.6297, |
|
"eval_samples_per_second": 2.475, |
|
"eval_steps_per_second": 1.24, |
|
"step": 747 |
|
} |
|
],
"logging_steps": 1,
"max_steps": 747,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.0605631120002253e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}