{ "best_metric": 1.5775456428527832, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.0299895036737142, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000149947518368571, "grad_norm": 0.5957685112953186, "learning_rate": 1.0140000000000001e-05, "loss": 1.6728, "step": 1 }, { "epoch": 0.000149947518368571, "eval_loss": 2.0381391048431396, "eval_runtime": 458.7264, "eval_samples_per_second": 6.121, "eval_steps_per_second": 1.53, "step": 1 }, { "epoch": 0.000299895036737142, "grad_norm": 0.7904772162437439, "learning_rate": 2.0280000000000002e-05, "loss": 1.6175, "step": 2 }, { "epoch": 0.000449842555105713, "grad_norm": 1.0926629304885864, "learning_rate": 3.0419999999999997e-05, "loss": 2.0248, "step": 3 }, { "epoch": 0.000599790073474284, "grad_norm": 1.0717623233795166, "learning_rate": 4.0560000000000005e-05, "loss": 1.9361, "step": 4 }, { "epoch": 0.000749737591842855, "grad_norm": 1.117130160331726, "learning_rate": 5.07e-05, "loss": 1.6989, "step": 5 }, { "epoch": 0.000899685110211426, "grad_norm": 0.9936268925666809, "learning_rate": 6.0839999999999993e-05, "loss": 1.8964, "step": 6 }, { "epoch": 0.001049632628579997, "grad_norm": 1.7469357252120972, "learning_rate": 7.097999999999999e-05, "loss": 2.1769, "step": 7 }, { "epoch": 0.001199580146948568, "grad_norm": 1.090126395225525, "learning_rate": 8.112000000000001e-05, "loss": 2.0254, "step": 8 }, { "epoch": 0.001349527665317139, "grad_norm": 0.9377706050872803, "learning_rate": 9.126e-05, "loss": 2.1493, "step": 9 }, { "epoch": 0.00149947518368571, "grad_norm": 1.1927528381347656, "learning_rate": 0.0001014, "loss": 1.9772, "step": 10 }, { "epoch": 0.001649422702054281, "grad_norm": 0.9619717001914978, "learning_rate": 0.00010086631578947368, "loss": 1.9051, "step": 11 }, { "epoch": 0.001799370220422852, "grad_norm": 1.0560603141784668, "learning_rate": 0.00010033263157894736, "loss": 1.9418, "step": 12 }, { "epoch": 0.001949317738791423, "grad_norm": 1.11575448513031, "learning_rate": 9.979894736842105e-05, "loss": 2.0762, "step": 13 }, { "epoch": 0.002099265257159994, "grad_norm": 1.11614191532135, "learning_rate": 9.926526315789475e-05, "loss": 1.9284, "step": 14 }, { "epoch": 0.002249212775528565, "grad_norm": 1.1195893287658691, "learning_rate": 9.873157894736843e-05, "loss": 2.0118, "step": 15 }, { "epoch": 0.002399160293897136, "grad_norm": 1.0731513500213623, "learning_rate": 9.81978947368421e-05, "loss": 2.0423, "step": 16 }, { "epoch": 0.002549107812265707, "grad_norm": 0.9746111631393433, "learning_rate": 9.766421052631579e-05, "loss": 1.439, "step": 17 }, { "epoch": 0.002699055330634278, "grad_norm": 1.2084391117095947, "learning_rate": 9.713052631578947e-05, "loss": 2.1063, "step": 18 }, { "epoch": 0.002849002849002849, "grad_norm": 1.1505322456359863, "learning_rate": 9.659684210526315e-05, "loss": 1.7444, "step": 19 }, { "epoch": 0.00299895036737142, "grad_norm": 1.201917052268982, "learning_rate": 9.606315789473684e-05, "loss": 2.0139, "step": 20 }, { "epoch": 0.003148897885739991, "grad_norm": 1.3050473928451538, "learning_rate": 9.552947368421053e-05, "loss": 2.1195, "step": 21 }, { "epoch": 0.003298845404108562, "grad_norm": 1.3555203676223755, "learning_rate": 9.499578947368422e-05, "loss": 1.9949, "step": 22 }, { "epoch": 0.003448792922477133, "grad_norm": 1.3348125219345093, "learning_rate": 9.44621052631579e-05, "loss": 1.8841, "step": 23 }, { "epoch": 0.003598740440845704, "grad_norm": 1.2195931673049927, "learning_rate": 9.392842105263158e-05, "loss": 1.8362, "step": 24 }, { "epoch": 0.003748687959214275, "grad_norm": 1.5519320964813232, "learning_rate": 9.339473684210526e-05, "loss": 1.9978, "step": 25 }, { "epoch": 0.003898635477582846, "grad_norm": 1.208939790725708, "learning_rate": 9.286105263157894e-05, "loss": 1.8797, "step": 26 }, { "epoch": 0.004048582995951417, "grad_norm": 1.225362777709961, "learning_rate": 9.232736842105263e-05, "loss": 2.0768, "step": 27 }, { "epoch": 0.004198530514319988, "grad_norm": 1.3108456134796143, "learning_rate": 9.179368421052632e-05, "loss": 2.0253, "step": 28 }, { "epoch": 0.004348478032688559, "grad_norm": 1.2646656036376953, "learning_rate": 9.126e-05, "loss": 1.7492, "step": 29 }, { "epoch": 0.00449842555105713, "grad_norm": 1.117287278175354, "learning_rate": 9.072631578947368e-05, "loss": 1.588, "step": 30 }, { "epoch": 0.004648373069425701, "grad_norm": 0.9503945708274841, "learning_rate": 9.019263157894736e-05, "loss": 1.4423, "step": 31 }, { "epoch": 0.004798320587794272, "grad_norm": 0.8445770740509033, "learning_rate": 8.965894736842104e-05, "loss": 1.512, "step": 32 }, { "epoch": 0.004948268106162843, "grad_norm": 0.9999889731407166, "learning_rate": 8.912526315789472e-05, "loss": 1.5027, "step": 33 }, { "epoch": 0.005098215624531414, "grad_norm": 0.9482656717300415, "learning_rate": 8.859157894736842e-05, "loss": 1.3396, "step": 34 }, { "epoch": 0.005248163142899985, "grad_norm": 1.024889588356018, "learning_rate": 8.805789473684211e-05, "loss": 1.6581, "step": 35 }, { "epoch": 0.005398110661268556, "grad_norm": 0.9717214703559875, "learning_rate": 8.752421052631579e-05, "loss": 1.5622, "step": 36 }, { "epoch": 0.005548058179637127, "grad_norm": 1.6440047025680542, "learning_rate": 8.699052631578947e-05, "loss": 1.7143, "step": 37 }, { "epoch": 0.005698005698005698, "grad_norm": 0.862278938293457, "learning_rate": 8.645684210526315e-05, "loss": 1.478, "step": 38 }, { "epoch": 0.005847953216374269, "grad_norm": 0.9841846227645874, "learning_rate": 8.592315789473683e-05, "loss": 1.4562, "step": 39 }, { "epoch": 0.00599790073474284, "grad_norm": 1.1571288108825684, "learning_rate": 8.538947368421051e-05, "loss": 1.7002, "step": 40 }, { "epoch": 0.006147848253111411, "grad_norm": 0.8437686562538147, "learning_rate": 8.485578947368421e-05, "loss": 1.2931, "step": 41 }, { "epoch": 0.006297795771479982, "grad_norm": 1.0105962753295898, "learning_rate": 8.43221052631579e-05, "loss": 1.6359, "step": 42 }, { "epoch": 0.006447743289848553, "grad_norm": 0.9077669382095337, "learning_rate": 8.378842105263158e-05, "loss": 1.7619, "step": 43 }, { "epoch": 0.006597690808217124, "grad_norm": 1.0444610118865967, "learning_rate": 8.325473684210526e-05, "loss": 1.693, "step": 44 }, { "epoch": 0.006747638326585695, "grad_norm": 1.0538818836212158, "learning_rate": 8.272105263157894e-05, "loss": 1.7734, "step": 45 }, { "epoch": 0.006897585844954266, "grad_norm": 1.0589853525161743, "learning_rate": 8.218736842105262e-05, "loss": 1.6743, "step": 46 }, { "epoch": 0.007047533363322837, "grad_norm": 0.890788197517395, "learning_rate": 8.165368421052632e-05, "loss": 1.4131, "step": 47 }, { "epoch": 0.007197480881691408, "grad_norm": 0.9080605506896973, "learning_rate": 8.112000000000001e-05, "loss": 1.557, "step": 48 }, { "epoch": 0.007347428400059979, "grad_norm": 0.9284332394599915, "learning_rate": 8.058631578947369e-05, "loss": 1.6069, "step": 49 }, { "epoch": 0.00749737591842855, "grad_norm": 0.9680672883987427, "learning_rate": 8.005263157894737e-05, "loss": 1.4316, "step": 50 }, { "epoch": 0.00749737591842855, "eval_loss": 1.7056361436843872, "eval_runtime": 458.97, "eval_samples_per_second": 6.118, "eval_steps_per_second": 1.53, "step": 50 }, { "epoch": 0.007647323436797121, "grad_norm": 0.6464632153511047, "learning_rate": 7.951894736842105e-05, "loss": 1.8528, "step": 51 }, { "epoch": 0.007797270955165692, "grad_norm": 0.7035087943077087, "learning_rate": 7.898526315789473e-05, "loss": 1.6005, "step": 52 }, { "epoch": 0.007947218473534263, "grad_norm": 0.7116038799285889, "learning_rate": 7.845157894736841e-05, "loss": 1.6531, "step": 53 }, { "epoch": 0.008097165991902834, "grad_norm": 0.7368800044059753, "learning_rate": 7.79178947368421e-05, "loss": 1.6262, "step": 54 }, { "epoch": 0.008247113510271405, "grad_norm": 0.7702679634094238, "learning_rate": 7.73842105263158e-05, "loss": 1.8364, "step": 55 }, { "epoch": 0.008397061028639977, "grad_norm": 0.7955710887908936, "learning_rate": 7.685052631578948e-05, "loss": 1.5435, "step": 56 }, { "epoch": 0.008547008547008548, "grad_norm": 0.7569422125816345, "learning_rate": 7.631684210526316e-05, "loss": 1.7112, "step": 57 }, { "epoch": 0.008696956065377117, "grad_norm": 0.7948057055473328, "learning_rate": 7.578315789473684e-05, "loss": 1.783, "step": 58 }, { "epoch": 0.008846903583745689, "grad_norm": 0.8909001350402832, "learning_rate": 7.524947368421052e-05, "loss": 1.7872, "step": 59 }, { "epoch": 0.00899685110211426, "grad_norm": 0.8286412358283997, "learning_rate": 7.47157894736842e-05, "loss": 1.5437, "step": 60 }, { "epoch": 0.009146798620482831, "grad_norm": 0.8057852983474731, "learning_rate": 7.418210526315789e-05, "loss": 1.6492, "step": 61 }, { "epoch": 0.009296746138851402, "grad_norm": 0.8314459919929504, "learning_rate": 7.364842105263159e-05, "loss": 1.811, "step": 62 }, { "epoch": 0.009446693657219974, "grad_norm": 0.9089363217353821, "learning_rate": 7.311473684210527e-05, "loss": 1.8301, "step": 63 }, { "epoch": 0.009596641175588543, "grad_norm": 0.9501635432243347, "learning_rate": 7.258105263157895e-05, "loss": 1.7909, "step": 64 }, { "epoch": 0.009746588693957114, "grad_norm": 0.8389644026756287, "learning_rate": 7.204736842105263e-05, "loss": 1.643, "step": 65 }, { "epoch": 0.009896536212325686, "grad_norm": 0.9620031118392944, "learning_rate": 7.151368421052631e-05, "loss": 2.0201, "step": 66 }, { "epoch": 0.010046483730694257, "grad_norm": 0.9018654227256775, "learning_rate": 7.097999999999999e-05, "loss": 1.7294, "step": 67 }, { "epoch": 0.010196431249062828, "grad_norm": 0.9742156267166138, "learning_rate": 7.044631578947368e-05, "loss": 1.8999, "step": 68 }, { "epoch": 0.0103463787674314, "grad_norm": 0.9514942169189453, "learning_rate": 6.991263157894738e-05, "loss": 1.7759, "step": 69 }, { "epoch": 0.01049632628579997, "grad_norm": 1.0679067373275757, "learning_rate": 6.937894736842106e-05, "loss": 1.9704, "step": 70 }, { "epoch": 0.01064627380416854, "grad_norm": 1.042546272277832, "learning_rate": 6.884526315789474e-05, "loss": 2.0272, "step": 71 }, { "epoch": 0.010796221322537112, "grad_norm": 1.0005282163619995, "learning_rate": 6.831157894736842e-05, "loss": 1.9547, "step": 72 }, { "epoch": 0.010946168840905683, "grad_norm": 0.9244095087051392, "learning_rate": 6.77778947368421e-05, "loss": 1.7477, "step": 73 }, { "epoch": 0.011096116359274254, "grad_norm": 1.0138779878616333, "learning_rate": 6.724421052631579e-05, "loss": 1.9458, "step": 74 }, { "epoch": 0.011246063877642825, "grad_norm": 1.0990403890609741, "learning_rate": 6.671052631578948e-05, "loss": 1.7663, "step": 75 }, { "epoch": 0.011396011396011397, "grad_norm": 0.9864200949668884, "learning_rate": 6.617684210526316e-05, "loss": 1.6638, "step": 76 }, { "epoch": 0.011545958914379968, "grad_norm": 1.0967020988464355, "learning_rate": 6.564315789473684e-05, "loss": 2.0666, "step": 77 }, { "epoch": 0.011695906432748537, "grad_norm": 1.037764549255371, "learning_rate": 6.510947368421052e-05, "loss": 1.8525, "step": 78 }, { "epoch": 0.011845853951117109, "grad_norm": 1.0943762063980103, "learning_rate": 6.45757894736842e-05, "loss": 1.7367, "step": 79 }, { "epoch": 0.01199580146948568, "grad_norm": 1.0915956497192383, "learning_rate": 6.404210526315789e-05, "loss": 1.8947, "step": 80 }, { "epoch": 0.012145748987854251, "grad_norm": 1.2832694053649902, "learning_rate": 6.350842105263158e-05, "loss": 1.9212, "step": 81 }, { "epoch": 0.012295696506222822, "grad_norm": 1.195035457611084, "learning_rate": 6.297473684210527e-05, "loss": 1.7512, "step": 82 }, { "epoch": 0.012445644024591394, "grad_norm": 0.9657008051872253, "learning_rate": 6.244105263157895e-05, "loss": 1.7012, "step": 83 }, { "epoch": 0.012595591542959963, "grad_norm": 1.1304044723510742, "learning_rate": 6.190736842105263e-05, "loss": 1.7072, "step": 84 }, { "epoch": 0.012745539061328534, "grad_norm": 0.9512737393379211, "learning_rate": 6.137368421052631e-05, "loss": 1.3972, "step": 85 }, { "epoch": 0.012895486579697106, "grad_norm": 0.9143646359443665, "learning_rate": 6.0839999999999993e-05, "loss": 1.7567, "step": 86 }, { "epoch": 0.013045434098065677, "grad_norm": 0.7937706112861633, "learning_rate": 6.030631578947368e-05, "loss": 1.3784, "step": 87 }, { "epoch": 0.013195381616434248, "grad_norm": 1.0114094018936157, "learning_rate": 5.977263157894736e-05, "loss": 1.6025, "step": 88 }, { "epoch": 0.01334532913480282, "grad_norm": 0.8791283965110779, "learning_rate": 5.9238947368421054e-05, "loss": 1.5596, "step": 89 }, { "epoch": 0.01349527665317139, "grad_norm": 1.085236668586731, "learning_rate": 5.870526315789474e-05, "loss": 1.7252, "step": 90 }, { "epoch": 0.01364522417153996, "grad_norm": 0.82220059633255, "learning_rate": 5.817157894736842e-05, "loss": 1.2668, "step": 91 }, { "epoch": 0.013795171689908532, "grad_norm": 0.8292524218559265, "learning_rate": 5.76378947368421e-05, "loss": 1.6119, "step": 92 }, { "epoch": 0.013945119208277103, "grad_norm": 0.8886623978614807, "learning_rate": 5.710421052631579e-05, "loss": 1.4698, "step": 93 }, { "epoch": 0.014095066726645674, "grad_norm": 0.8936247825622559, "learning_rate": 5.657052631578947e-05, "loss": 1.3889, "step": 94 }, { "epoch": 0.014245014245014245, "grad_norm": 0.8380143046379089, "learning_rate": 5.603684210526316e-05, "loss": 1.3708, "step": 95 }, { "epoch": 0.014394961763382817, "grad_norm": 0.9443024396896362, "learning_rate": 5.550315789473684e-05, "loss": 1.481, "step": 96 }, { "epoch": 0.014544909281751388, "grad_norm": 1.0896035432815552, "learning_rate": 5.496947368421053e-05, "loss": 1.6766, "step": 97 }, { "epoch": 0.014694856800119957, "grad_norm": 0.975226104259491, "learning_rate": 5.443578947368421e-05, "loss": 1.3945, "step": 98 }, { "epoch": 0.014844804318488529, "grad_norm": 0.8843653798103333, "learning_rate": 5.390210526315789e-05, "loss": 1.1653, "step": 99 }, { "epoch": 0.0149947518368571, "grad_norm": 1.046746015548706, "learning_rate": 5.336842105263158e-05, "loss": 1.5679, "step": 100 }, { "epoch": 0.0149947518368571, "eval_loss": 1.6402101516723633, "eval_runtime": 460.1008, "eval_samples_per_second": 6.103, "eval_steps_per_second": 1.526, "step": 100 }, { "epoch": 0.015144699355225671, "grad_norm": 0.7466453909873962, "learning_rate": 5.283473684210526e-05, "loss": 1.6413, "step": 101 }, { "epoch": 0.015294646873594242, "grad_norm": 0.7480393648147583, "learning_rate": 5.230105263157895e-05, "loss": 1.5599, "step": 102 }, { "epoch": 0.015444594391962814, "grad_norm": 0.7098796367645264, "learning_rate": 5.176736842105263e-05, "loss": 1.3843, "step": 103 }, { "epoch": 0.015594541910331383, "grad_norm": 0.7601646780967712, "learning_rate": 5.123368421052632e-05, "loss": 1.5504, "step": 104 }, { "epoch": 0.015744489428699954, "grad_norm": 0.7385756373405457, "learning_rate": 5.07e-05, "loss": 1.7734, "step": 105 }, { "epoch": 0.015894436947068526, "grad_norm": 0.7589382529258728, "learning_rate": 5.016631578947368e-05, "loss": 1.4962, "step": 106 }, { "epoch": 0.016044384465437097, "grad_norm": 0.8866682052612305, "learning_rate": 4.963263157894737e-05, "loss": 2.0286, "step": 107 }, { "epoch": 0.016194331983805668, "grad_norm": 0.746847927570343, "learning_rate": 4.909894736842105e-05, "loss": 1.5542, "step": 108 }, { "epoch": 0.01634427950217424, "grad_norm": 0.8657152056694031, "learning_rate": 4.8565263157894734e-05, "loss": 1.7472, "step": 109 }, { "epoch": 0.01649422702054281, "grad_norm": 0.8354389667510986, "learning_rate": 4.803157894736842e-05, "loss": 1.7904, "step": 110 }, { "epoch": 0.016644174538911382, "grad_norm": 0.9242944717407227, "learning_rate": 4.749789473684211e-05, "loss": 1.8346, "step": 111 }, { "epoch": 0.016794122057279953, "grad_norm": 0.8263792991638184, "learning_rate": 4.696421052631579e-05, "loss": 1.7405, "step": 112 }, { "epoch": 0.016944069575648524, "grad_norm": 0.8106954097747803, "learning_rate": 4.643052631578947e-05, "loss": 1.538, "step": 113 }, { "epoch": 0.017094017094017096, "grad_norm": 0.8450605869293213, "learning_rate": 4.589684210526316e-05, "loss": 1.669, "step": 114 }, { "epoch": 0.017243964612385664, "grad_norm": 0.8268195390701294, "learning_rate": 4.536315789473684e-05, "loss": 1.6906, "step": 115 }, { "epoch": 0.017393912130754235, "grad_norm": 0.9907267689704895, "learning_rate": 4.482947368421052e-05, "loss": 1.7395, "step": 116 }, { "epoch": 0.017543859649122806, "grad_norm": 0.9038468599319458, "learning_rate": 4.429578947368421e-05, "loss": 1.6599, "step": 117 }, { "epoch": 0.017693807167491377, "grad_norm": 0.9513837099075317, "learning_rate": 4.3762105263157896e-05, "loss": 1.9379, "step": 118 }, { "epoch": 0.01784375468585995, "grad_norm": 0.8293036818504333, "learning_rate": 4.3228421052631576e-05, "loss": 1.4927, "step": 119 }, { "epoch": 0.01799370220422852, "grad_norm": 0.8836062550544739, "learning_rate": 4.269473684210526e-05, "loss": 1.551, "step": 120 }, { "epoch": 0.01814364972259709, "grad_norm": 0.9525675177574158, "learning_rate": 4.216105263157895e-05, "loss": 1.7755, "step": 121 }, { "epoch": 0.018293597240965662, "grad_norm": 0.9899429678916931, "learning_rate": 4.162736842105263e-05, "loss": 1.6903, "step": 122 }, { "epoch": 0.018443544759334234, "grad_norm": 1.071230173110962, "learning_rate": 4.109368421052631e-05, "loss": 1.6678, "step": 123 }, { "epoch": 0.018593492277702805, "grad_norm": 1.0576627254486084, "learning_rate": 4.0560000000000005e-05, "loss": 1.8232, "step": 124 }, { "epoch": 0.018743439796071376, "grad_norm": 1.0281530618667603, "learning_rate": 4.0026315789473685e-05, "loss": 1.7751, "step": 125 }, { "epoch": 0.018893387314439947, "grad_norm": 1.1899738311767578, "learning_rate": 3.9492631578947365e-05, "loss": 1.8125, "step": 126 }, { "epoch": 0.01904333483280852, "grad_norm": 1.1237711906433105, "learning_rate": 3.895894736842105e-05, "loss": 1.9508, "step": 127 }, { "epoch": 0.019193282351177086, "grad_norm": 1.0853641033172607, "learning_rate": 3.842526315789474e-05, "loss": 1.551, "step": 128 }, { "epoch": 0.019343229869545658, "grad_norm": 1.093361258506775, "learning_rate": 3.789157894736842e-05, "loss": 1.8046, "step": 129 }, { "epoch": 0.01949317738791423, "grad_norm": 1.4327589273452759, "learning_rate": 3.73578947368421e-05, "loss": 1.9274, "step": 130 }, { "epoch": 0.0196431249062828, "grad_norm": 0.9172009825706482, "learning_rate": 3.682421052631579e-05, "loss": 1.4593, "step": 131 }, { "epoch": 0.01979307242465137, "grad_norm": 1.1487863063812256, "learning_rate": 3.6290526315789474e-05, "loss": 1.5119, "step": 132 }, { "epoch": 0.019943019943019943, "grad_norm": 0.8225706219673157, "learning_rate": 3.5756842105263154e-05, "loss": 1.4499, "step": 133 }, { "epoch": 0.020092967461388514, "grad_norm": 0.8087728023529053, "learning_rate": 3.522315789473684e-05, "loss": 1.2635, "step": 134 }, { "epoch": 0.020242914979757085, "grad_norm": 1.0588133335113525, "learning_rate": 3.468947368421053e-05, "loss": 1.5456, "step": 135 }, { "epoch": 0.020392862498125656, "grad_norm": 0.9803886413574219, "learning_rate": 3.415578947368421e-05, "loss": 1.7034, "step": 136 }, { "epoch": 0.020542810016494228, "grad_norm": 1.0371334552764893, "learning_rate": 3.3622105263157895e-05, "loss": 1.3559, "step": 137 }, { "epoch": 0.0206927575348628, "grad_norm": 0.9132927656173706, "learning_rate": 3.308842105263158e-05, "loss": 1.4233, "step": 138 }, { "epoch": 0.02084270505323137, "grad_norm": 0.9797881841659546, "learning_rate": 3.255473684210526e-05, "loss": 1.4863, "step": 139 }, { "epoch": 0.02099265257159994, "grad_norm": 0.9240613579750061, "learning_rate": 3.202105263157894e-05, "loss": 1.4756, "step": 140 }, { "epoch": 0.02114260008996851, "grad_norm": 0.9621275663375854, "learning_rate": 3.1487368421052636e-05, "loss": 1.5349, "step": 141 }, { "epoch": 0.02129254760833708, "grad_norm": 0.9655196666717529, "learning_rate": 3.0953684210526317e-05, "loss": 1.503, "step": 142 }, { "epoch": 0.021442495126705652, "grad_norm": 0.9339595437049866, "learning_rate": 3.0419999999999997e-05, "loss": 1.2824, "step": 143 }, { "epoch": 0.021592442645074223, "grad_norm": 1.0316590070724487, "learning_rate": 2.988631578947368e-05, "loss": 1.484, "step": 144 }, { "epoch": 0.021742390163442794, "grad_norm": 0.9773208498954773, "learning_rate": 2.935263157894737e-05, "loss": 1.2475, "step": 145 }, { "epoch": 0.021892337681811366, "grad_norm": 0.9496320486068726, "learning_rate": 2.881894736842105e-05, "loss": 1.3519, "step": 146 }, { "epoch": 0.022042285200179937, "grad_norm": 0.9151192903518677, "learning_rate": 2.8285263157894735e-05, "loss": 1.3695, "step": 147 }, { "epoch": 0.022192232718548508, "grad_norm": 0.8950878977775574, "learning_rate": 2.775157894736842e-05, "loss": 1.3184, "step": 148 }, { "epoch": 0.02234218023691708, "grad_norm": 0.97145015001297, "learning_rate": 2.7217894736842105e-05, "loss": 1.4416, "step": 149 }, { "epoch": 0.02249212775528565, "grad_norm": 1.0586730241775513, "learning_rate": 2.668421052631579e-05, "loss": 1.2799, "step": 150 }, { "epoch": 0.02249212775528565, "eval_loss": 1.5976461172103882, "eval_runtime": 459.7065, "eval_samples_per_second": 6.108, "eval_steps_per_second": 1.527, "step": 150 }, { "epoch": 0.022642075273654222, "grad_norm": 0.7638190388679504, "learning_rate": 2.6150526315789476e-05, "loss": 1.4851, "step": 151 }, { "epoch": 0.022792022792022793, "grad_norm": 0.7945873141288757, "learning_rate": 2.561684210526316e-05, "loss": 1.4862, "step": 152 }, { "epoch": 0.022941970310391364, "grad_norm": 0.8045321702957153, "learning_rate": 2.508315789473684e-05, "loss": 1.6359, "step": 153 }, { "epoch": 0.023091917828759936, "grad_norm": 0.7538973093032837, "learning_rate": 2.4549473684210527e-05, "loss": 1.5668, "step": 154 }, { "epoch": 0.023241865347128503, "grad_norm": 0.8441860675811768, "learning_rate": 2.401578947368421e-05, "loss": 1.8056, "step": 155 }, { "epoch": 0.023391812865497075, "grad_norm": 0.7490977048873901, "learning_rate": 2.3482105263157894e-05, "loss": 1.4704, "step": 156 }, { "epoch": 0.023541760383865646, "grad_norm": 0.8391216397285461, "learning_rate": 2.294842105263158e-05, "loss": 1.571, "step": 157 }, { "epoch": 0.023691707902234217, "grad_norm": 0.9428722262382507, "learning_rate": 2.241473684210526e-05, "loss": 1.7832, "step": 158 }, { "epoch": 0.02384165542060279, "grad_norm": 0.8295037150382996, "learning_rate": 2.1881052631578948e-05, "loss": 1.632, "step": 159 }, { "epoch": 0.02399160293897136, "grad_norm": 0.8535486459732056, "learning_rate": 2.134736842105263e-05, "loss": 1.598, "step": 160 }, { "epoch": 0.02414155045733993, "grad_norm": 0.8622595071792603, "learning_rate": 2.0813684210526315e-05, "loss": 1.4939, "step": 161 }, { "epoch": 0.024291497975708502, "grad_norm": 0.9161174297332764, "learning_rate": 2.0280000000000002e-05, "loss": 1.507, "step": 162 }, { "epoch": 0.024441445494077074, "grad_norm": 1.2824655771255493, "learning_rate": 1.9746315789473683e-05, "loss": 1.9943, "step": 163 }, { "epoch": 0.024591393012445645, "grad_norm": 1.068787932395935, "learning_rate": 1.921263157894737e-05, "loss": 1.6919, "step": 164 }, { "epoch": 0.024741340530814216, "grad_norm": 1.0220016241073608, "learning_rate": 1.867894736842105e-05, "loss": 1.7503, "step": 165 }, { "epoch": 0.024891288049182787, "grad_norm": 0.9780786037445068, "learning_rate": 1.8145263157894737e-05, "loss": 1.7564, "step": 166 }, { "epoch": 0.02504123556755136, "grad_norm": 0.9884896278381348, "learning_rate": 1.761157894736842e-05, "loss": 1.7081, "step": 167 }, { "epoch": 0.025191183085919926, "grad_norm": 0.9883540272712708, "learning_rate": 1.7077894736842104e-05, "loss": 1.8178, "step": 168 }, { "epoch": 0.025341130604288498, "grad_norm": 1.0421220064163208, "learning_rate": 1.654421052631579e-05, "loss": 1.6915, "step": 169 }, { "epoch": 0.02549107812265707, "grad_norm": 0.9626386761665344, "learning_rate": 1.601052631578947e-05, "loss": 1.4605, "step": 170 }, { "epoch": 0.02564102564102564, "grad_norm": 1.0436559915542603, "learning_rate": 1.5476842105263158e-05, "loss": 1.6141, "step": 171 }, { "epoch": 0.02579097315939421, "grad_norm": 1.0449950695037842, "learning_rate": 1.494315789473684e-05, "loss": 1.6414, "step": 172 }, { "epoch": 0.025940920677762783, "grad_norm": 1.166788935661316, "learning_rate": 1.4409473684210525e-05, "loss": 1.965, "step": 173 }, { "epoch": 0.026090868196131354, "grad_norm": 0.9662919640541077, "learning_rate": 1.387578947368421e-05, "loss": 1.4862, "step": 174 }, { "epoch": 0.026240815714499925, "grad_norm": 0.998842716217041, "learning_rate": 1.3342105263157894e-05, "loss": 1.6907, "step": 175 }, { "epoch": 0.026390763232868496, "grad_norm": 1.0697990655899048, "learning_rate": 1.280842105263158e-05, "loss": 1.7907, "step": 176 }, { "epoch": 0.026540710751237068, "grad_norm": 1.1204060316085815, "learning_rate": 1.2274736842105263e-05, "loss": 1.7333, "step": 177 }, { "epoch": 0.02669065826960564, "grad_norm": 1.2920093536376953, "learning_rate": 1.1741052631578947e-05, "loss": 1.9552, "step": 178 }, { "epoch": 0.02684060578797421, "grad_norm": 0.9266936182975769, "learning_rate": 1.120736842105263e-05, "loss": 1.3865, "step": 179 }, { "epoch": 0.02699055330634278, "grad_norm": 1.2769348621368408, "learning_rate": 1.0673684210526314e-05, "loss": 1.7049, "step": 180 }, { "epoch": 0.027140500824711353, "grad_norm": 0.9029727578163147, "learning_rate": 1.0140000000000001e-05, "loss": 1.2952, "step": 181 }, { "epoch": 0.02729044834307992, "grad_norm": 0.8483734130859375, "learning_rate": 9.606315789473685e-06, "loss": 1.4916, "step": 182 }, { "epoch": 0.027440395861448492, "grad_norm": 0.989263653755188, "learning_rate": 9.072631578947368e-06, "loss": 1.3972, "step": 183 }, { "epoch": 0.027590343379817063, "grad_norm": 0.8681879043579102, "learning_rate": 8.538947368421052e-06, "loss": 1.4803, "step": 184 }, { "epoch": 0.027740290898185634, "grad_norm": 0.7871488928794861, "learning_rate": 8.005263157894736e-06, "loss": 1.2179, "step": 185 }, { "epoch": 0.027890238416554206, "grad_norm": 0.9535936713218689, "learning_rate": 7.47157894736842e-06, "loss": 1.4229, "step": 186 }, { "epoch": 0.028040185934922777, "grad_norm": 0.9571456909179688, "learning_rate": 6.937894736842105e-06, "loss": 1.4315, "step": 187 }, { "epoch": 0.028190133453291348, "grad_norm": 0.9947013258934021, "learning_rate": 6.40421052631579e-06, "loss": 1.5569, "step": 188 }, { "epoch": 0.02834008097165992, "grad_norm": 0.8871195316314697, "learning_rate": 5.8705263157894735e-06, "loss": 1.4267, "step": 189 }, { "epoch": 0.02849002849002849, "grad_norm": 0.9514735341072083, "learning_rate": 5.336842105263157e-06, "loss": 1.4856, "step": 190 }, { "epoch": 0.028639976008397062, "grad_norm": 1.2530133724212646, "learning_rate": 4.803157894736842e-06, "loss": 1.4374, "step": 191 }, { "epoch": 0.028789923526765633, "grad_norm": 0.9343175888061523, "learning_rate": 4.269473684210526e-06, "loss": 1.3848, "step": 192 }, { "epoch": 0.028939871045134204, "grad_norm": 0.858911395072937, "learning_rate": 3.73578947368421e-06, "loss": 1.3063, "step": 193 }, { "epoch": 0.029089818563502776, "grad_norm": 0.9408674836158752, "learning_rate": 3.202105263157895e-06, "loss": 1.405, "step": 194 }, { "epoch": 0.029239766081871343, "grad_norm": 1.0257238149642944, "learning_rate": 2.6684210526315785e-06, "loss": 1.5829, "step": 195 }, { "epoch": 0.029389713600239915, "grad_norm": 0.9690769910812378, "learning_rate": 2.134736842105263e-06, "loss": 1.4333, "step": 196 }, { "epoch": 0.029539661118608486, "grad_norm": 0.9069121479988098, "learning_rate": 1.6010526315789475e-06, "loss": 1.1156, "step": 197 }, { "epoch": 0.029689608636977057, "grad_norm": 0.9297611713409424, "learning_rate": 1.0673684210526315e-06, "loss": 1.3354, "step": 198 }, { "epoch": 0.02983955615534563, "grad_norm": 1.0361257791519165, "learning_rate": 5.336842105263158e-07, "loss": 1.404, "step": 199 }, { "epoch": 0.0299895036737142, "grad_norm": 1.1815166473388672, "learning_rate": 0.0, "loss": 1.6191, "step": 200 }, { "epoch": 0.0299895036737142, "eval_loss": 1.5775456428527832, "eval_runtime": 459.7688, "eval_samples_per_second": 6.107, "eval_steps_per_second": 1.527, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0876908288658637e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }