{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9957173447537473,
  "eval_steps": 500,
  "global_step": 155,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 0.15481117367744446,
      "learning_rate": 2e-05,
      "loss": 1.1213,
      "step": 1
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.16627414524555206,
      "learning_rate": 4e-05,
      "loss": 1.1341,
      "step": 2
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.14720524847507477,
      "learning_rate": 6e-05,
      "loss": 1.148,
      "step": 3
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.15325099229812622,
      "learning_rate": 8e-05,
      "loss": 1.1435,
      "step": 4
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.16704852879047394,
      "learning_rate": 0.0001,
      "loss": 1.0895,
      "step": 5
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.11686641722917557,
      "learning_rate": 0.00012,
      "loss": 1.0784,
      "step": 6
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.09641632437705994,
      "learning_rate": 0.00014,
      "loss": 1.0612,
      "step": 7
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.12384118884801865,
      "learning_rate": 0.00016,
      "loss": 1.0566,
      "step": 8
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.07287071645259857,
      "learning_rate": 0.00018,
      "loss": 1.0442,
      "step": 9
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.07469318807125092,
      "learning_rate": 0.0002,
      "loss": 1.0083,
      "step": 10
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.08364757895469666,
      "learning_rate": 0.00019999761633493753,
      "loss": 1.0169,
      "step": 11
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.07763934135437012,
      "learning_rate": 0.0001999904654533872,
      "loss": 1.0348,
      "step": 12
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.06744416803121567,
      "learning_rate": 0.0001999785476962552,
      "loss": 1.0123,
      "step": 13
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.07724820077419281,
      "learning_rate": 0.00019996186363170035,
      "loss": 1.0188,
      "step": 14
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.0746370479464531,
      "learning_rate": 0.00019994041405510705,
      "loss": 1.0164,
      "step": 15
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.055804282426834106,
      "learning_rate": 0.00019991419998904747,
      "loss": 1.0587,
      "step": 16
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.054163169115781784,
      "learning_rate": 0.00019988322268323268,
      "loss": 1.0149,
      "step": 17
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.05897677689790726,
      "learning_rate": 0.00019984748361445308,
      "loss": 1.0136,
      "step": 18
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.05603804066777229,
      "learning_rate": 0.00019980698448650804,
      "loss": 0.9996,
      "step": 19
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.06250110268592834,
      "learning_rate": 0.0001997617272301248,
      "loss": 1.0145,
      "step": 20
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.05678323656320572,
      "learning_rate": 0.000199711714002866,
      "loss": 1.0005,
      "step": 21
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.05278163403272629,
      "learning_rate": 0.00019965694718902745,
      "loss": 1.0034,
      "step": 22
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.05601625144481659,
      "learning_rate": 0.00019959742939952392,
      "loss": 0.9915,
      "step": 23
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.054547134786844254,
      "learning_rate": 0.00019953316347176488,
      "loss": 1.0115,
      "step": 24
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.06417939066886902,
      "learning_rate": 0.0001994641524695193,
      "loss": 0.9862,
      "step": 25
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.061326365917921066,
      "learning_rate": 0.0001993903996827694,
      "loss": 0.9889,
      "step": 26
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.05376205965876579,
      "learning_rate": 0.00019931190862755417,
      "loss": 0.9604,
      "step": 27
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.0678999274969101,
      "learning_rate": 0.00019922868304580118,
      "loss": 1.0492,
      "step": 28
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.053755760192871094,
      "learning_rate": 0.0001991407269051487,
      "loss": 0.9985,
      "step": 29
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.05401955544948578,
      "learning_rate": 0.00019904804439875633,
      "loss": 0.9787,
      "step": 30
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.05666874349117279,
      "learning_rate": 0.0001989506399451051,
      "loss": 0.9886,
      "step": 31
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.05425805598497391,
      "learning_rate": 0.00019884851818778693,
      "loss": 1.0197,
      "step": 32
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.06086369976401329,
      "learning_rate": 0.00019874168399528305,
      "loss": 0.9879,
      "step": 33
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.05577366426587105,
      "learning_rate": 0.00019863014246073214,
      "loss": 0.9808,
      "step": 34
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.05853752791881561,
      "learning_rate": 0.0001985138989016874,
      "loss": 0.957,
      "step": 35
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.0582735501229763,
      "learning_rate": 0.00019839295885986296,
      "loss": 0.9732,
      "step": 36
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.07255622744560242,
      "learning_rate": 0.00019826732810086998,
      "loss": 1.0199,
      "step": 37
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.06085599586367607,
      "learning_rate": 0.00019813701261394136,
      "loss": 0.9946,
      "step": 38
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.06600817292928696,
      "learning_rate": 0.00019800201861164664,
      "loss": 0.9646,
      "step": 39
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.0634196326136589,
      "learning_rate": 0.00019786235252959553,
      "loss": 1.0092,
      "step": 40
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.05254920944571495,
      "learning_rate": 0.00019771802102613127,
      "loss": 0.9535,
      "step": 41
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.06008182466030121,
      "learning_rate": 0.00019756903098201308,
      "loss": 0.9897,
      "step": 42
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.07715290039777756,
      "learning_rate": 0.00019741538950008818,
      "loss": 1.0132,
      "step": 43
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.06104297190904617,
      "learning_rate": 0.0001972571039049533,
      "loss": 0.9938,
      "step": 44
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.06008582562208176,
      "learning_rate": 0.0001970941817426052,
      "loss": 0.9889,
      "step": 45
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.05699775367975235,
      "learning_rate": 0.00019692663078008132,
      "loss": 0.9843,
      "step": 46
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.05760645866394043,
      "learning_rate": 0.00019675445900508909,
      "loss": 0.9677,
      "step": 47
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.06075143814086914,
      "learning_rate": 0.00019657767462562544,
      "loss": 0.9929,
      "step": 48
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.059820640832185745,
      "learning_rate": 0.00019639628606958533,
      "loss": 0.9889,
      "step": 49
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.059315863996744156,
      "learning_rate": 0.00019621030198436006,
      "loss": 0.9994,
      "step": 50
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.05949851870536804,
      "learning_rate": 0.00019601973123642492,
      "loss": 0.9593,
      "step": 51
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.06036762520670891,
      "learning_rate": 0.00019582458291091663,
      "loss": 0.9669,
      "step": 52
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.05799931660294533,
      "learning_rate": 0.00019562486631120006,
      "loss": 0.9731,
      "step": 53
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.05733400583267212,
      "learning_rate": 0.00019542059095842485,
      "loss": 0.9676,
      "step": 54
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.06894834339618683,
      "learning_rate": 0.00019521176659107142,
      "loss": 1.0142,
      "step": 55
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.0657154992222786,
      "learning_rate": 0.00019499840316448673,
      "loss": 0.9598,
      "step": 56
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.0570225715637207,
      "learning_rate": 0.00019478051085040975,
      "loss": 0.9979,
      "step": 57
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.06271913647651672,
      "learning_rate": 0.00019455810003648637,
      "loss": 0.9694,
      "step": 58
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.05831892415881157,
      "learning_rate": 0.0001943311813257743,
      "loss": 0.9934,
      "step": 59
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.061948519200086594,
      "learning_rate": 0.00019409976553623766,
      "loss": 0.9812,
      "step": 60
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.06726415455341339,
      "learning_rate": 0.00019386386370023103,
      "loss": 0.9837,
      "step": 61
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.057408351451158524,
      "learning_rate": 0.00019362348706397373,
      "loss": 0.9512,
      "step": 62
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.057638928294181824,
      "learning_rate": 0.00019337864708701357,
      "loss": 0.9622,
      "step": 63
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.06090644374489784,
      "learning_rate": 0.00019312935544168048,
      "loss": 0.9927,
      "step": 64
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.05811937153339386,
      "learning_rate": 0.00019287562401253022,
      "loss": 0.9905,
      "step": 65
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.05927939713001251,
      "learning_rate": 0.00019261746489577765,
      "loss": 0.9604,
      "step": 66
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.06021604314446449,
      "learning_rate": 0.0001923548903987201,
      "loss": 0.9535,
      "step": 67
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.07004135102033615,
      "learning_rate": 0.00019208791303915063,
      "loss": 1.0032,
      "step": 68
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.05626143515110016,
      "learning_rate": 0.0001918165455447614,
      "loss": 0.9726,
      "step": 69
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.06130916625261307,
      "learning_rate": 0.00019154080085253666,
      "loss": 0.9646,
      "step": 70
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.06467108428478241,
      "learning_rate": 0.0001912606921081362,
      "loss": 0.9516,
      "step": 71
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.06047213450074196,
      "learning_rate": 0.0001909762326652686,
      "loss": 0.9664,
      "step": 72
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.05907664820551872,
      "learning_rate": 0.00019068743608505455,
      "loss": 0.9796,
      "step": 73
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.06679921597242355,
      "learning_rate": 0.00019039431613538047,
      "loss": 0.9678,
      "step": 74
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.06133173033595085,
      "learning_rate": 0.0001900968867902419,
      "loss": 0.9875,
      "step": 75
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.05841493234038353,
      "learning_rate": 0.00018979516222907775,
      "loss": 0.9686,
      "step": 76
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.0660431906580925,
      "learning_rate": 0.00018948915683609388,
      "loss": 0.9863,
      "step": 77
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.05776617303490639,
      "learning_rate": 0.00018917888519957754,
      "loss": 0.9417,
      "step": 78
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.05937017872929573,
      "learning_rate": 0.00018886436211120193,
      "loss": 0.9995,
      "step": 79
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.06314114481210709,
      "learning_rate": 0.000188545602565321,
      "loss": 0.9806,
      "step": 80
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.060519032180309296,
      "learning_rate": 0.00018822262175825462,
      "loss": 0.9741,
      "step": 81
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.06154269725084305,
      "learning_rate": 0.00018789543508756408,
      "loss": 0.9793,
      "step": 82
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.06176121160387993,
      "learning_rate": 0.00018756405815131813,
      "loss": 0.9453,
      "step": 83
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.06044905260205269,
      "learning_rate": 0.00018722850674734927,
      "loss": 0.9462,
      "step": 84
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.05896229296922684,
      "learning_rate": 0.00018688879687250067,
      "loss": 0.9963,
      "step": 85
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.06071419641375542,
      "learning_rate": 0.0001865449447218635,
      "loss": 0.9914,
      "step": 86
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.0697932317852974,
      "learning_rate": 0.00018619696668800492,
      "loss": 0.9726,
      "step": 87
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.062443289905786514,
      "learning_rate": 0.00018584487936018661,
      "loss": 1.0084,
      "step": 88
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.059460923075675964,
      "learning_rate": 0.0001854886995235738,
      "loss": 0.9404,
      "step": 89
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.058260347694158554,
      "learning_rate": 0.00018512844415843514,
      "loss": 0.9796,
      "step": 90
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.05946533381938934,
      "learning_rate": 0.00018476413043933313,
      "loss": 0.9418,
      "step": 91
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.06572849303483963,
      "learning_rate": 0.00018439577573430555,
      "loss": 0.9785,
      "step": 92
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.06783867627382278,
      "learning_rate": 0.00018402339760403713,
      "loss": 0.9747,
      "step": 93
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.06454402953386307,
      "learning_rate": 0.00018364701380102266,
      "loss": 0.9779,
      "step": 94
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.06309663504362106,
      "learning_rate": 0.00018326664226872065,
      "loss": 0.9643,
      "step": 95
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.05967305600643158,
      "learning_rate": 0.00018288230114069765,
      "loss": 0.9752,
      "step": 96
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.05811592936515808,
      "learning_rate": 0.0001824940087397641,
      "loss": 0.9551,
      "step": 97
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.0642295628786087,
      "learning_rate": 0.00018210178357710058,
      "loss": 0.9522,
      "step": 98
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.05724099278450012,
      "learning_rate": 0.0001817056443513754,
      "loss": 1.0051,
      "step": 99
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.056155964732170105,
      "learning_rate": 0.00018130560994785325,
      "loss": 0.9778,
      "step": 100
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.058100346475839615,
      "learning_rate": 0.00018090169943749476,
      "loss": 0.9825,
      "step": 101
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.06120794638991356,
      "learning_rate": 0.00018049393207604733,
      "loss": 0.9839,
      "step": 102
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.056975312530994415,
      "learning_rate": 0.00018008232730312723,
      "loss": 0.9968,
      "step": 103
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.06239038705825806,
      "learning_rate": 0.00017966690474129285,
      "loss": 0.9906,
      "step": 104
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.5958348512649536,
      "learning_rate": 0.00017924768419510904,
      "loss": 2.6531,
      "step": 105
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.06554935872554779,
      "learning_rate": 0.00017882468565020326,
      "loss": 1.0164,
      "step": 106
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.05698655918240547,
      "learning_rate": 0.00017839792927231254,
      "loss": 0.9516,
      "step": 107
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.06186239421367645,
      "learning_rate": 0.00017796743540632223,
      "loss": 0.9933,
      "step": 108
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.05811876431107521,
      "learning_rate": 0.00017753322457529614,
      "loss": 0.9552,
      "step": 109
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.06247268617153168,
      "learning_rate": 0.00017709531747949796,
      "loss": 0.9316,
      "step": 110
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.06278502196073532,
      "learning_rate": 0.00017665373499540463,
      "loss": 0.9867,
      "step": 111
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.06079186499118805,
      "learning_rate": 0.00017620849817471092,
      "loss": 1.0233,
      "step": 112
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.05586745962500572,
      "learning_rate": 0.00017575962824332596,
      "loss": 0.9454,
      "step": 113
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.059647805988788605,
      "learning_rate": 0.00017530714660036112,
      "loss": 0.9718,
      "step": 114
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.060143355280160904,
      "learning_rate": 0.00017485107481711012,
      "loss": 0.9927,
      "step": 115
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.4768543541431427,
      "learning_rate": 0.0001743914346360205,
      "loss": 2.4526,
      "step": 116
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.059104178100824356,
      "learning_rate": 0.00017392824796965702,
      "loss": 0.9366,
      "step": 117
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.06858639419078827,
      "learning_rate": 0.00017346153689965727,
      "loss": 0.9783,
      "step": 118
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.06308155506849289,
      "learning_rate": 0.00017299132367567857,
      "loss": 0.9688,
      "step": 119
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.0601269006729126,
      "learning_rate": 0.00017251763071433765,
      "loss": 0.9937,
      "step": 120
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.06544536352157593,
      "learning_rate": 0.00017204048059814175,
      "loss": 0.9351,
      "step": 121
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.06467759609222412,
      "learning_rate": 0.00017155989607441213,
      "loss": 0.9918,
      "step": 122
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.061619073152542114,
      "learning_rate": 0.0001710759000541995,
      "loss": 0.9872,
      "step": 123
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.06122846156358719,
      "learning_rate": 0.00017058851561119198,
      "loss": 0.968,
      "step": 124
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.08277314156293869,
      "learning_rate": 0.00017009776598061495,
      "loss": 0.9869,
      "step": 125
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.07559008151292801,
      "learning_rate": 0.00016960367455812336,
      "loss": 0.9804,
      "step": 126
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.06251110136508942,
      "learning_rate": 0.00016910626489868649,
      "loss": 0.978,
      "step": 127
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.06253345310688019,
      "learning_rate": 0.0001686055607154648,
      "loss": 0.9524,
      "step": 128
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.05948334559798241,
      "learning_rate": 0.00016810158587867973,
      "loss": 0.9826,
      "step": 129
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.06356865167617798,
      "learning_rate": 0.00016759436441447545,
      "loss": 0.9805,
      "step": 130
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.06536010652780533,
      "learning_rate": 0.00016708392050377363,
      "loss": 1.0146,
      "step": 131
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.06137322261929512,
      "learning_rate": 0.00016657027848112062,
      "loss": 0.9457,
      "step": 132
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.05955340713262558,
      "learning_rate": 0.00016605346283352727,
      "loss": 0.9823,
      "step": 133
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.0602475143969059,
      "learning_rate": 0.00016553349819930165,
      "loss": 1.0077,
      "step": 134
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.05905039981007576,
      "learning_rate": 0.00016501040936687443,
      "loss": 0.9313,
      "step": 135
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.06087419390678406,
      "learning_rate": 0.00016448422127361706,
      "loss": 0.9725,
      "step": 136
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.059714119881391525,
      "learning_rate": 0.00016395495900465304,
      "loss": 0.9963,
      "step": 137
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.06461544334888458,
      "learning_rate": 0.000163422647791662,
      "loss": 0.957,
      "step": 138
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.06247726082801819,
      "learning_rate": 0.00016288731301167668,
      "loss": 0.9742,
      "step": 139
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.06527213007211685,
      "learning_rate": 0.00016234898018587337,
      "loss": 0.9789,
      "step": 140
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.059757690876722336,
      "learning_rate": 0.00016180767497835503,
      "loss": 0.9309,
      "step": 141
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.059084221720695496,
      "learning_rate": 0.00016126342319492784,
      "loss": 0.9546,
      "step": 142
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.778961181640625,
      "learning_rate": 0.00016071625078187114,
      "loss": 2.6066,
      "step": 143
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.06468740105628967,
      "learning_rate": 0.00016016618382470012,
      "loss": 0.9472,
      "step": 144
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.061647091060876846,
      "learning_rate": 0.00015961324854692254,
      "loss": 0.9836,
      "step": 145
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.06061193719506264,
      "learning_rate": 0.0001590574713087885,
      "loss": 0.982,
      "step": 146
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.06635820865631104,
      "learning_rate": 0.00015849887860603374,
      "loss": 0.9873,
      "step": 147
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.06260058283805847,
      "learning_rate": 0.00015793749706861636,
      "loss": 0.9827,
      "step": 148
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.06037148833274841,
      "learning_rate": 0.00015737335345944757,
      "loss": 1.0072,
      "step": 149
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.06277037411928177,
      "learning_rate": 0.00015680647467311557,
      "loss": 0.9498,
      "step": 150
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.06306284666061401,
      "learning_rate": 0.00015623688773460357,
      "loss": 0.9866,
      "step": 151
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.07311715185642242,
      "learning_rate": 0.00015566461979800122,
      "loss": 0.9722,
      "step": 152
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.06143077090382576,
      "learning_rate": 0.00015508969814521025,
      "loss": 0.9442,
      "step": 153
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.067879818379879,
      "learning_rate": 0.00015451215018464387,
      "loss": 0.9416,
      "step": 154
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.06907378137111664,
      "learning_rate": 0.00015393200344991995,
      "loss": 0.9813,
      "step": 155
    }
  ],
  "logging_steps": 1,
  "max_steps": 465,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 155,
  "total_flos": 3.361969685082931e+18,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
}