|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.93048128342246, |
|
"eval_steps": 500, |
|
"global_step": 93, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03208556149732621, |
|
"grad_norm": 0.20311123132705688, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.0142, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.06417112299465241, |
|
"grad_norm": 0.19597311317920685, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.0111, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0962566844919786, |
|
"grad_norm": 0.19070063531398773, |
|
"learning_rate": 3e-06, |
|
"loss": 1.0203, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.12834224598930483, |
|
"grad_norm": 0.19738665223121643, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.9815, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.16042780748663102, |
|
"grad_norm": 0.1658538281917572, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9497, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1925133689839572, |
|
"grad_norm": 0.13552631437778473, |
|
"learning_rate": 6e-06, |
|
"loss": 0.9655, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.22459893048128343, |
|
"grad_norm": 0.12660366296768188, |
|
"learning_rate": 7e-06, |
|
"loss": 1.0031, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.25668449197860965, |
|
"grad_norm": 0.13044770061969757, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.8836, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.2887700534759358, |
|
"grad_norm": 0.133613720536232, |
|
"learning_rate": 9e-06, |
|
"loss": 0.904, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.32085561497326204, |
|
"grad_norm": 0.13057634234428406, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8347, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.35294117647058826, |
|
"grad_norm": 0.15763278305530548, |
|
"learning_rate": 9.996418774081658e-06, |
|
"loss": 0.8767, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.3850267379679144, |
|
"grad_norm": 0.13350480794906616, |
|
"learning_rate": 9.985680226398261e-06, |
|
"loss": 0.8649, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.41711229946524064, |
|
"grad_norm": 0.11397632211446762, |
|
"learning_rate": 9.967799739815925e-06, |
|
"loss": 0.8318, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.44919786096256686, |
|
"grad_norm": 0.09967950731515884, |
|
"learning_rate": 9.942802927959444e-06, |
|
"loss": 0.8099, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.48128342245989303, |
|
"grad_norm": 0.09817608445882797, |
|
"learning_rate": 9.910725598521014e-06, |
|
"loss": 0.7997, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5133689839572193, |
|
"grad_norm": 0.09854481369256973, |
|
"learning_rate": 9.871613701966067e-06, |
|
"loss": 0.8052, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5454545454545454, |
|
"grad_norm": 0.09840090572834015, |
|
"learning_rate": 9.825523265709667e-06, |
|
"loss": 0.8383, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5775401069518716, |
|
"grad_norm": 0.08703222870826721, |
|
"learning_rate": 9.772520313857777e-06, |
|
"loss": 0.8014, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.6096256684491979, |
|
"grad_norm": 0.07054264843463898, |
|
"learning_rate": 9.712680772628365e-06, |
|
"loss": 0.7716, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.6417112299465241, |
|
"grad_norm": 0.07254040986299515, |
|
"learning_rate": 9.646090361587828e-06, |
|
"loss": 0.7857, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6737967914438503, |
|
"grad_norm": 0.06903552263975143, |
|
"learning_rate": 9.572844470858537e-06, |
|
"loss": 0.7589, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 0.08053260296583176, |
|
"learning_rate": 9.493048024473413e-06, |
|
"loss": 0.7776, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.7379679144385026, |
|
"grad_norm": 0.07109075039625168, |
|
"learning_rate": 9.406815330073244e-06, |
|
"loss": 0.7594, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.7700534759358288, |
|
"grad_norm": 0.06771097332239151, |
|
"learning_rate": 9.314269915162115e-06, |
|
"loss": 0.7548, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.8021390374331551, |
|
"grad_norm": 0.06824363768100739, |
|
"learning_rate": 9.215544350155423e-06, |
|
"loss": 0.7432, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8342245989304813, |
|
"grad_norm": 0.06845725327730179, |
|
"learning_rate": 9.110780058474052e-06, |
|
"loss": 0.7391, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8663101604278075, |
|
"grad_norm": 0.06926020979881287, |
|
"learning_rate": 9.000127113956673e-06, |
|
"loss": 0.7673, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.8983957219251337, |
|
"grad_norm": 0.06381599605083466, |
|
"learning_rate": 8.883744025880429e-06, |
|
"loss": 0.747, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.93048128342246, |
|
"grad_norm": 0.06366810202598572, |
|
"learning_rate": 8.761797511897907e-06, |
|
"loss": 0.7633, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.9625668449197861, |
|
"grad_norm": 0.05870290845632553, |
|
"learning_rate": 8.634462259215719e-06, |
|
"loss": 0.7426, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9946524064171123, |
|
"grad_norm": 0.06190042197704315, |
|
"learning_rate": 8.501920674356755e-06, |
|
"loss": 0.7761, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.06190042197704315, |
|
"learning_rate": 8.364362621864595e-06, |
|
"loss": 0.7086, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.032085561497326, |
|
"grad_norm": 0.1603803187608719, |
|
"learning_rate": 8.221985152324385e-06, |
|
"loss": 0.6956, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.0641711229946524, |
|
"grad_norm": 0.06225842982530594, |
|
"learning_rate": 8.07499222008977e-06, |
|
"loss": 0.6742, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0962566844919786, |
|
"grad_norm": 0.055793747305870056, |
|
"learning_rate": 7.923594391120237e-06, |
|
"loss": 0.6592, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.1283422459893049, |
|
"grad_norm": 0.05490822345018387, |
|
"learning_rate": 7.768008541347423e-06, |
|
"loss": 0.6395, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.160427807486631, |
|
"grad_norm": 0.05545506253838539, |
|
"learning_rate": 7.608457546002423e-06, |
|
"loss": 0.6601, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.192513368983957, |
|
"grad_norm": 0.05865738168358803, |
|
"learning_rate": 7.445169960349167e-06, |
|
"loss": 0.6438, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.2245989304812834, |
|
"grad_norm": 0.05824247747659683, |
|
"learning_rate": 7.278379692281209e-06, |
|
"loss": 0.7292, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.2566844919786098, |
|
"grad_norm": 0.055361997336149216, |
|
"learning_rate": 7.10832566725092e-06, |
|
"loss": 0.6213, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.2887700534759359, |
|
"grad_norm": 0.0581163614988327, |
|
"learning_rate": 6.9352514860110876e-06, |
|
"loss": 0.6631, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.320855614973262, |
|
"grad_norm": 0.05861780047416687, |
|
"learning_rate": 6.759405075659165e-06, |
|
"loss": 0.6472, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3529411764705883, |
|
"grad_norm": 0.06006443873047829, |
|
"learning_rate": 6.58103833448412e-06, |
|
"loss": 0.661, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.3850267379679144, |
|
"grad_norm": 0.05911916866898537, |
|
"learning_rate": 6.4004067711245366e-06, |
|
"loss": 0.6537, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.4171122994652405, |
|
"grad_norm": 0.05767710879445076, |
|
"learning_rate": 6.2177691385549595e-06, |
|
"loss": 0.6372, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4491978609625669, |
|
"grad_norm": 0.06271807849407196, |
|
"learning_rate": 6.033387063424765e-06, |
|
"loss": 0.6449, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.481283422459893, |
|
"grad_norm": 0.05773553252220154, |
|
"learning_rate": 5.8475246712804845e-06, |
|
"loss": 0.6532, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.5133689839572193, |
|
"grad_norm": 0.05972479656338692, |
|
"learning_rate": 5.660448208208513e-06, |
|
"loss": 0.639, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5454545454545454, |
|
"grad_norm": 0.060490116477012634, |
|
"learning_rate": 5.472425659440157e-06, |
|
"loss": 0.6427, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.5775401069518717, |
|
"grad_norm": 0.056138068437576294, |
|
"learning_rate": 5.2837263654653715e-06, |
|
"loss": 0.6328, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.6096256684491979, |
|
"grad_norm": 0.057874809950590134, |
|
"learning_rate": 5.094620636205096e-06, |
|
"loss": 0.6267, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.641711229946524, |
|
"grad_norm": 0.1051662415266037, |
|
"learning_rate": 4.905379363794907e-06, |
|
"loss": 0.5955, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.6737967914438503, |
|
"grad_norm": 0.05756345018744469, |
|
"learning_rate": 4.71627363453463e-06, |
|
"loss": 0.6069, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.7058823529411766, |
|
"grad_norm": 0.059037283062934875, |
|
"learning_rate": 4.527574340559844e-06, |
|
"loss": 0.6352, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.7379679144385025, |
|
"grad_norm": 0.06045999750494957, |
|
"learning_rate": 4.33955179179149e-06, |
|
"loss": 0.6434, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.7700534759358288, |
|
"grad_norm": 0.05958287790417671, |
|
"learning_rate": 4.152475328719517e-06, |
|
"loss": 0.6803, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.8021390374331552, |
|
"grad_norm": 0.05772184208035469, |
|
"learning_rate": 3.966612936575235e-06, |
|
"loss": 0.5837, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8342245989304813, |
|
"grad_norm": 0.05690668150782585, |
|
"learning_rate": 3.782230861445041e-06, |
|
"loss": 0.6416, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.8663101604278074, |
|
"grad_norm": 0.055760458111763, |
|
"learning_rate": 3.5995932288754655e-06, |
|
"loss": 0.594, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.8983957219251337, |
|
"grad_norm": 0.060565654188394547, |
|
"learning_rate": 3.4189616655158803e-06, |
|
"loss": 0.613, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.93048128342246, |
|
"grad_norm": 0.0587182492017746, |
|
"learning_rate": 3.240594924340835e-06, |
|
"loss": 0.6263, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.962566844919786, |
|
"grad_norm": 0.06040377542376518, |
|
"learning_rate": 3.0647485139889145e-06, |
|
"loss": 0.6344, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.9946524064171123, |
|
"grad_norm": 0.05578358843922615, |
|
"learning_rate": 2.89167433274908e-06, |
|
"loss": 0.5962, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.05578358843922615, |
|
"learning_rate": 2.721620307718793e-06, |
|
"loss": 0.6669, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.0320855614973263, |
|
"grad_norm": 0.1518927663564682, |
|
"learning_rate": 2.554830039650834e-06, |
|
"loss": 0.5842, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.064171122994652, |
|
"grad_norm": 0.05603505298495293, |
|
"learning_rate": 2.391542453997578e-06, |
|
"loss": 0.6017, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.0962566844919786, |
|
"grad_norm": 0.0569174662232399, |
|
"learning_rate": 2.2319914586525776e-06, |
|
"loss": 0.5905, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.128342245989305, |
|
"grad_norm": 0.05474628135561943, |
|
"learning_rate": 2.0764056088797646e-06, |
|
"loss": 0.5657, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.160427807486631, |
|
"grad_norm": 0.05412266403436661, |
|
"learning_rate": 1.9250077799102323e-06, |
|
"loss": 0.5692, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.192513368983957, |
|
"grad_norm": 0.05691121146082878, |
|
"learning_rate": 1.7780148476756148e-06, |
|
"loss": 0.5528, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.2245989304812834, |
|
"grad_norm": 0.0541236586868763, |
|
"learning_rate": 1.6356373781354058e-06, |
|
"loss": 0.5426, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.2566844919786098, |
|
"grad_norm": 0.054437655955553055, |
|
"learning_rate": 1.4980793256432474e-06, |
|
"loss": 0.5578, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.2887700534759357, |
|
"grad_norm": 0.0597856268286705, |
|
"learning_rate": 1.3655377407842813e-06, |
|
"loss": 0.5612, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.320855614973262, |
|
"grad_norm": 0.060152262449264526, |
|
"learning_rate": 1.2382024881020937e-06, |
|
"loss": 0.5616, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 0.05507192015647888, |
|
"learning_rate": 1.1162559741195733e-06, |
|
"loss": 0.5736, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.385026737967914, |
|
"grad_norm": 0.0576910674571991, |
|
"learning_rate": 9.998728860433277e-07, |
|
"loss": 0.5565, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.4171122994652405, |
|
"grad_norm": 0.056697096675634384, |
|
"learning_rate": 8.892199415259501e-07, |
|
"loss": 0.5509, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.449197860962567, |
|
"grad_norm": 0.05489673465490341, |
|
"learning_rate": 7.844556498445788e-07, |
|
"loss": 0.5783, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.481283422459893, |
|
"grad_norm": 0.05427519232034683, |
|
"learning_rate": 6.857300848378857e-07, |
|
"loss": 0.553, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.5133689839572195, |
|
"grad_norm": 0.05728753283619881, |
|
"learning_rate": 5.931846699267558e-07, |
|
"loss": 0.5956, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.5454545454545454, |
|
"grad_norm": 0.05481986328959465, |
|
"learning_rate": 5.0695197552659e-07, |
|
"loss": 0.564, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.5775401069518717, |
|
"grad_norm": 0.05629381909966469, |
|
"learning_rate": 4.271555291414636e-07, |
|
"loss": 0.5545, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.6096256684491976, |
|
"grad_norm": 0.057915519922971725, |
|
"learning_rate": 3.539096384121743e-07, |
|
"loss": 0.5994, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.641711229946524, |
|
"grad_norm": 0.05390770733356476, |
|
"learning_rate": 2.873192273716369e-07, |
|
"loss": 0.5615, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.6737967914438503, |
|
"grad_norm": 0.056489136070013046, |
|
"learning_rate": 2.274796861422246e-07, |
|
"loss": 0.5494, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.7058823529411766, |
|
"grad_norm": 0.05273538455367088, |
|
"learning_rate": 1.7447673429033361e-07, |
|
"loss": 0.5975, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.7379679144385025, |
|
"grad_norm": 0.05448044836521149, |
|
"learning_rate": 1.2838629803393343e-07, |
|
"loss": 0.5877, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.770053475935829, |
|
"grad_norm": 0.05841987207531929, |
|
"learning_rate": 8.927440147898703e-08, |
|
"loss": 0.5497, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.802139037433155, |
|
"grad_norm": 0.053782325237989426, |
|
"learning_rate": 5.7197072040557356e-08, |
|
"loss": 0.552, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.834224598930481, |
|
"grad_norm": 0.05557689070701599, |
|
"learning_rate": 3.220026018407541e-08, |
|
"loss": 0.5543, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.8663101604278074, |
|
"grad_norm": 0.055889178067445755, |
|
"learning_rate": 1.431977360173975e-08, |
|
"loss": 0.5606, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.8983957219251337, |
|
"grad_norm": 0.058224692940711975, |
|
"learning_rate": 3.5812259183426457e-09, |
|
"loss": 0.5859, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.93048128342246, |
|
"grad_norm": 0.06218594312667847, |
|
"learning_rate": 0.0, |
|
"loss": 0.5555, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.93048128342246, |
|
"step": 93, |
|
"total_flos": 9.35169742560297e+16, |
|
"train_loss": 0.0, |
|
"train_runtime": 1.09, |
|
"train_samples_per_second": 4109.216, |
|
"train_steps_per_second": 85.322 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 93, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.35169742560297e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|