{
  "best_metric": 0.05344419553875923,
  "best_model_checkpoint": "/content/train/Qwen2-VL-2B-Instruct-unsloth-r4-rslora-bf16-tuned/checkpoint-270",
  "epoch": 2.0451977401129944,
  "eval_steps": 10,
  "global_step": 270,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007532956685499058,
      "grad_norm": 0.68587327003479,
      "learning_rate": 2e-05,
      "loss": 1.6782,
      "step": 1
    },
    {
      "epoch": 0.015065913370998116,
      "grad_norm": 0.7287677526473999,
      "learning_rate": 4e-05,
      "loss": 1.7932,
      "step": 2
    },
    {
      "epoch": 0.022598870056497175,
      "grad_norm": 0.7718816995620728,
      "learning_rate": 6e-05,
      "loss": 1.6757,
      "step": 3
    },
    {
      "epoch": 0.030131826741996232,
      "grad_norm": 0.7753613591194153,
      "learning_rate": 8e-05,
      "loss": 1.7695,
      "step": 4
    },
    {
      "epoch": 0.03766478342749529,
      "grad_norm": 1.235795259475708,
      "learning_rate": 0.0001,
      "loss": 1.9245,
      "step": 5
    },
    {
      "epoch": 0.04519774011299435,
      "grad_norm": 0.569118082523346,
      "learning_rate": 0.00012,
      "loss": 1.451,
      "step": 6
    },
    {
      "epoch": 0.05273069679849341,
      "grad_norm": 0.6638339757919312,
      "learning_rate": 0.00014,
      "loss": 1.6576,
      "step": 7
    },
    {
      "epoch": 0.060263653483992465,
      "grad_norm": 0.6843408942222595,
      "learning_rate": 0.00016,
      "loss": 1.6339,
      "step": 8
    },
    {
      "epoch": 0.06779661016949153,
      "grad_norm": 0.5259923934936523,
      "learning_rate": 0.00018,
      "loss": 1.5687,
      "step": 9
    },
    {
      "epoch": 0.07532956685499058,
      "grad_norm": 0.655581533908844,
      "learning_rate": 0.0002,
      "loss": 1.6655,
      "step": 10
    },
    {
      "epoch": 0.07532956685499058,
      "eval_loss": 1.5389481782913208,
      "eval_runtime": 47.9411,
      "eval_samples_per_second": 1.564,
      "eval_steps_per_second": 0.793,
      "step": 10
    },
    {
      "epoch": 0.08286252354048965,
      "grad_norm": 1.65678870677948,
      "learning_rate": 0.0001999966879815833,
      "loss": 1.7139,
      "step": 11
    },
    {
      "epoch": 0.0903954802259887,
      "grad_norm": 0.4999409019947052,
      "learning_rate": 0.0001999867521457224,
      "loss": 1.4695,
      "step": 12
    },
    {
      "epoch": 0.09792843691148775,
      "grad_norm": 0.6279143691062927,
      "learning_rate": 0.0001999701931505708,
      "loss": 1.42,
      "step": 13
    },
    {
      "epoch": 0.10546139359698682,
      "grad_norm": 0.47573018074035645,
      "learning_rate": 0.00019994701209300245,
      "loss": 1.3877,
      "step": 14
    },
    {
      "epoch": 0.11299435028248588,
      "grad_norm": 0.5120630264282227,
      "learning_rate": 0.00019991721050853907,
      "loss": 1.4014,
      "step": 15
    },
    {
      "epoch": 0.12052730696798493,
      "grad_norm": 0.4641444981098175,
      "learning_rate": 0.00019988079037124864,
      "loss": 1.2456,
      "step": 16
    },
    {
      "epoch": 0.128060263653484,
      "grad_norm": 0.5229088664054871,
      "learning_rate": 0.00019983775409361447,
      "loss": 1.3617,
      "step": 17
    },
    {
      "epoch": 0.13559322033898305,
      "grad_norm": 0.6793835759162903,
      "learning_rate": 0.00019978810452637543,
      "loss": 1.4584,
      "step": 18
    },
    {
      "epoch": 0.1431261770244821,
      "grad_norm": 0.530450165271759,
      "learning_rate": 0.00019973184495833716,
      "loss": 1.2412,
      "step": 19
    },
    {
      "epoch": 0.15065913370998116,
      "grad_norm": 0.5695556998252869,
      "learning_rate": 0.00019966897911615416,
      "loss": 1.2738,
      "step": 20
    },
    {
      "epoch": 0.15065913370998116,
      "eval_loss": 1.2209211587905884,
      "eval_runtime": 37.1065,
      "eval_samples_per_second": 2.021,
      "eval_steps_per_second": 1.024,
      "step": 20
    },
    {
      "epoch": 0.15819209039548024,
      "grad_norm": 0.5769656896591187,
      "learning_rate": 0.00019959951116408294,
      "loss": 1.2751,
      "step": 21
    },
    {
      "epoch": 0.1657250470809793,
      "grad_norm": 0.6152491569519043,
      "learning_rate": 0.0001995234457037063,
      "loss": 1.2145,
      "step": 22
    },
    {
      "epoch": 0.17325800376647835,
      "grad_norm": 0.6578372120857239,
      "learning_rate": 0.00019944078777362826,
      "loss": 1.1845,
      "step": 23
    },
    {
      "epoch": 0.1807909604519774,
      "grad_norm": 0.5556841492652893,
      "learning_rate": 0.00019935154284914065,
      "loss": 1.0926,
      "step": 24
    },
    {
      "epoch": 0.18832391713747645,
      "grad_norm": 0.7302567958831787,
      "learning_rate": 0.00019925571684186006,
      "loss": 1.1249,
      "step": 25
    },
    {
      "epoch": 0.1958568738229755,
      "grad_norm": 0.6284404993057251,
      "learning_rate": 0.00019915331609933657,
      "loss": 0.9404,
      "step": 26
    },
    {
      "epoch": 0.2033898305084746,
      "grad_norm": 0.776946485042572,
      "learning_rate": 0.00019904434740463306,
      "loss": 1.044,
      "step": 27
    },
    {
      "epoch": 0.21092278719397364,
      "grad_norm": 0.7142918705940247,
      "learning_rate": 0.00019892881797587601,
      "loss": 0.9695,
      "step": 28
    },
    {
      "epoch": 0.2184557438794727,
      "grad_norm": 0.8852341175079346,
      "learning_rate": 0.0001988067354657773,
      "loss": 0.8989,
      "step": 29
    },
    {
      "epoch": 0.22598870056497175,
      "grad_norm": 0.8206908106803894,
      "learning_rate": 0.00019867810796112744,
      "loss": 0.8154,
      "step": 30
    },
    {
      "epoch": 0.22598870056497175,
      "eval_loss": 0.8218569755554199,
      "eval_runtime": 37.094,
      "eval_samples_per_second": 2.022,
      "eval_steps_per_second": 1.024,
      "step": 30
    },
    {
      "epoch": 0.2335216572504708,
      "grad_norm": 0.9797173142433167,
      "learning_rate": 0.0001985429439822596,
      "loss": 0.7847,
      "step": 31
    },
    {
      "epoch": 0.24105461393596986,
      "grad_norm": 1.0684410333633423,
      "learning_rate": 0.00019840125248248564,
      "loss": 0.823,
      "step": 32
    },
    {
      "epoch": 0.24858757062146894,
      "grad_norm": 1.009280800819397,
      "learning_rate": 0.00019825304284750263,
      "loss": 0.883,
      "step": 33
    },
    {
      "epoch": 0.256120527306968,
      "grad_norm": 0.8165304660797119,
      "learning_rate": 0.00019809832489477142,
      "loss": 0.7012,
      "step": 34
    },
    {
      "epoch": 0.263653483992467,
      "grad_norm": 0.794262707233429,
      "learning_rate": 0.00019793710887286615,
      "loss": 0.6529,
      "step": 35
    },
    {
      "epoch": 0.2711864406779661,
      "grad_norm": 0.727675199508667,
      "learning_rate": 0.0001977694054607955,
      "loss": 0.6809,
      "step": 36
    },
    {
      "epoch": 0.2787193973634652,
      "grad_norm": 0.7391637563705444,
      "learning_rate": 0.00019759522576729533,
      "loss": 0.6308,
      "step": 37
    },
    {
      "epoch": 0.2862523540489642,
      "grad_norm": 0.7500622868537903,
      "learning_rate": 0.00019741458133009258,
      "loss": 0.5628,
      "step": 38
    },
    {
      "epoch": 0.2937853107344633,
      "grad_norm": 0.962188184261322,
      "learning_rate": 0.00019722748411514135,
      "loss": 0.5857,
      "step": 39
    },
    {
      "epoch": 0.3013182674199623,
      "grad_norm": 0.7300134301185608,
      "learning_rate": 0.0001970339465158301,
      "loss": 0.5631,
      "step": 40
    },
    {
      "epoch": 0.3013182674199623,
      "eval_loss": 0.5341230630874634,
      "eval_runtime": 37.0912,
      "eval_samples_per_second": 2.022,
      "eval_steps_per_second": 1.025,
      "step": 40
    },
    {
      "epoch": 0.3088512241054614,
      "grad_norm": 0.5163620710372925,
      "learning_rate": 0.00019683398135216066,
      "loss": 0.528,
      "step": 41
    },
    {
      "epoch": 0.3163841807909605,
      "grad_norm": 0.38112568855285645,
      "learning_rate": 0.00019662760186989913,
      "loss": 0.5219,
      "step": 42
    },
    {
      "epoch": 0.3239171374764595,
      "grad_norm": 0.389498233795166,
      "learning_rate": 0.00019641482173969848,
      "loss": 0.5172,
      "step": 43
    },
    {
      "epoch": 0.3314500941619586,
      "grad_norm": 0.5581079125404358,
      "learning_rate": 0.00019619565505619288,
      "loss": 0.5106,
      "step": 44
    },
    {
      "epoch": 0.3389830508474576,
      "grad_norm": 0.38179025053977966,
      "learning_rate": 0.00019597011633706415,
      "loss": 0.5374,
      "step": 45
    },
    {
      "epoch": 0.3465160075329567,
      "grad_norm": 0.40401706099510193,
      "learning_rate": 0.00019573822052208013,
      "loss": 0.4814,
      "step": 46
    },
    {
      "epoch": 0.3540489642184557,
      "grad_norm": 0.3594434857368469,
      "learning_rate": 0.00019549998297210502,
      "loss": 0.4933,
      "step": 47
    },
    {
      "epoch": 0.3615819209039548,
      "grad_norm": 0.34325098991394043,
      "learning_rate": 0.00019525541946808188,
      "loss": 0.4893,
      "step": 48
    },
    {
      "epoch": 0.3691148775894539,
      "grad_norm": 0.3423003852367401,
      "learning_rate": 0.00019500454620998732,
      "loss": 0.4584,
      "step": 49
    },
    {
      "epoch": 0.3766478342749529,
      "grad_norm": 0.3735145330429077,
      "learning_rate": 0.00019474737981575832,
      "loss": 0.4078,
      "step": 50
    },
    {
      "epoch": 0.3766478342749529,
      "eval_loss": 0.4520163834095001,
      "eval_runtime": 37.128,
      "eval_samples_per_second": 2.02,
      "eval_steps_per_second": 1.023,
      "step": 50
    },
    {
      "epoch": 0.384180790960452,
      "grad_norm": 0.5755606293678284,
      "learning_rate": 0.0001944839373201916,
      "loss": 0.4468,
      "step": 51
    },
    {
      "epoch": 0.391713747645951,
      "grad_norm": 0.3776421546936035,
      "learning_rate": 0.00019421423617381508,
      "loss": 0.5,
      "step": 52
    },
    {
      "epoch": 0.3992467043314501,
      "grad_norm": 0.3351342976093292,
      "learning_rate": 0.00019393829424173205,
      "loss": 0.4443,
      "step": 53
    },
    {
      "epoch": 0.4067796610169492,
      "grad_norm": 0.6081859469413757,
      "learning_rate": 0.0001936561298024377,
      "loss": 0.393,
      "step": 54
    },
    {
      "epoch": 0.4143126177024482,
      "grad_norm": 0.40104803442955017,
      "learning_rate": 0.00019336776154660841,
      "loss": 0.4274,
      "step": 55
    },
    {
      "epoch": 0.4218455743879473,
      "grad_norm": 0.44677644968032837,
      "learning_rate": 0.00019307320857586376,
      "loss": 0.4133,
      "step": 56
    },
    {
      "epoch": 0.4293785310734463,
      "grad_norm": 0.36069607734680176,
      "learning_rate": 0.00019277249040150092,
      "loss": 0.3849,
      "step": 57
    },
    {
      "epoch": 0.4369114877589454,
      "grad_norm": 1.2188339233398438,
      "learning_rate": 0.00019246562694320255,
      "loss": 0.4041,
      "step": 58
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.4592845141887665,
      "learning_rate": 0.00019215263852771718,
      "loss": 0.4183,
      "step": 59
    },
    {
      "epoch": 0.4519774011299435,
      "grad_norm": 1.6102626323699951,
      "learning_rate": 0.00019183354588751271,
      "loss": 0.4038,
      "step": 60
    },
    {
      "epoch": 0.4519774011299435,
      "eval_loss": 0.38410142064094543,
      "eval_runtime": 37.1151,
      "eval_samples_per_second": 2.021,
      "eval_steps_per_second": 1.024,
      "step": 60
    },
    {
      "epoch": 0.4595103578154426,
      "grad_norm": 0.4766036868095398,
      "learning_rate": 0.00019150837015940322,
      "loss": 0.4346,
      "step": 61
    },
    {
      "epoch": 0.4670433145009416,
      "grad_norm": 0.4366019368171692,
      "learning_rate": 0.00019117713288314863,
      "loss": 0.3804,
      "step": 62
    },
    {
      "epoch": 0.4745762711864407,
      "grad_norm": 0.792560338973999,
      "learning_rate": 0.00019083985600002818,
      "loss": 0.3856,
      "step": 63
    },
    {
      "epoch": 0.4821092278719397,
      "grad_norm": 0.427386075258255,
      "learning_rate": 0.0001904965618513868,
      "loss": 0.3906,
      "step": 64
    },
    {
      "epoch": 0.4896421845574388,
      "grad_norm": 0.6638129949569702,
      "learning_rate": 0.00019014727317715537,
      "loss": 0.4039,
      "step": 65
    },
    {
      "epoch": 0.4971751412429379,
      "grad_norm": 0.46441903710365295,
      "learning_rate": 0.00018979201311434434,
      "loss": 0.422,
      "step": 66
    },
    {
      "epoch": 0.504708097928437,
      "grad_norm": 0.4845605790615082,
      "learning_rate": 0.00018943080519551108,
      "loss": 0.358,
      "step": 67
    },
    {
      "epoch": 0.512241054613936,
      "grad_norm": 0.7461917400360107,
      "learning_rate": 0.00018906367334720124,
      "loss": 0.3956,
      "step": 68
    },
    {
      "epoch": 0.519774011299435,
      "grad_norm": 0.6427743434906006,
      "learning_rate": 0.0001886906418883636,
      "loss": 0.3141,
      "step": 69
    },
    {
      "epoch": 0.527306967984934,
      "grad_norm": 0.6577739119529724,
      "learning_rate": 0.00018831173552873946,
      "loss": 0.3455,
      "step": 70
    },
    {
      "epoch": 0.527306967984934,
      "eval_loss": 0.3052687644958496,
      "eval_runtime": 37.0939,
      "eval_samples_per_second": 2.022,
      "eval_steps_per_second": 1.024,
      "step": 70
    },
    {
      "epoch": 0.5348399246704332,
      "grad_norm": 0.7122016549110413,
      "learning_rate": 0.00018792697936722563,
      "loss": 0.3519,
      "step": 71
    },
    {
      "epoch": 0.5423728813559322,
      "grad_norm": 0.5734298229217529,
      "learning_rate": 0.00018753639889021196,
      "loss": 0.3051,
      "step": 72
    },
    {
      "epoch": 0.5499058380414312,
      "grad_norm": 0.8871021270751953,
      "learning_rate": 0.00018714001996989312,
      "loss": 0.2803,
      "step": 73
    },
    {
      "epoch": 0.5574387947269304,
      "grad_norm": 0.7467854022979736,
      "learning_rate": 0.00018673786886255476,
      "loss": 0.2741,
      "step": 74
    },
    {
      "epoch": 0.5649717514124294,
      "grad_norm": 0.549818754196167,
      "learning_rate": 0.0001863299722068344,
      "loss": 0.2779,
      "step": 75
    },
    {
      "epoch": 0.5725047080979284,
      "grad_norm": 0.5196639895439148,
      "learning_rate": 0.00018591635702195673,
      "loss": 0.3036,
      "step": 76
    },
    {
      "epoch": 0.5800376647834274,
      "grad_norm": 0.532467782497406,
      "learning_rate": 0.00018549705070594396,
      "loss": 0.2767,
      "step": 77
    },
    {
      "epoch": 0.5875706214689266,
      "grad_norm": 0.8568252325057983,
      "learning_rate": 0.00018507208103380092,
      "loss": 0.2224,
      "step": 78
    },
    {
      "epoch": 0.5951035781544256,
      "grad_norm": 0.557944118976593,
      "learning_rate": 0.00018464147615567517,
      "loss": 0.2269,
      "step": 79
    },
    {
      "epoch": 0.6026365348399246,
      "grad_norm": 0.886238157749176,
      "learning_rate": 0.0001842052645949925,
      "loss": 0.2658,
      "step": 80
    },
    {
      "epoch": 0.6026365348399246,
      "eval_loss": 0.22510449588298798,
      "eval_runtime": 37.1134,
      "eval_samples_per_second": 2.021,
      "eval_steps_per_second": 1.024,
      "step": 80
    },
    {
      "epoch": 0.6101694915254238,
      "grad_norm": 0.5309122204780579,
      "learning_rate": 0.00018376347524656734,
      "loss": 0.2168,
      "step": 81
    },
    {
      "epoch": 0.6177024482109228,
      "grad_norm": 0.5083054304122925,
      "learning_rate": 0.00018331613737468887,
      "loss": 0.2312,
      "step": 82
    },
    {
      "epoch": 0.6252354048964218,
      "grad_norm": 9.135035514831543,
      "learning_rate": 0.00018286328061118244,
      "loss": 0.246,
      "step": 83
    },
    {
      "epoch": 0.632768361581921,
      "grad_norm": 0.771587610244751,
      "learning_rate": 0.00018240493495344694,
      "loss": 0.2207,
      "step": 84
    },
    {
      "epoch": 0.64030131826742,
      "grad_norm": 0.8555005788803101,
      "learning_rate": 0.00018194113076246753,
      "loss": 0.223,
      "step": 85
    },
    {
      "epoch": 0.647834274952919,
      "grad_norm": 0.5555715560913086,
      "learning_rate": 0.00018147189876080463,
      "loss": 0.2114,
      "step": 86
    },
    {
      "epoch": 0.655367231638418,
      "grad_norm": 0.6347367167472839,
      "learning_rate": 0.00018099727003055894,
      "loss": 0.2326,
      "step": 87
    },
    {
      "epoch": 0.6629001883239172,
      "grad_norm": 0.7266764640808105,
      "learning_rate": 0.00018051727601131227,
      "loss": 0.257,
      "step": 88
    },
    {
      "epoch": 0.6704331450094162,
      "grad_norm": 0.7240170240402222,
      "learning_rate": 0.00018003194849804534,
      "loss": 0.2001,
      "step": 89
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 0.7595257759094238,
      "learning_rate": 0.00017954131963903133,
      "loss": 0.1747,
      "step": 90
    },
    {
      "epoch": 0.6779661016949152,
      "eval_loss": 0.17160969972610474,
      "eval_runtime": 37.1302,
      "eval_samples_per_second": 2.02,
      "eval_steps_per_second": 1.023,
      "step": 90
    },
    {
      "epoch": 0.6854990583804144,
      "grad_norm": 0.7588114738464355,
      "learning_rate": 0.00017904542193370663,
      "loss": 0.1372,
      "step": 91
    },
    {
      "epoch": 0.6930320150659134,
      "grad_norm": 0.7313429713249207,
      "learning_rate": 0.0001785442882305179,
      "loss": 0.2234,
      "step": 92
    },
    {
      "epoch": 0.7005649717514124,
      "grad_norm": 0.8581743836402893,
      "learning_rate": 0.0001780379517247462,
      "loss": 0.1712,
      "step": 93
    },
    {
      "epoch": 0.7080979284369114,
      "grad_norm": 1.0297523736953735,
      "learning_rate": 0.0001775264459563081,
      "loss": 0.1769,
      "step": 94
    },
    {
      "epoch": 0.7156308851224106,
      "grad_norm": 0.5627338290214539,
      "learning_rate": 0.00017700980480753423,
      "loss": 0.1864,
      "step": 95
    },
    {
      "epoch": 0.7231638418079096,
      "grad_norm": 1.0914690494537354,
      "learning_rate": 0.0001764880625009245,
      "loss": 0.1786,
      "step": 96
    },
    {
      "epoch": 0.7306967984934086,
      "grad_norm": 0.6584937572479248,
      "learning_rate": 0.00017596125359688154,
      "loss": 0.131,
      "step": 97
    },
    {
      "epoch": 0.7382297551789078,
      "grad_norm": 1.1257890462875366,
      "learning_rate": 0.00017542941299142112,
      "loss": 0.1678,
      "step": 98
    },
    {
      "epoch": 0.7457627118644068,
      "grad_norm": 0.5444011688232422,
      "learning_rate": 0.00017489257591386093,
      "loss": 0.1562,
      "step": 99
    },
    {
      "epoch": 0.7532956685499058,
      "grad_norm": 0.665874183177948,
      "learning_rate": 0.00017435077792448664,
      "loss": 0.189,
      "step": 100
    },
    {
      "epoch": 0.7532956685499058,
      "eval_loss": 0.13106876611709595,
      "eval_runtime": 37.0741,
      "eval_samples_per_second": 2.023,
      "eval_steps_per_second": 1.025,
      "step": 100
    },
    {
      "epoch": 0.7608286252354048,
      "grad_norm": 0.6252754330635071,
      "learning_rate": 0.0001738040549121967,
      "loss": 0.104,
      "step": 101
    },
    {
      "epoch": 0.768361581920904,
      "grad_norm": 0.6250944137573242,
      "learning_rate": 0.00017325244309212475,
      "loss": 0.1582,
      "step": 102
    },
    {
      "epoch": 0.775894538606403,
      "grad_norm": 0.7759442329406738,
      "learning_rate": 0.00017269597900324097,
      "loss": 0.1888,
      "step": 103
    },
    {
      "epoch": 0.783427495291902,
      "grad_norm": 0.5639198422431946,
      "learning_rate": 0.00017213469950593156,
      "loss": 0.1223,
      "step": 104
    },
    {
      "epoch": 0.7909604519774012,
      "grad_norm": 0.5083601474761963,
      "learning_rate": 0.00017156864177955719,
      "loss": 0.0838,
      "step": 105
    },
    {
      "epoch": 0.7984934086629002,
      "grad_norm": 0.5559635758399963,
      "learning_rate": 0.0001709978433199901,
      "loss": 0.0855,
      "step": 106
    },
    {
      "epoch": 0.8060263653483992,
      "grad_norm": 0.6353676319122314,
      "learning_rate": 0.00017042234193713056,
      "loss": 0.1105,
      "step": 107
    },
    {
      "epoch": 0.8135593220338984,
      "grad_norm": 0.7712072134017944,
      "learning_rate": 0.0001698421757524021,
      "loss": 0.1402,
      "step": 108
    },
    {
      "epoch": 0.8210922787193974,
      "grad_norm": 0.7416761517524719,
      "learning_rate": 0.00016925738319622654,
      "loss": 0.0932,
      "step": 109
    },
    {
      "epoch": 0.8286252354048964,
      "grad_norm": 0.7182126045227051,
      "learning_rate": 0.00016866800300547813,
      "loss": 0.131,
      "step": 110
    },
    {
      "epoch": 0.8286252354048964,
      "eval_loss": 0.09729403257369995,
      "eval_runtime": 37.1303,
      "eval_samples_per_second": 2.02,
      "eval_steps_per_second": 1.023,
      "step": 110
    },
    {
      "epoch": 0.8361581920903954,
      "grad_norm": 0.6551967263221741,
      "learning_rate": 0.00016807407422091784,
      "loss": 0.1161,
      "step": 111
    },
    {
      "epoch": 0.8436911487758946,
      "grad_norm": 0.6405838131904602,
      "learning_rate": 0.0001674756361846071,
      "loss": 0.1454,
      "step": 112
    },
    {
      "epoch": 0.8512241054613936,
      "grad_norm": 0.4275994598865509,
      "learning_rate": 0.00016687272853730192,
      "loss": 0.0897,
      "step": 113
    },
    {
      "epoch": 0.8587570621468926,
      "grad_norm": 0.6592651605606079,
      "learning_rate": 0.00016626539121582685,
      "loss": 0.0534,
      "step": 114
    },
    {
      "epoch": 0.8662900188323918,
      "grad_norm": 0.6205569505691528,
      "learning_rate": 0.0001656536644504298,
      "loss": 0.1361,
      "step": 115
    },
    {
      "epoch": 0.8738229755178908,
      "grad_norm": 0.5345686078071594,
      "learning_rate": 0.0001650375887621171,
      "loss": 0.0923,
      "step": 116
    },
    {
      "epoch": 0.8813559322033898,
      "grad_norm": 1.0165270566940308,
      "learning_rate": 0.00016441720495996912,
      "loss": 0.0852,
      "step": 117
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.48809266090393066,
      "learning_rate": 0.00016379255413843754,
      "loss": 0.0839,
      "step": 118
    },
    {
      "epoch": 0.896421845574388,
      "grad_norm": 0.650384247303009,
      "learning_rate": 0.0001631636776746228,
      "loss": 0.102,
      "step": 119
    },
    {
      "epoch": 0.903954802259887,
      "grad_norm": 0.6523996591567993,
      "learning_rate": 0.00016253061722553355,
      "loss": 0.0661,
      "step": 120
    },
    {
      "epoch": 0.903954802259887,
      "eval_loss": 0.07857384532690048,
      "eval_runtime": 37.0902,
      "eval_samples_per_second": 2.022,
      "eval_steps_per_second": 1.025,
      "step": 120
    },
    {
      "epoch": 0.911487758945386,
      "grad_norm": 0.4382803738117218,
      "learning_rate": 0.00016189341472532705,
      "loss": 0.0582,
      "step": 121
    },
    {
      "epoch": 0.9190207156308852,
      "grad_norm": 0.6267339587211609,
      "learning_rate": 0.0001612521123825317,
      "loss": 0.079,
      "step": 122
    },
    {
      "epoch": 0.9265536723163842,
      "grad_norm": 0.700908899307251,
      "learning_rate": 0.00016060675267725083,
      "loss": 0.1022,
      "step": 123
    },
    {
      "epoch": 0.9340866290018832,
      "grad_norm": 0.4881342351436615,
      "learning_rate": 0.00015995737835834906,
      "loss": 0.063,
      "step": 124
    },
    {
      "epoch": 0.9416195856873822,
      "grad_norm": 0.4968627989292145,
      "learning_rate": 0.00015930403244062043,
      "loss": 0.0675,
      "step": 125
    },
    {
      "epoch": 0.9491525423728814,
      "grad_norm": 0.4240921437740326,
      "learning_rate": 0.00015864675820193922,
      "loss": 0.0531,
      "step": 126
    },
    {
      "epoch": 0.9566854990583804,
      "grad_norm": 0.3779008984565735,
      "learning_rate": 0.00015798559918039307,
      "loss": 0.0481,
      "step": 127
    },
    {
      "epoch": 0.9642184557438794,
      "grad_norm": 0.471587210893631,
      "learning_rate": 0.00015732059917139912,
      "loss": 0.0698,
      "step": 128
    },
    {
      "epoch": 0.9717514124293786,
      "grad_norm": 0.44407761096954346,
      "learning_rate": 0.0001566518022248029,
      "loss": 0.1005,
      "step": 129
    },
    {
      "epoch": 0.9792843691148776,
      "grad_norm": 0.4785122275352478,
      "learning_rate": 0.00015597925264196049,
      "loss": 0.0784,
      "step": 130
    },
    {
      "epoch": 0.9792843691148776,
      "eval_loss": 0.07214296609163284,
      "eval_runtime": 37.1186,
      "eval_samples_per_second": 2.021,
      "eval_steps_per_second": 1.024,
      "step": 130
    },
    {
      "epoch": 0.9868173258003766,
      "grad_norm": 0.4364639222621918,
      "learning_rate": 0.00015530299497280395,
      "loss": 0.046,
      "step": 131
    },
    {
      "epoch": 0.9943502824858758,
      "grad_norm": 0.6649202108383179,
      "learning_rate": 0.0001546230740128904,
      "loss": 0.0618,
      "step": 132
    },
    {
      "epoch": 1.0075329566854991,
      "grad_norm": 0.662251353263855,
      "learning_rate": 0.00015393953480043467,
      "loss": 0.1003,
      "step": 133
    },
    {
      "epoch": 1.015065913370998,
      "grad_norm": 0.36134594678878784,
      "learning_rate": 0.000153252422613326,
      "loss": 0.0403,
      "step": 134
    },
    {
      "epoch": 1.0225988700564972,
      "grad_norm": 0.512718677520752,
      "learning_rate": 0.00015256178296612868,
      "loss": 0.0673,
      "step": 135
    },
    {
      "epoch": 1.0301318267419963,
      "grad_norm": 0.4086618721485138,
      "learning_rate": 0.0001518676616070674,
      "loss": 0.0943,
      "step": 136
    },
    {
      "epoch": 1.0376647834274952,
      "grad_norm": 0.3207029402256012,
      "learning_rate": 0.00015117010451499654,
      "loss": 0.0865,
      "step": 137
    },
    {
      "epoch": 1.0451977401129944,
      "grad_norm": 0.2941770553588867,
      "learning_rate": 0.0001504691578963549,
      "loss": 0.0374,
      "step": 138
    },
    {
      "epoch": 1.0527306967984935,
      "grad_norm": 0.4340198040008545,
      "learning_rate": 0.00014976486818210467,
      "loss": 0.077,
      "step": 139
    },
    {
      "epoch": 1.0602636534839924,
      "grad_norm": 0.54200679063797,
      "learning_rate": 0.00014905728202465595,
      "loss": 0.086,
      "step": 140
    },
    {
      "epoch": 1.0602636534839924,
      "eval_loss": 0.0658058300614357,
      "eval_runtime": 37.189,
      "eval_samples_per_second": 2.017,
      "eval_steps_per_second": 1.022,
      "step": 140
    },
    {
      "epoch": 1.0677966101694916,
      "grad_norm": 0.48267418146133423,
      "learning_rate": 0.00014834644629477644,
      "loss": 0.0502,
      "step": 141
    },
    {
      "epoch": 1.0753295668549905,
      "grad_norm": 0.5690019726753235,
      "learning_rate": 0.00014763240807848666,
      "loss": 0.0617,
      "step": 142
    },
    {
      "epoch": 1.0828625235404896,
      "grad_norm": 0.4100703299045563,
      "learning_rate": 0.0001469152146739411,
      "loss": 0.0562,
      "step": 143
    },
    {
      "epoch": 1.0903954802259888,
      "grad_norm": 0.49852266907691956,
      "learning_rate": 0.000146194913588295,
      "loss": 0.0751,
      "step": 144
    },
    {
      "epoch": 1.0979284369114877,
      "grad_norm": 0.4217350482940674,
      "learning_rate": 0.00014547155253455768,
      "loss": 0.0803,
      "step": 145
    },
    {
      "epoch": 1.1054613935969868,
      "grad_norm": 0.4313773810863495,
      "learning_rate": 0.00014474517942843175,
      "loss": 0.0447,
      "step": 146
    },
    {
      "epoch": 1.112994350282486,
      "grad_norm": 0.5009363889694214,
      "learning_rate": 0.0001440158423851392,
      "loss": 0.0415,
      "step": 147
    },
    {
      "epoch": 1.1205273069679849,
      "grad_norm": 0.8885876536369324,
      "learning_rate": 0.00014328358971623455,
      "loss": 0.0603,
      "step": 148
    },
    {
      "epoch": 1.128060263653484,
      "grad_norm": 1.5320378541946411,
      "learning_rate": 0.00014254846992640423,
      "loss": 0.0665,
      "step": 149
    },
    {
      "epoch": 1.1355932203389831,
      "grad_norm": 0.45557519793510437,
      "learning_rate": 0.00014181053171025392,
      "loss": 0.0855,
      "step": 150
    },
    {
      "epoch": 1.1355932203389831,
      "eval_loss": 0.06454955041408539,
      "eval_runtime": 37.1265,
      "eval_samples_per_second": 2.02,
      "eval_steps_per_second": 1.024,
      "step": 150
    },
    {
      "epoch": 1.143126177024482,
      "grad_norm": 0.2571490406990051,
      "learning_rate": 0.00014106982394908283,
      "loss": 0.0402,
      "step": 151
    },
    {
      "epoch": 1.1506591337099812,
      "grad_norm": 0.4380505084991455,
      "learning_rate": 0.00014032639570764593,
      "loss": 0.086,
      "step": 152
    },
    {
      "epoch": 1.1581920903954803,
      "grad_norm": 0.4073718190193176,
      "learning_rate": 0.00013958029623090378,
      "loss": 0.0491,
      "step": 153
    },
    {
      "epoch": 1.1657250470809792,
      "grad_norm": 0.40776053071022034,
      "learning_rate": 0.00013883157494076046,
      "loss": 0.072,
      "step": 154
    },
    {
      "epoch": 1.1732580037664784,
      "grad_norm": 0.31324324011802673,
      "learning_rate": 0.00013808028143279006,
      "loss": 0.0342,
      "step": 155
    },
    {
      "epoch": 1.1807909604519775,
      "grad_norm": 0.517558753490448,
      "learning_rate": 0.00013732646547295126,
      "loss": 0.0579,
      "step": 156
    },
    {
      "epoch": 1.1883239171374764,
      "grad_norm": 0.3593922257423401,
      "learning_rate": 0.00013657017699429092,
      "loss": 0.0749,
      "step": 157
    },
    {
      "epoch": 1.1958568738229756,
      "grad_norm": 0.26723143458366394,
      "learning_rate": 0.0001358114660936364,
      "loss": 0.0372,
      "step": 158
    },
    {
      "epoch": 1.2033898305084745,
      "grad_norm": 0.3371814489364624,
      "learning_rate": 0.00013505038302827723,
      "loss": 0.0486,
      "step": 159
    },
    {
      "epoch": 1.2109227871939736,
      "grad_norm": 0.3006036579608917,
      "learning_rate": 0.000134286978212636,
      "loss": 0.0882,
      "step": 160
    },
    {
      "epoch": 1.2109227871939736,
      "eval_loss": 0.06150702014565468,
      "eval_runtime": 37.13,
      "eval_samples_per_second": 2.02,
      "eval_steps_per_second": 1.023,
      "step": 160
    },
    {
      "epoch": 1.2184557438794728,
      "grad_norm": 0.3491075932979584,
      "learning_rate": 0.0001335213022149289,
      "loss": 0.0656,
      "step": 161
    },
    {
      "epoch": 1.2259887005649717,
      "grad_norm": 0.3559153378009796,
      "learning_rate": 0.00013275340575381598,
      "loss": 0.0601,
      "step": 162
    },
    {
      "epoch": 1.2335216572504708,
      "grad_norm": 0.41236844658851624,
      "learning_rate": 0.00013198333969504175,
      "loss": 0.0383,
      "step": 163
    },
    {
      "epoch": 1.24105461393597,
      "grad_norm": 0.3909653425216675,
      "learning_rate": 0.00013121115504806553,
      "loss": 0.1066,
      "step": 164
    },
    {
      "epoch": 1.2485875706214689,
      "grad_norm": 0.2600908875465393,
      "learning_rate": 0.0001304369029626828,
      "loss": 0.0361,
      "step": 165
    },
    {
      "epoch": 1.256120527306968,
      "grad_norm": 0.27978697419166565,
      "learning_rate": 0.00012966063472563685,
      "loss": 0.0301,
      "step": 166
    },
    {
      "epoch": 1.2636534839924671,
      "grad_norm": 0.3649253249168396,
      "learning_rate": 0.00012888240175722162,
      "loss": 0.0508,
      "step": 167
    },
    {
      "epoch": 1.271186440677966,
      "grad_norm": 0.34710630774497986,
      "learning_rate": 0.0001281022556078756,
      "loss": 0.0573,
      "step": 168
    },
    {
      "epoch": 1.2787193973634652,
      "grad_norm": 0.3954513669013977,
      "learning_rate": 0.0001273202479547671,
      "loss": 0.0708,
      "step": 169
    },
    {
      "epoch": 1.286252354048964,
      "grad_norm": 0.3171145021915436,
      "learning_rate": 0.00012653643059837107,
      "loss": 0.0835,
      "step": 170
    },
    {
      "epoch": 1.286252354048964,
      "eval_loss": 0.06033060699701309,
      "eval_runtime": 37.0879,
      "eval_samples_per_second": 2.022,
      "eval_steps_per_second": 1.025,
      "step": 170
    },
    {
      "epoch": 1.2937853107344632,
      "grad_norm": 0.3680741786956787,
      "learning_rate": 0.00012575085545903794,
      "loss": 0.077,
      "step": 171
    },
    {
      "epoch": 1.3013182674199624,
      "grad_norm": 0.3026699423789978,
      "learning_rate": 0.00012496357457355422,
      "loss": 0.0778,
      "step": 172
    },
    {
      "epoch": 1.3088512241054615,
      "grad_norm": 0.28971561789512634,
      "learning_rate": 0.00012417464009169583,
      "loss": 0.05,
      "step": 173
    },
    {
      "epoch": 1.3163841807909604,
      "grad_norm": 0.4369751513004303,
      "learning_rate": 0.0001233841042727734,
      "loss": 0.0755,
      "step": 174
    },
    {
      "epoch": 1.3239171374764596,
      "grad_norm": 0.2916516661643982,
      "learning_rate": 0.00012259201948217077,
      "loss": 0.0538,
      "step": 175
    },
    {
      "epoch": 1.3314500941619585,
      "grad_norm": 0.6259362697601318,
      "learning_rate": 0.00012179843818787624,
      "loss": 0.0878,
      "step": 176
    },
    {
      "epoch": 1.3389830508474576,
      "grad_norm": 0.2717919647693634,
      "learning_rate": 0.00012100341295700702,
      "loss": 0.0545,
      "step": 177
    },
    {
      "epoch": 1.3465160075329567,
      "grad_norm": 0.47408613562583923,
      "learning_rate": 0.00012020699645232721,
      "loss": 0.0969,
      "step": 178
    },
    {
      "epoch": 1.3540489642184557,
      "grad_norm": 0.2807871997356415,
      "learning_rate": 0.00011940924142875947,
      "loss": 0.0328,
      "step": 179
    },
    {
      "epoch": 1.3615819209039548,
      "grad_norm": 0.4400388300418854,
      "learning_rate": 0.0001186102007298904,
      "loss": 0.0585,
      "step": 180
    },
    {
      "epoch": 1.3615819209039548,
      "eval_loss": 0.05832603573799133,
      "eval_runtime": 37.1319,
      "eval_samples_per_second": 2.02,
      "eval_steps_per_second": 1.023,
      "step": 180
    },
    {
      "epoch": 1.369114877589454,
      "grad_norm": 0.38300102949142456,
      "learning_rate": 0.00011780992728447018,
      "loss": 0.0655,
      "step": 181
    },
    {
      "epoch": 1.3766478342749529,
      "grad_norm": 0.39059555530548096,
      "learning_rate": 0.00011700847410290667,
      "loss": 0.0617,
      "step": 182
    },
    {
      "epoch": 1.384180790960452,
      "grad_norm": 0.36025285720825195,
      "learning_rate": 0.00011620589427375375,
      "loss": 0.1054,
      "step": 183
    },
    {
      "epoch": 1.3917137476459511,
      "grad_norm": 0.24352721869945526,
      "learning_rate": 0.00011540224096019494,
      "loss": 0.0298,
      "step": 184
    },
    {
      "epoch": 1.39924670433145,
      "grad_norm": 0.2885790169239044,
      "learning_rate": 0.00011459756739652175,
      "loss": 0.0696,
      "step": 185
    },
    {
      "epoch": 1.4067796610169492,
      "grad_norm": 0.2957116961479187,
      "learning_rate": 0.0001137919268846074,
      "loss": 0.0449,
      "step": 186
    },
    {
      "epoch": 1.414312617702448,
      "grad_norm": 0.32375454902648926,
      "learning_rate": 0.0001129853727903762,
      "loss": 0.0535,
      "step": 187
    },
    {
      "epoch": 1.4218455743879472,
      "grad_norm": 0.35646215081214905,
      "learning_rate": 0.0001121779585402684,
      "loss": 0.037,
      "step": 188
    },
    {
      "epoch": 1.4293785310734464,
      "grad_norm": 0.25164303183555603,
      "learning_rate": 0.00011136973761770136,
      "loss": 0.036,
      "step": 189
    },
    {
      "epoch": 1.4369114877589455,
      "grad_norm": 0.24905888736248016,
      "learning_rate": 0.0001105607635595266,
      "loss": 0.0344,
      "step": 190
    },
    {
      "epoch": 1.4369114877589455,
      "eval_loss": 0.05805233120918274,
      "eval_runtime": 37.1095,
      "eval_samples_per_second": 2.021,
      "eval_steps_per_second": 1.024,
      "step": 190
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 0.3525996506214142,
      "learning_rate": 0.00010975108995248378,
      "loss": 0.0576,
      "step": 191
    },
    {
      "epoch": 1.4519774011299436,
      "grad_norm": 0.2925921082496643,
      "learning_rate": 0.00010894077042965083,
      "loss": 0.0645,
      "step": 192
    },
    {
      "epoch": 1.4595103578154425,
      "grad_norm": 0.47334054112434387,
      "learning_rate": 0.00010812985866689142,
      "loss": 0.1769,
      "step": 193
    },
    {
      "epoch": 1.4670433145009416,
      "grad_norm": 0.3007245659828186,
      "learning_rate": 0.00010731840837929946,
      "loss": 0.0565,
      "step": 194
    },
    {
      "epoch": 1.4745762711864407,
      "grad_norm": 0.3107605576515198,
      "learning_rate": 0.00010650647331764079,
      "loss": 0.0504,
      "step": 195
    },
    {
      "epoch": 1.4821092278719397,
      "grad_norm": 0.3428517282009125,
      "learning_rate": 0.000105694107264793,
      "loss": 0.0749,
      "step": 196
    },
    {
      "epoch": 1.4896421845574388,
      "grad_norm": 0.3695080280303955,
      "learning_rate": 0.00010488136403218265,
      "loss": 0.0604,
      "step": 197
    },
    {
      "epoch": 1.497175141242938,
      "grad_norm": 0.33667024970054626,
      "learning_rate": 0.00010406829745622085,
      "loss": 0.0739,
      "step": 198
    },
    {
      "epoch": 1.5047080979284368,
      "grad_norm": 0.4697053134441376,
      "learning_rate": 0.00010325496139473702,
      "loss": 0.0588,
      "step": 199
    },
    {
      "epoch": 1.512241054613936,
      "grad_norm": 0.36798229813575745,
      "learning_rate": 0.00010244140972341155,
      "loss": 0.0401,
      "step": 200
    },
    {
      "epoch": 1.512241054613936,
      "eval_loss": 0.05732857435941696,
      "eval_runtime": 37.158,
      "eval_samples_per_second": 2.018,
      "eval_steps_per_second": 1.023,
      "step": 200
    },
    {
      "epoch": 1.5197740112994351,
      "grad_norm": 0.29147714376449585,
      "learning_rate": 0.00010162769633220672,
      "loss": 0.0692,
      "step": 201
    },
    {
      "epoch": 1.527306967984934,
      "grad_norm": 0.2551415264606476,
      "learning_rate": 0.00010081387512179729,
      "loss": 0.0495,
      "step": 202
    },
    {
      "epoch": 1.5348399246704332,
      "grad_norm": 0.4365129768848419,
      "learning_rate": 0.0001,
      "loss": 0.0905,
      "step": 203
    },
    {
      "epoch": 1.542372881355932,
      "grad_norm": 0.256455659866333,
      "learning_rate": 9.918612487820273e-05,
      "loss": 0.0441,
      "step": 204
    },
    {
      "epoch": 1.5499058380414312,
      "grad_norm": 0.33844852447509766,
      "learning_rate": 9.83723036677933e-05,
      "loss": 0.0517,
      "step": 205
    },
    {
      "epoch": 1.5574387947269304,
      "grad_norm": 0.28650492429733276,
      "learning_rate": 9.755859027658848e-05,
      "loss": 0.0473,
      "step": 206
    },
    {
      "epoch": 1.5649717514124295,
      "grad_norm": 0.2910935580730438,
      "learning_rate": 9.674503860526297e-05,
      "loss": 0.0501,
      "step": 207
    },
    {
      "epoch": 1.5725047080979284,
      "grad_norm": 0.49296438694000244,
      "learning_rate": 9.593170254377916e-05,
      "loss": 0.0624,
      "step": 208
    },
    {
      "epoch": 1.5800376647834273,
      "grad_norm": 0.3825702965259552,
      "learning_rate": 9.511863596781734e-05,
      "loss": 0.0768,
      "step": 209
    },
    {
      "epoch": 1.5875706214689265,
      "grad_norm": 0.2868608832359314,
      "learning_rate": 9.430589273520703e-05,
      "loss": 0.054,
      "step": 210
    },
    {
      "epoch": 1.5875706214689265,
      "eval_loss": 0.05658142268657684,
      "eval_runtime": 37.107,
      "eval_samples_per_second": 2.021,
      "eval_steps_per_second": 1.024,
      "step": 210
    },
    {
      "epoch": 1.5951035781544256,
      "grad_norm": 0.22975075244903564,
      "learning_rate": 9.349352668235925e-05,
      "loss": 0.0375,
      "step": 211
    },
    {
      "epoch": 1.6026365348399247,
      "grad_norm": 0.29614976048469543,
      "learning_rate": 9.268159162070058e-05,
      "loss": 0.0768,
      "step": 212
    },
    {
      "epoch": 1.6101694915254239,
      "grad_norm": 0.2965467870235443,
      "learning_rate": 9.18701413331086e-05,
      "loss": 0.0444,
      "step": 213
    },
    {
      "epoch": 1.6177024482109228,
      "grad_norm": 0.3394235670566559,
      "learning_rate": 9.10592295703492e-05,
      "loss": 0.0549,
      "step": 214
    },
    {
      "epoch": 1.6252354048964217,
      "grad_norm": 0.3029539883136749,
      "learning_rate": 9.024891004751626e-05,
      "loss": 0.0451,
      "step": 215
    },
    {
      "epoch": 1.6327683615819208,
      "grad_norm": 0.28490352630615234,
      "learning_rate": 8.943923644047342e-05,
      "loss": 0.0272,
      "step": 216
    },
    {
      "epoch": 1.64030131826742,
      "grad_norm": 0.3418651819229126,
      "learning_rate": 8.863026238229868e-05,
      "loss": 0.1127,
      "step": 217
    },
    {
      "epoch": 1.6478342749529191,
      "grad_norm": 0.32494044303894043,
      "learning_rate": 8.782204145973162e-05,
      "loss": 0.0976,
      "step": 218
    },
    {
      "epoch": 1.655367231638418,
      "grad_norm": 0.5956616997718811,
      "learning_rate": 8.701462720962381e-05,
      "loss": 0.0509,
      "step": 219
    },
    {
      "epoch": 1.6629001883239172,
      "grad_norm": 0.35732752084732056,
      "learning_rate": 8.620807311539259e-05,
      "loss": 0.1967,
      "step": 220
    },
    {
      "epoch": 1.6629001883239172,
      "eval_loss": 0.055175162851810455,
      "eval_runtime": 37.1192,
      "eval_samples_per_second": 2.021,
      "eval_steps_per_second": 1.024,
      "step": 220
    },
    {
      "epoch": 1.670433145009416,
      "grad_norm": 0.4732244610786438,
      "learning_rate": 8.540243260347826e-05,
      "loss": 0.0693,
      "step": 221
    },
    {
      "epoch": 1.6779661016949152,
      "grad_norm": 0.27817562222480774,
      "learning_rate": 8.45977590398051e-05,
      "loss": 0.0616,
      "step": 222
    },
    {
      "epoch": 1.6854990583804144,
      "grad_norm": 0.28534531593322754,
      "learning_rate": 8.379410572624628e-05,
      "loss": 0.0392,
      "step": 223
    },
    {
      "epoch": 1.6930320150659135,
      "grad_norm": 0.20350764691829681,
      "learning_rate": 8.299152589709336e-05,
      "loss": 0.0348,
      "step": 224
    },
    {
      "epoch": 1.7005649717514124,
      "grad_norm": 0.22657251358032227,
      "learning_rate": 8.219007271552983e-05,
      "loss": 0.0393,
      "step": 225
    },
    {
      "epoch": 1.7080979284369113,
      "grad_norm": 0.3810754418373108,
      "learning_rate": 8.138979927010964e-05,
      "loss": 0.0661,
      "step": 226
    },
    {
      "epoch": 1.7156308851224105,
      "grad_norm": 0.23370787501335144,
      "learning_rate": 8.059075857124056e-05,
      "loss": 0.0519,
      "step": 227
    },
    {
      "epoch": 1.7231638418079096,
      "grad_norm": 0.2558518648147583,
      "learning_rate": 7.97930035476728e-05,
      "loss": 0.0419,
      "step": 228
    },
    {
      "epoch": 1.7306967984934087,
      "grad_norm": 0.24495276808738708,
      "learning_rate": 7.899658704299301e-05,
      "loss": 0.0768,
      "step": 229
    },
    {
      "epoch": 1.7382297551789079,
      "grad_norm": 0.31314679980278015,
      "learning_rate": 7.820156181212379e-05,
      "loss": 0.0987,
      "step": 230
    },
    {
      "epoch": 1.7382297551789079,
      "eval_loss": 0.055335018783807755,
      "eval_runtime": 37.1237,
      "eval_samples_per_second": 2.02,
      "eval_steps_per_second": 1.024,
      "step": 230
    },
    {
      "epoch": 1.7457627118644068,
      "grad_norm": 0.2738696038722992,
      "learning_rate": 7.740798051782923e-05,
      "loss": 0.1045,
      "step": 231
    },
    {
      "epoch": 1.7532956685499057,
      "grad_norm": 0.22097167372703552,
      "learning_rate": 7.66158957272266e-05,
      "loss": 0.0384,
      "step": 232
    },
    {
      "epoch": 1.7608286252354048,
      "grad_norm": 0.319528728723526,
      "learning_rate": 7.582535990830415e-05,
      "loss": 0.0513,
      "step": 233
    },
    {
      "epoch": 1.768361581920904,
      "grad_norm": 0.28677770495414734,
      "learning_rate": 7.503642542644581e-05,
      "loss": 0.0616,
      "step": 234
    },
    {
      "epoch": 1.7758945386064031,
      "grad_norm": 0.3826892673969269,
      "learning_rate": 7.424914454096211e-05,
      "loss": 0.0606,
      "step": 235
    },
    {
      "epoch": 1.783427495291902,
      "grad_norm": 0.3082129955291748,
      "learning_rate": 7.346356940162895e-05,
      "loss": 0.0566,
      "step": 236
    },
    {
      "epoch": 1.7909604519774012,
      "grad_norm": 0.25097185373306274,
      "learning_rate": 7.267975204523295e-05,
      "loss": 0.0431,
      "step": 237
    },
    {
      "epoch": 1.7984934086629,
      "grad_norm": 0.4633219838142395,
      "learning_rate": 7.189774439212442e-05,
      "loss": 0.0546,
      "step": 238
    },
    {
      "epoch": 1.8060263653483992,
      "grad_norm": 0.3444885313510895,
      "learning_rate": 7.11175982427784e-05,
      "loss": 0.1409,
      "step": 239
    },
    {
      "epoch": 1.8135593220338984,
      "grad_norm": 0.3237282633781433,
      "learning_rate": 7.033936527436318e-05,
      "loss": 0.0659,
      "step": 240
    },
    {
      "epoch": 1.8135593220338984,
      "eval_loss": 0.05429178848862648,
      "eval_runtime": 37.0949,
      "eval_samples_per_second": 2.022,
      "eval_steps_per_second": 1.024,
      "step": 240
    },
    {
      "epoch": 1.8210922787193975,
      "grad_norm": 0.2055477797985077,
      "learning_rate": 6.95630970373172e-05,
      "loss": 0.0378,
      "step": 241
    },
    {
      "epoch": 1.8286252354048964,
      "grad_norm": 0.27016931772232056,
      "learning_rate": 6.878884495193448e-05,
      "loss": 0.0507,
      "step": 242
    },
    {
      "epoch": 1.8361581920903953,
      "grad_norm": 0.2610904574394226,
      "learning_rate": 6.801666030495826e-05,
      "loss": 0.0389,
      "step": 243
    },
    {
      "epoch": 1.8436911487758945,
      "grad_norm": 0.2465640753507614,
      "learning_rate": 6.724659424618401e-05,
      "loss": 0.0843,
      "step": 244
    },
    {
      "epoch": 1.8512241054613936,
      "grad_norm": 0.24705246090888977,
      "learning_rate": 6.647869778507112e-05,
      "loss": 0.0493,
      "step": 245
    },
    {
      "epoch": 1.8587570621468927,
      "grad_norm": 0.7887628078460693,
      "learning_rate": 6.571302178736404e-05,
      "loss": 0.0511,
      "step": 246
    },
    {
      "epoch": 1.8662900188323919,
      "grad_norm": 0.3609479069709778,
      "learning_rate": 6.494961697172279e-05,
      "loss": 0.0292,
      "step": 247
    },
    {
      "epoch": 1.8738229755178908,
      "grad_norm": 0.23038731515407562,
      "learning_rate": 6.418853390636364e-05,
      "loss": 0.0361,
      "step": 248
    },
    {
      "epoch": 1.8813559322033897,
      "grad_norm": 0.3310745060443878,
      "learning_rate": 6.342982300570912e-05,
      "loss": 0.103,
      "step": 249
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.2912939786911011,
      "learning_rate": 6.267353452704876e-05,
      "loss": 0.0391,
      "step": 250
    },
    {
      "epoch": 1.8888888888888888,
      "eval_loss": 0.05423182249069214,
      "eval_runtime": 37.1201,
      "eval_samples_per_second": 2.02,
      "eval_steps_per_second": 1.024,
      "step": 250
    },
    {
      "epoch": 1.896421845574388,
      "grad_norm": 0.3608611822128296,
      "learning_rate": 6.191971856720997e-05,
      "loss": 0.0474,
      "step": 251
    },
    {
      "epoch": 1.9039548022598871,
      "grad_norm": 0.2649577856063843,
      "learning_rate": 6.116842505923955e-05,
      "loss": 0.0352,
      "step": 252
    },
    {
      "epoch": 1.911487758945386,
      "grad_norm": 1.2930629253387451,
      "learning_rate": 6.0419703769096235e-05,
      "loss": 0.0672,
      "step": 253
    },
    {
      "epoch": 1.9190207156308852,
      "grad_norm": 0.4104057252407074,
      "learning_rate": 5.967360429235407e-05,
      "loss": 0.07,
      "step": 254
    },
    {
      "epoch": 1.926553672316384,
      "grad_norm": 0.375598281621933,
      "learning_rate": 5.893017605091717e-05,
      "loss": 0.0904,
      "step": 255
    },
    {
      "epoch": 1.9340866290018832,
      "grad_norm": 0.20128563046455383,
      "learning_rate": 5.818946828974607e-05,
      "loss": 0.0288,
      "step": 256
    },
    {
      "epoch": 1.9416195856873824,
      "grad_norm": 0.37956199049949646,
      "learning_rate": 5.7451530073595785e-05,
      "loss": 0.0575,
      "step": 257
    },
    {
      "epoch": 1.9491525423728815,
      "grad_norm": 0.40059077739715576,
      "learning_rate": 5.671641028376546e-05,
      "loss": 0.0586,
      "step": 258
    },
    {
      "epoch": 1.9566854990583804,
      "grad_norm": 0.3582189381122589,
      "learning_rate": 5.5984157614860845e-05,
      "loss": 0.0682,
      "step": 259
    },
    {
      "epoch": 1.9642184557438793,
      "grad_norm": 0.3279203474521637,
      "learning_rate": 5.5254820571568325e-05,
      "loss": 0.0953,
      "step": 260
    },
    {
      "epoch": 1.9642184557438793,
      "eval_loss": 0.05431414395570755,
      "eval_runtime": 37.1186,
      "eval_samples_per_second": 2.021,
      "eval_steps_per_second": 1.024,
      "step": 260
    },
    {
      "epoch": 1.9717514124293785,
      "grad_norm": 0.27801838517189026,
      "learning_rate": 5.4528447465442334e-05,
      "loss": 0.0383,
      "step": 261
    },
    {
      "epoch": 1.9792843691148776,
      "grad_norm": 0.26640596985816956,
      "learning_rate": 5.3805086411704985e-05,
      "loss": 0.0624,
      "step": 262
    },
    {
      "epoch": 1.9868173258003767,
      "grad_norm": 0.3783319294452667,
      "learning_rate": 5.3084785326058925e-05,
      "loss": 0.0739,
      "step": 263
    },
    {
      "epoch": 1.9943502824858759,
      "grad_norm": 0.2667982280254364,
      "learning_rate": 5.236759192151336e-05,
      "loss": 0.04,
      "step": 264
    },
    {
      "epoch": 2.007532956685499,
      "grad_norm": 0.9414636492729187,
      "learning_rate": 5.165355370522358e-05,
      "loss": 0.1447,
      "step": 265
    },
    {
      "epoch": 2.0150659133709983,
      "grad_norm": 0.3006013035774231,
      "learning_rate": 5.0942717975344035e-05,
      "loss": 0.0482,
      "step": 266
    },
    {
      "epoch": 2.022598870056497,
      "grad_norm": 0.20572024583816528,
      "learning_rate": 5.02351318178953e-05,
      "loss": 0.0329,
      "step": 267
    },
    {
      "epoch": 2.030131826741996,
      "grad_norm": 0.2508287727832794,
      "learning_rate": 4.953084210364508e-05,
      "loss": 0.0352,
      "step": 268
    },
    {
      "epoch": 2.0376647834274952,
      "grad_norm": 0.2693123519420624,
      "learning_rate": 4.882989548500349e-05,
      "loss": 0.0408,
      "step": 269
    },
    {
      "epoch": 2.0451977401129944,
      "grad_norm": 0.3008269965648651,
      "learning_rate": 4.813233839293265e-05,
      "loss": 0.0362,
      "step": 270
    },
    {
      "epoch": 2.0451977401129944,
      "eval_loss": 0.05344419553875923,
      "eval_runtime": 37.1115,
      "eval_samples_per_second": 2.021,
      "eval_steps_per_second": 1.024,
      "step": 270
    },
    {
      "epoch": 2.0451977401129944,
      "step": 270,
      "total_flos": 2.136820048293888e+16,
      "train_loss": 0.2946822406862069,
      "train_runtime": 4276.0328,
      "train_samples_per_second": 0.745,
      "train_steps_per_second": 0.093
    }
  ],
  "logging_steps": 1,
  "max_steps": 396,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.001
      },
      "attributes": {
        "early_stopping_patience_counter": 5
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.136820048293888e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}