{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9946777054997042, "eval_steps": 500, "global_step": 1266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02365464222353637, "grad_norm": 1.5665423397587683, "learning_rate": 5e-06, "loss": 0.8891, "step": 10 }, { "epoch": 0.04730928444707274, "grad_norm": 1.0589767378232773, "learning_rate": 5e-06, "loss": 0.7949, "step": 20 }, { "epoch": 0.0709639266706091, "grad_norm": 0.8838983419870171, "learning_rate": 5e-06, "loss": 0.7599, "step": 30 }, { "epoch": 0.09461856889414548, "grad_norm": 0.9501620192903069, "learning_rate": 5e-06, "loss": 0.7391, "step": 40 }, { "epoch": 0.11827321111768184, "grad_norm": 0.7634636002964827, "learning_rate": 5e-06, "loss": 0.7272, "step": 50 }, { "epoch": 0.1419278533412182, "grad_norm": 0.6753406016779676, "learning_rate": 5e-06, "loss": 0.7159, "step": 60 }, { "epoch": 0.16558249556475457, "grad_norm": 0.6614325908691252, "learning_rate": 5e-06, "loss": 0.7118, "step": 70 }, { "epoch": 0.18923713778829096, "grad_norm": 0.7719184686061289, "learning_rate": 5e-06, "loss": 0.7019, "step": 80 }, { "epoch": 0.21289178001182732, "grad_norm": 0.7524231413666251, "learning_rate": 5e-06, "loss": 0.6872, "step": 90 }, { "epoch": 0.23654642223536368, "grad_norm": 0.8110229180711027, "learning_rate": 5e-06, "loss": 0.6853, "step": 100 }, { "epoch": 0.26020106445890007, "grad_norm": 0.4971482406443161, "learning_rate": 5e-06, "loss": 0.6909, "step": 110 }, { "epoch": 0.2838557066824364, "grad_norm": 0.9464051864732146, "learning_rate": 5e-06, "loss": 0.6782, "step": 120 }, { "epoch": 0.3075103489059728, "grad_norm": 0.5071907565316945, "learning_rate": 5e-06, "loss": 0.6744, "step": 130 }, { "epoch": 0.33116499112950915, "grad_norm": 0.9224817650868851, "learning_rate": 5e-06, "loss": 0.6828, "step": 140 }, { "epoch": 0.35481963335304556, "grad_norm": 0.8885554899363501, "learning_rate": 5e-06, "loss": 0.6661, "step": 150 }, { "epoch": 0.3784742755765819, "grad_norm": 0.4781037463107871, "learning_rate": 5e-06, "loss": 0.6701, "step": 160 }, { "epoch": 0.4021289178001183, "grad_norm": 0.6023227514998295, "learning_rate": 5e-06, "loss": 0.6677, "step": 170 }, { "epoch": 0.42578356002365464, "grad_norm": 0.5603961021476538, "learning_rate": 5e-06, "loss": 0.6742, "step": 180 }, { "epoch": 0.449438202247191, "grad_norm": 0.4984868681428728, "learning_rate": 5e-06, "loss": 0.6711, "step": 190 }, { "epoch": 0.47309284447072736, "grad_norm": 0.6312594557804502, "learning_rate": 5e-06, "loss": 0.6689, "step": 200 }, { "epoch": 0.4967474866942638, "grad_norm": 0.9550758404721197, "learning_rate": 5e-06, "loss": 0.6707, "step": 210 }, { "epoch": 0.5204021289178001, "grad_norm": 0.6169594889049308, "learning_rate": 5e-06, "loss": 0.6699, "step": 220 }, { "epoch": 0.5440567711413364, "grad_norm": 0.46636130254439206, "learning_rate": 5e-06, "loss": 0.6615, "step": 230 }, { "epoch": 0.5677114133648729, "grad_norm": 0.5215033819505257, "learning_rate": 5e-06, "loss": 0.6657, "step": 240 }, { "epoch": 0.5913660555884093, "grad_norm": 0.5111456784576656, "learning_rate": 5e-06, "loss": 0.6633, "step": 250 }, { "epoch": 0.6150206978119456, "grad_norm": 0.517604760100503, "learning_rate": 5e-06, "loss": 0.6531, "step": 260 }, { "epoch": 0.638675340035482, "grad_norm": 0.6357411311158894, "learning_rate": 5e-06, "loss": 0.6652, "step": 270 }, { "epoch": 0.6623299822590183, "grad_norm": 0.6476086780436829, "learning_rate": 5e-06, "loss": 0.6676, "step": 280 }, { "epoch": 0.6859846244825547, "grad_norm": 0.46209126279187906, "learning_rate": 5e-06, "loss": 0.6601, "step": 290 }, { "epoch": 0.7096392667060911, "grad_norm": 0.521950153282793, "learning_rate": 5e-06, "loss": 0.6554, "step": 300 }, { "epoch": 0.7332939089296274, "grad_norm": 0.47211201942387604, "learning_rate": 5e-06, "loss": 0.6617, "step": 310 }, { "epoch": 0.7569485511531638, "grad_norm": 0.5341122235375386, "learning_rate": 5e-06, "loss": 0.6563, "step": 320 }, { "epoch": 0.7806031933767001, "grad_norm": 0.5708931772664904, "learning_rate": 5e-06, "loss": 0.654, "step": 330 }, { "epoch": 0.8042578356002366, "grad_norm": 0.6195560307499095, "learning_rate": 5e-06, "loss": 0.6543, "step": 340 }, { "epoch": 0.8279124778237729, "grad_norm": 0.5625110791293699, "learning_rate": 5e-06, "loss": 0.6621, "step": 350 }, { "epoch": 0.8515671200473093, "grad_norm": 0.4510061091403496, "learning_rate": 5e-06, "loss": 0.6546, "step": 360 }, { "epoch": 0.8752217622708457, "grad_norm": 0.46670984016010486, "learning_rate": 5e-06, "loss": 0.6576, "step": 370 }, { "epoch": 0.898876404494382, "grad_norm": 0.5044563574335732, "learning_rate": 5e-06, "loss": 0.6489, "step": 380 }, { "epoch": 0.9225310467179184, "grad_norm": 0.6276485740632245, "learning_rate": 5e-06, "loss": 0.6548, "step": 390 }, { "epoch": 0.9461856889414547, "grad_norm": 0.4849232290323015, "learning_rate": 5e-06, "loss": 0.6473, "step": 400 }, { "epoch": 0.9698403311649911, "grad_norm": 0.47066212467927304, "learning_rate": 5e-06, "loss": 0.6504, "step": 410 }, { "epoch": 0.9934949733885275, "grad_norm": 0.5498376915549745, "learning_rate": 5e-06, "loss": 0.6489, "step": 420 }, { "epoch": 0.9982259018332348, "eval_loss": 0.6508141756057739, "eval_runtime": 226.6795, "eval_samples_per_second": 50.243, "eval_steps_per_second": 0.393, "step": 422 }, { "epoch": 1.0171496156120639, "grad_norm": 0.5216981625298186, "learning_rate": 5e-06, "loss": 0.6225, "step": 430 }, { "epoch": 1.0408042578356003, "grad_norm": 0.5287278292361843, "learning_rate": 5e-06, "loss": 0.6077, "step": 440 }, { "epoch": 1.0644589000591367, "grad_norm": 0.7304842680236713, "learning_rate": 5e-06, "loss": 0.6058, "step": 450 }, { "epoch": 1.0881135422826729, "grad_norm": 0.5644902109246774, "learning_rate": 5e-06, "loss": 0.6138, "step": 460 }, { "epoch": 1.1117681845062093, "grad_norm": 0.4753652134651053, "learning_rate": 5e-06, "loss": 0.6129, "step": 470 }, { "epoch": 1.1354228267297457, "grad_norm": 0.7165345582019033, "learning_rate": 5e-06, "loss": 0.6125, "step": 480 }, { "epoch": 1.1590774689532821, "grad_norm": 0.4641812019487026, "learning_rate": 5e-06, "loss": 0.6106, "step": 490 }, { "epoch": 1.1827321111768185, "grad_norm": 0.4846409340358244, "learning_rate": 5e-06, "loss": 0.6065, "step": 500 }, { "epoch": 1.2063867534003547, "grad_norm": 0.5624538054911821, "learning_rate": 5e-06, "loss": 0.6058, "step": 510 }, { "epoch": 1.2300413956238911, "grad_norm": 0.5069531857984526, "learning_rate": 5e-06, "loss": 0.6114, "step": 520 }, { "epoch": 1.2536960378474276, "grad_norm": 0.5605749208808489, "learning_rate": 5e-06, "loss": 0.6117, "step": 530 }, { "epoch": 1.277350680070964, "grad_norm": 0.5081677690669225, "learning_rate": 5e-06, "loss": 0.6133, "step": 540 }, { "epoch": 1.3010053222945004, "grad_norm": 0.5493923133675747, "learning_rate": 5e-06, "loss": 0.6101, "step": 550 }, { "epoch": 1.3246599645180366, "grad_norm": 0.44898345952125707, "learning_rate": 5e-06, "loss": 0.6096, "step": 560 }, { "epoch": 1.348314606741573, "grad_norm": 0.6655768726116028, "learning_rate": 5e-06, "loss": 0.6094, "step": 570 }, { "epoch": 1.3719692489651094, "grad_norm": 0.4649052455972333, "learning_rate": 5e-06, "loss": 0.614, "step": 580 }, { "epoch": 1.3956238911886458, "grad_norm": 0.43539888089438844, "learning_rate": 5e-06, "loss": 0.6138, "step": 590 }, { "epoch": 1.4192785334121822, "grad_norm": 0.5826398325628406, "learning_rate": 5e-06, "loss": 0.601, "step": 600 }, { "epoch": 1.4429331756357184, "grad_norm": 0.4128719638682357, "learning_rate": 5e-06, "loss": 0.6022, "step": 610 }, { "epoch": 1.4665878178592548, "grad_norm": 0.45529767094349805, "learning_rate": 5e-06, "loss": 0.6052, "step": 620 }, { "epoch": 1.4902424600827913, "grad_norm": 0.4330632579708222, "learning_rate": 5e-06, "loss": 0.6156, "step": 630 }, { "epoch": 1.5138971023063275, "grad_norm": 0.44938881612128456, "learning_rate": 5e-06, "loss": 0.6152, "step": 640 }, { "epoch": 1.537551744529864, "grad_norm": 0.46628155511438146, "learning_rate": 5e-06, "loss": 0.6041, "step": 650 }, { "epoch": 1.5612063867534003, "grad_norm": 0.5843372770531808, "learning_rate": 5e-06, "loss": 0.6091, "step": 660 }, { "epoch": 1.5848610289769367, "grad_norm": 0.5865805864061058, "learning_rate": 5e-06, "loss": 0.6055, "step": 670 }, { "epoch": 1.6085156712004731, "grad_norm": 0.6180152508932449, "learning_rate": 5e-06, "loss": 0.6098, "step": 680 }, { "epoch": 1.6321703134240093, "grad_norm": 0.5250880775614019, "learning_rate": 5e-06, "loss": 0.6035, "step": 690 }, { "epoch": 1.655824955647546, "grad_norm": 0.478978441549023, "learning_rate": 5e-06, "loss": 0.6107, "step": 700 }, { "epoch": 1.6794795978710821, "grad_norm": 0.5161048108398052, "learning_rate": 5e-06, "loss": 0.6106, "step": 710 }, { "epoch": 1.7031342400946186, "grad_norm": 0.5444636992966723, "learning_rate": 5e-06, "loss": 0.6123, "step": 720 }, { "epoch": 1.726788882318155, "grad_norm": 0.49014325863511293, "learning_rate": 5e-06, "loss": 0.6131, "step": 730 }, { "epoch": 1.7504435245416912, "grad_norm": 0.6313993012312799, "learning_rate": 5e-06, "loss": 0.6059, "step": 740 }, { "epoch": 1.7740981667652278, "grad_norm": 0.5158091820023666, "learning_rate": 5e-06, "loss": 0.6116, "step": 750 }, { "epoch": 1.797752808988764, "grad_norm": 0.47029237511942096, "learning_rate": 5e-06, "loss": 0.617, "step": 760 }, { "epoch": 1.8214074512123004, "grad_norm": 0.4280088668307211, "learning_rate": 5e-06, "loss": 0.6167, "step": 770 }, { "epoch": 1.8450620934358368, "grad_norm": 0.42280798454326274, "learning_rate": 5e-06, "loss": 0.6094, "step": 780 }, { "epoch": 1.868716735659373, "grad_norm": 0.5015010241124669, "learning_rate": 5e-06, "loss": 0.606, "step": 790 }, { "epoch": 1.8923713778829097, "grad_norm": 0.4647527250682549, "learning_rate": 5e-06, "loss": 0.614, "step": 800 }, { "epoch": 1.9160260201064458, "grad_norm": 0.45335597096154956, "learning_rate": 5e-06, "loss": 0.6142, "step": 810 }, { "epoch": 1.9396806623299823, "grad_norm": 0.4377674877634198, "learning_rate": 5e-06, "loss": 0.6009, "step": 820 }, { "epoch": 1.9633353045535187, "grad_norm": 0.4922578419992401, "learning_rate": 5e-06, "loss": 0.6108, "step": 830 }, { "epoch": 1.9869899467770549, "grad_norm": 0.4440378004846697, "learning_rate": 5e-06, "loss": 0.5988, "step": 840 }, { "epoch": 1.9988172678888232, "eval_loss": 0.6402984857559204, "eval_runtime": 227.8867, "eval_samples_per_second": 49.977, "eval_steps_per_second": 0.391, "step": 845 }, { "epoch": 2.0106445890005915, "grad_norm": 0.6249254373352294, "learning_rate": 5e-06, "loss": 0.5864, "step": 850 }, { "epoch": 2.0342992312241277, "grad_norm": 0.5229366411844095, "learning_rate": 5e-06, "loss": 0.5668, "step": 860 }, { "epoch": 2.057953873447664, "grad_norm": 0.5273129182521392, "learning_rate": 5e-06, "loss": 0.569, "step": 870 }, { "epoch": 2.0816085156712005, "grad_norm": 0.5807006714823286, "learning_rate": 5e-06, "loss": 0.5605, "step": 880 }, { "epoch": 2.1052631578947367, "grad_norm": 0.5316074498783832, "learning_rate": 5e-06, "loss": 0.5644, "step": 890 }, { "epoch": 2.1289178001182734, "grad_norm": 0.571954438502547, "learning_rate": 5e-06, "loss": 0.5674, "step": 900 }, { "epoch": 2.1525724423418096, "grad_norm": 0.4503797360191138, "learning_rate": 5e-06, "loss": 0.5625, "step": 910 }, { "epoch": 2.1762270845653457, "grad_norm": 0.49334190713186854, "learning_rate": 5e-06, "loss": 0.5648, "step": 920 }, { "epoch": 2.1998817267888824, "grad_norm": 0.5335672716582398, "learning_rate": 5e-06, "loss": 0.5647, "step": 930 }, { "epoch": 2.2235363690124186, "grad_norm": 0.5933317390318844, "learning_rate": 5e-06, "loss": 0.5662, "step": 940 }, { "epoch": 2.247191011235955, "grad_norm": 0.5891028799938252, "learning_rate": 5e-06, "loss": 0.5716, "step": 950 }, { "epoch": 2.2708456534594914, "grad_norm": 0.5459106423210626, "learning_rate": 5e-06, "loss": 0.5651, "step": 960 }, { "epoch": 2.2945002956830276, "grad_norm": 0.5290952299819603, "learning_rate": 5e-06, "loss": 0.568, "step": 970 }, { "epoch": 2.3181549379065642, "grad_norm": 0.612560828315047, "learning_rate": 5e-06, "loss": 0.5695, "step": 980 }, { "epoch": 2.3418095801301004, "grad_norm": 0.4941237525745503, "learning_rate": 5e-06, "loss": 0.5689, "step": 990 }, { "epoch": 2.365464222353637, "grad_norm": 0.5650925849128444, "learning_rate": 5e-06, "loss": 0.5688, "step": 1000 }, { "epoch": 2.3891188645771733, "grad_norm": 0.4960493342953518, "learning_rate": 5e-06, "loss": 0.568, "step": 1010 }, { "epoch": 2.4127735068007095, "grad_norm": 0.5675556976300212, "learning_rate": 5e-06, "loss": 0.5656, "step": 1020 }, { "epoch": 2.436428149024246, "grad_norm": 0.4679420975237042, "learning_rate": 5e-06, "loss": 0.5658, "step": 1030 }, { "epoch": 2.4600827912477823, "grad_norm": 0.5934681961848346, "learning_rate": 5e-06, "loss": 0.5699, "step": 1040 }, { "epoch": 2.483737433471319, "grad_norm": 0.4994164056392902, "learning_rate": 5e-06, "loss": 0.5705, "step": 1050 }, { "epoch": 2.507392075694855, "grad_norm": 0.553203563454583, "learning_rate": 5e-06, "loss": 0.5692, "step": 1060 }, { "epoch": 2.5310467179183913, "grad_norm": 0.4828644604519897, "learning_rate": 5e-06, "loss": 0.5741, "step": 1070 }, { "epoch": 2.554701360141928, "grad_norm": 0.49982783159481065, "learning_rate": 5e-06, "loss": 0.5729, "step": 1080 }, { "epoch": 2.578356002365464, "grad_norm": 0.5028480314612812, "learning_rate": 5e-06, "loss": 0.5688, "step": 1090 }, { "epoch": 2.6020106445890008, "grad_norm": 0.47037814252850013, "learning_rate": 5e-06, "loss": 0.5693, "step": 1100 }, { "epoch": 2.625665286812537, "grad_norm": 0.5106941109306207, "learning_rate": 5e-06, "loss": 0.5671, "step": 1110 }, { "epoch": 2.649319929036073, "grad_norm": 0.6215547951000225, "learning_rate": 5e-06, "loss": 0.5792, "step": 1120 }, { "epoch": 2.67297457125961, "grad_norm": 0.5708225359288293, "learning_rate": 5e-06, "loss": 0.5667, "step": 1130 }, { "epoch": 2.696629213483146, "grad_norm": 0.4694908632163934, "learning_rate": 5e-06, "loss": 0.5666, "step": 1140 }, { "epoch": 2.7202838557066826, "grad_norm": 0.5487886870493401, "learning_rate": 5e-06, "loss": 0.569, "step": 1150 }, { "epoch": 2.743938497930219, "grad_norm": 0.5178726800202738, "learning_rate": 5e-06, "loss": 0.5755, "step": 1160 }, { "epoch": 2.767593140153755, "grad_norm": 0.572400968003655, "learning_rate": 5e-06, "loss": 0.5687, "step": 1170 }, { "epoch": 2.7912477823772917, "grad_norm": 0.4505102109858482, "learning_rate": 5e-06, "loss": 0.5694, "step": 1180 }, { "epoch": 2.814902424600828, "grad_norm": 0.4721423964551259, "learning_rate": 5e-06, "loss": 0.5711, "step": 1190 }, { "epoch": 2.8385570668243645, "grad_norm": 0.45313707571101325, "learning_rate": 5e-06, "loss": 0.568, "step": 1200 }, { "epoch": 2.8622117090479007, "grad_norm": 0.4905572612804298, "learning_rate": 5e-06, "loss": 0.5708, "step": 1210 }, { "epoch": 2.885866351271437, "grad_norm": 0.4488283221129483, "learning_rate": 5e-06, "loss": 0.5644, "step": 1220 }, { "epoch": 2.9095209934949735, "grad_norm": 0.5086777707270197, "learning_rate": 5e-06, "loss": 0.57, "step": 1230 }, { "epoch": 2.9331756357185097, "grad_norm": 0.5014035211864805, "learning_rate": 5e-06, "loss": 0.5719, "step": 1240 }, { "epoch": 2.9568302779420463, "grad_norm": 0.46427442721882733, "learning_rate": 5e-06, "loss": 0.5812, "step": 1250 }, { "epoch": 2.9804849201655825, "grad_norm": 0.4933592342573123, "learning_rate": 5e-06, "loss": 0.5728, "step": 1260 }, { "epoch": 2.9946777054997042, "eval_loss": 0.6413019895553589, "eval_runtime": 227.2584, "eval_samples_per_second": 50.115, "eval_steps_per_second": 0.392, "step": 1266 }, { "epoch": 2.9946777054997042, "step": 1266, "total_flos": 2120178393415680.0, "train_loss": 0.6201754979801027, "train_runtime": 38082.1567, "train_samples_per_second": 17.045, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1266, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2120178393415680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }