{ "best_metric": 0.08053209632635117, "best_model_checkpoint": "saves/Breeze-7B-FC-v1_0-15-12-2024\\checkpoint-1700", "epoch": 0.3381150088257962, "eval_steps": 100, "global_step": 1700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019889118166223304, "grad_norm": 0.6233828067779541, "learning_rate": 0.0004, "loss": 0.7108, "step": 10 }, { "epoch": 0.003977823633244661, "grad_norm": 0.4662277102470398, "learning_rate": 0.0004, "loss": 0.729, "step": 20 }, { "epoch": 0.005966735449866992, "grad_norm": 0.5307815670967102, "learning_rate": 0.0004, "loss": 0.6936, "step": 30 }, { "epoch": 0.007955647266489322, "grad_norm": 0.5890870690345764, "learning_rate": 0.0004, "loss": 0.7229, "step": 40 }, { "epoch": 0.009944559083111653, "grad_norm": 0.5515501499176025, "learning_rate": 0.0004, "loss": 0.6669, "step": 50 }, { "epoch": 0.011933470899733983, "grad_norm": 0.5326377153396606, "learning_rate": 0.0004, "loss": 0.6564, "step": 60 }, { "epoch": 0.013922382716356313, "grad_norm": 0.5636907815933228, "learning_rate": 0.0004, "loss": 0.6773, "step": 70 }, { "epoch": 0.015911294532978643, "grad_norm": 0.49359801411628723, "learning_rate": 0.0004, "loss": 0.6516, "step": 80 }, { "epoch": 0.017900206349600975, "grad_norm": 0.4739631116390228, "learning_rate": 0.0004, "loss": 0.6594, "step": 90 }, { "epoch": 0.019889118166223307, "grad_norm": 0.5182624459266663, "learning_rate": 0.0004, "loss": 0.69, "step": 100 }, { "epoch": 0.019889118166223307, "eval_loss": 0.08401616662740707, "eval_runtime": 30.1147, "eval_samples_per_second": 2.69, "eval_steps_per_second": 1.361, "step": 100 }, { "epoch": 0.021878029982845635, "grad_norm": 0.5577639937400818, "learning_rate": 0.0004, "loss": 0.6813, "step": 110 }, { "epoch": 0.023866941799467967, "grad_norm": 0.5308650732040405, "learning_rate": 0.0004, "loss": 0.689, "step": 120 }, { "epoch": 0.025855853616090295, "grad_norm": 0.705682635307312, "learning_rate": 0.0004, "loss": 0.6846, "step": 130 }, { "epoch": 0.027844765432712627, "grad_norm": 0.5250112414360046, "learning_rate": 0.0004, "loss": 0.6699, "step": 140 }, { "epoch": 0.02983367724933496, "grad_norm": 0.5885920524597168, "learning_rate": 0.0004, "loss": 0.6733, "step": 150 }, { "epoch": 0.03182258906595729, "grad_norm": 0.5392662286758423, "learning_rate": 0.0004, "loss": 0.673, "step": 160 }, { "epoch": 0.03381150088257962, "grad_norm": 0.576032280921936, "learning_rate": 0.0004, "loss": 0.6934, "step": 170 }, { "epoch": 0.03580041269920195, "grad_norm": 0.5046477317810059, "learning_rate": 0.0004, "loss": 0.6535, "step": 180 }, { "epoch": 0.03778932451582428, "grad_norm": 0.628962516784668, "learning_rate": 0.0004, "loss": 0.6758, "step": 190 }, { "epoch": 0.03977823633244661, "grad_norm": 0.5312318801879883, "learning_rate": 0.0004, "loss": 0.6839, "step": 200 }, { "epoch": 0.03977823633244661, "eval_loss": 0.08386518061161041, "eval_runtime": 30.1222, "eval_samples_per_second": 2.689, "eval_steps_per_second": 1.361, "step": 200 }, { "epoch": 0.04176714814906894, "grad_norm": 0.5689460635185242, "learning_rate": 0.0004, "loss": 0.6616, "step": 210 }, { "epoch": 0.04375605996569127, "grad_norm": 0.5760230422019958, "learning_rate": 0.0004, "loss": 0.67, "step": 220 }, { "epoch": 0.045744971782313605, "grad_norm": 0.6037033200263977, "learning_rate": 0.0004, "loss": 0.6608, "step": 230 }, { "epoch": 0.04773388359893593, "grad_norm": 0.5196573138237, "learning_rate": 0.0004, "loss": 0.672, "step": 240 }, { "epoch": 0.04972279541555826, "grad_norm": 0.5766464471817017, "learning_rate": 0.0004, "loss": 0.6651, "step": 250 }, { "epoch": 0.05171170723218059, "grad_norm": 0.5686795711517334, "learning_rate": 0.0004, "loss": 0.6351, "step": 260 }, { "epoch": 0.053700619048802925, "grad_norm": 0.5607637763023376, "learning_rate": 0.0004, "loss": 0.659, "step": 270 }, { "epoch": 0.05568953086542525, "grad_norm": 0.545982837677002, "learning_rate": 0.0004, "loss": 0.6297, "step": 280 }, { "epoch": 0.05767844268204758, "grad_norm": 0.6047331690788269, "learning_rate": 0.0004, "loss": 0.6541, "step": 290 }, { "epoch": 0.05966735449866992, "grad_norm": 0.5864997506141663, "learning_rate": 0.0004, "loss": 0.6831, "step": 300 }, { "epoch": 0.05966735449866992, "eval_loss": 0.0832422524690628, "eval_runtime": 30.1361, "eval_samples_per_second": 2.688, "eval_steps_per_second": 1.36, "step": 300 }, { "epoch": 0.061656266315292245, "grad_norm": 0.6031671166419983, "learning_rate": 0.0004, "loss": 0.6441, "step": 310 }, { "epoch": 0.06364517813191457, "grad_norm": 0.5433733463287354, "learning_rate": 0.0004, "loss": 0.6836, "step": 320 }, { "epoch": 0.06563408994853691, "grad_norm": 0.5863742232322693, "learning_rate": 0.0004, "loss": 0.6804, "step": 330 }, { "epoch": 0.06762300176515924, "grad_norm": 0.7768782377243042, "learning_rate": 0.0004, "loss": 0.7014, "step": 340 }, { "epoch": 0.06961191358178156, "grad_norm": 0.548475444316864, "learning_rate": 0.0004, "loss": 0.6545, "step": 350 }, { "epoch": 0.0716008253984039, "grad_norm": 0.7511247992515564, "learning_rate": 0.0004, "loss": 0.6478, "step": 360 }, { "epoch": 0.07358973721502624, "grad_norm": 0.6464333534240723, "learning_rate": 0.0004, "loss": 0.6762, "step": 370 }, { "epoch": 0.07557864903164856, "grad_norm": 0.6280458569526672, "learning_rate": 0.0004, "loss": 0.6723, "step": 380 }, { "epoch": 0.07756756084827089, "grad_norm": 0.6138644218444824, "learning_rate": 0.0004, "loss": 0.6834, "step": 390 }, { "epoch": 0.07955647266489323, "grad_norm": 0.6612856984138489, "learning_rate": 0.0004, "loss": 0.662, "step": 400 }, { "epoch": 0.07955647266489323, "eval_loss": 0.08379939943552017, "eval_runtime": 30.1324, "eval_samples_per_second": 2.688, "eval_steps_per_second": 1.361, "step": 400 }, { "epoch": 0.08154538448151555, "grad_norm": 0.5658541917800903, "learning_rate": 0.0004, "loss": 0.6507, "step": 410 }, { "epoch": 0.08353429629813788, "grad_norm": 0.5861065983772278, "learning_rate": 0.0004, "loss": 0.6565, "step": 420 }, { "epoch": 0.08552320811476022, "grad_norm": 0.6580057144165039, "learning_rate": 0.0004, "loss": 0.6961, "step": 430 }, { "epoch": 0.08751211993138254, "grad_norm": 0.6456801295280457, "learning_rate": 0.0004, "loss": 0.6667, "step": 440 }, { "epoch": 0.08950103174800488, "grad_norm": 0.6603415608406067, "learning_rate": 0.0004, "loss": 0.6589, "step": 450 }, { "epoch": 0.09148994356462721, "grad_norm": 0.6744834184646606, "learning_rate": 0.0004, "loss": 0.6791, "step": 460 }, { "epoch": 0.09347885538124953, "grad_norm": 0.6219160556793213, "learning_rate": 0.0004, "loss": 0.6748, "step": 470 }, { "epoch": 0.09546776719787187, "grad_norm": 0.6373462677001953, "learning_rate": 0.0004, "loss": 0.654, "step": 480 }, { "epoch": 0.09745667901449419, "grad_norm": 0.7271533608436584, "learning_rate": 0.0004, "loss": 0.651, "step": 490 }, { "epoch": 0.09944559083111652, "grad_norm": 0.6483666300773621, "learning_rate": 0.0004, "loss": 0.6728, "step": 500 }, { "epoch": 0.09944559083111652, "eval_loss": 0.08319716155529022, "eval_runtime": 30.1252, "eval_samples_per_second": 2.689, "eval_steps_per_second": 1.361, "step": 500 }, { "epoch": 0.10143450264773886, "grad_norm": 0.5817425847053528, "learning_rate": 0.0004, "loss": 0.6571, "step": 510 }, { "epoch": 0.10342341446436118, "grad_norm": 0.6830428838729858, "learning_rate": 0.0004, "loss": 0.6618, "step": 520 }, { "epoch": 0.10541232628098351, "grad_norm": 0.5775642395019531, "learning_rate": 0.0004, "loss": 0.6181, "step": 530 }, { "epoch": 0.10740123809760585, "grad_norm": 0.6007582545280457, "learning_rate": 0.0004, "loss": 0.6839, "step": 540 }, { "epoch": 0.10939014991422817, "grad_norm": 0.648262083530426, "learning_rate": 0.0004, "loss": 0.6643, "step": 550 }, { "epoch": 0.1113790617308505, "grad_norm": 0.6632483601570129, "learning_rate": 0.0004, "loss": 0.652, "step": 560 }, { "epoch": 0.11336797354747284, "grad_norm": 0.5972626805305481, "learning_rate": 0.0004, "loss": 0.6938, "step": 570 }, { "epoch": 0.11535688536409516, "grad_norm": 0.6052406430244446, "learning_rate": 0.0004, "loss": 0.6301, "step": 580 }, { "epoch": 0.1173457971807175, "grad_norm": 0.5875466465950012, "learning_rate": 0.0004, "loss": 0.6614, "step": 590 }, { "epoch": 0.11933470899733983, "grad_norm": 0.7067976593971252, "learning_rate": 0.0004, "loss": 0.647, "step": 600 }, { "epoch": 0.11933470899733983, "eval_loss": 0.08319241553544998, "eval_runtime": 30.1031, "eval_samples_per_second": 2.691, "eval_steps_per_second": 1.362, "step": 600 }, { "epoch": 0.12132362081396215, "grad_norm": 0.66518634557724, "learning_rate": 0.0004, "loss": 0.693, "step": 610 }, { "epoch": 0.12331253263058449, "grad_norm": 0.6959813833236694, "learning_rate": 0.0004, "loss": 0.6747, "step": 620 }, { "epoch": 0.1253014444472068, "grad_norm": 0.935105562210083, "learning_rate": 0.0004, "loss": 0.6686, "step": 630 }, { "epoch": 0.12729035626382915, "grad_norm": 0.713768720626831, "learning_rate": 0.0004, "loss": 0.6707, "step": 640 }, { "epoch": 0.12927926808045148, "grad_norm": 0.7059699296951294, "learning_rate": 0.0004, "loss": 0.7255, "step": 650 }, { "epoch": 0.13126817989707382, "grad_norm": 0.588306725025177, "learning_rate": 0.0004, "loss": 0.6689, "step": 660 }, { "epoch": 0.13325709171369615, "grad_norm": 0.6097111105918884, "learning_rate": 0.0004, "loss": 0.6612, "step": 670 }, { "epoch": 0.1352460035303185, "grad_norm": 0.642393946647644, "learning_rate": 0.0004, "loss": 0.6743, "step": 680 }, { "epoch": 0.1372349153469408, "grad_norm": 0.7600162625312805, "learning_rate": 0.0004, "loss": 0.6768, "step": 690 }, { "epoch": 0.13922382716356313, "grad_norm": 0.7193499207496643, "learning_rate": 0.0004, "loss": 0.6559, "step": 700 }, { "epoch": 0.13922382716356313, "eval_loss": 0.08392166346311569, "eval_runtime": 30.1253, "eval_samples_per_second": 2.689, "eval_steps_per_second": 1.361, "step": 700 }, { "epoch": 0.14121273898018546, "grad_norm": 0.6542356014251709, "learning_rate": 0.0004, "loss": 0.6775, "step": 710 }, { "epoch": 0.1432016507968078, "grad_norm": 0.629941999912262, "learning_rate": 0.0004, "loss": 0.6223, "step": 720 }, { "epoch": 0.14519056261343014, "grad_norm": 0.6493385434150696, "learning_rate": 0.0004, "loss": 0.6894, "step": 730 }, { "epoch": 0.14717947443005247, "grad_norm": 0.7201417684555054, "learning_rate": 0.0004, "loss": 0.6858, "step": 740 }, { "epoch": 0.14916838624667478, "grad_norm": 0.6775253415107727, "learning_rate": 0.0004, "loss": 0.6628, "step": 750 }, { "epoch": 0.1511572980632971, "grad_norm": 0.6149548292160034, "learning_rate": 0.0004, "loss": 0.6993, "step": 760 }, { "epoch": 0.15314620987991945, "grad_norm": 0.6627587080001831, "learning_rate": 0.0004, "loss": 0.646, "step": 770 }, { "epoch": 0.15513512169654178, "grad_norm": 0.6701797842979431, "learning_rate": 0.0004, "loss": 0.6927, "step": 780 }, { "epoch": 0.15712403351316412, "grad_norm": 0.678193211555481, "learning_rate": 0.0004, "loss": 0.6454, "step": 790 }, { "epoch": 0.15911294532978645, "grad_norm": 0.6337444186210632, "learning_rate": 0.0004, "loss": 0.6723, "step": 800 }, { "epoch": 0.15911294532978645, "eval_loss": 0.08426591008901596, "eval_runtime": 30.0445, "eval_samples_per_second": 2.696, "eval_steps_per_second": 1.365, "step": 800 }, { "epoch": 0.16110185714640876, "grad_norm": 0.654451310634613, "learning_rate": 0.0004, "loss": 0.6799, "step": 810 }, { "epoch": 0.1630907689630311, "grad_norm": 0.6989086866378784, "learning_rate": 0.0004, "loss": 0.6694, "step": 820 }, { "epoch": 0.16507968077965343, "grad_norm": 0.6176579594612122, "learning_rate": 0.0004, "loss": 0.6225, "step": 830 }, { "epoch": 0.16706859259627577, "grad_norm": 0.6462605595588684, "learning_rate": 0.0004, "loss": 0.6584, "step": 840 }, { "epoch": 0.1690575044128981, "grad_norm": 0.7809733748435974, "learning_rate": 0.0004, "loss": 0.6601, "step": 850 }, { "epoch": 0.17104641622952044, "grad_norm": 0.7143774032592773, "learning_rate": 0.0004, "loss": 0.6655, "step": 860 }, { "epoch": 0.17303532804614274, "grad_norm": 0.7137865424156189, "learning_rate": 0.0004, "loss": 0.6849, "step": 870 }, { "epoch": 0.17502423986276508, "grad_norm": 0.715568482875824, "learning_rate": 0.0004, "loss": 0.6408, "step": 880 }, { "epoch": 0.17701315167938741, "grad_norm": 0.59111088514328, "learning_rate": 0.0004, "loss": 0.6772, "step": 890 }, { "epoch": 0.17900206349600975, "grad_norm": 0.7616696357727051, "learning_rate": 0.0004, "loss": 0.6671, "step": 900 }, { "epoch": 0.17900206349600975, "eval_loss": 0.084267757833004, "eval_runtime": 30.1275, "eval_samples_per_second": 2.689, "eval_steps_per_second": 1.361, "step": 900 }, { "epoch": 0.18099097531263209, "grad_norm": 0.6685693860054016, "learning_rate": 0.0004, "loss": 0.6792, "step": 910 }, { "epoch": 0.18297988712925442, "grad_norm": 0.7320526838302612, "learning_rate": 0.0004, "loss": 0.6435, "step": 920 }, { "epoch": 0.18496879894587673, "grad_norm": 0.6541480422019958, "learning_rate": 0.0004, "loss": 0.6649, "step": 930 }, { "epoch": 0.18695771076249906, "grad_norm": 0.6433006525039673, "learning_rate": 0.0004, "loss": 0.6677, "step": 940 }, { "epoch": 0.1889466225791214, "grad_norm": 0.6296941041946411, "learning_rate": 0.0004, "loss": 0.6334, "step": 950 }, { "epoch": 0.19093553439574373, "grad_norm": 0.7856689691543579, "learning_rate": 0.0004, "loss": 0.7039, "step": 960 }, { "epoch": 0.19292444621236607, "grad_norm": 0.6200475096702576, "learning_rate": 0.0004, "loss": 0.6602, "step": 970 }, { "epoch": 0.19491335802898838, "grad_norm": 0.6970551609992981, "learning_rate": 0.0004, "loss": 0.6704, "step": 980 }, { "epoch": 0.1969022698456107, "grad_norm": 0.6525449752807617, "learning_rate": 0.0004, "loss": 0.6721, "step": 990 }, { "epoch": 0.19889118166223305, "grad_norm": 0.7507511377334595, "learning_rate": 0.0004, "loss": 0.6829, "step": 1000 }, { "epoch": 0.19889118166223305, "eval_loss": 0.0835462287068367, "eval_runtime": 30.0507, "eval_samples_per_second": 2.695, "eval_steps_per_second": 1.364, "step": 1000 }, { "epoch": 0.20088009347885538, "grad_norm": 0.7378696203231812, "learning_rate": 0.0004, "loss": 0.6567, "step": 1010 }, { "epoch": 0.20286900529547772, "grad_norm": 0.6451396346092224, "learning_rate": 0.0004, "loss": 0.6502, "step": 1020 }, { "epoch": 0.20485791711210005, "grad_norm": 0.6342566013336182, "learning_rate": 0.0004, "loss": 0.6477, "step": 1030 }, { "epoch": 0.20684682892872236, "grad_norm": 0.7209526896476746, "learning_rate": 0.0004, "loss": 0.6661, "step": 1040 }, { "epoch": 0.2088357407453447, "grad_norm": 0.6808329820632935, "learning_rate": 0.0004, "loss": 0.6515, "step": 1050 }, { "epoch": 0.21082465256196703, "grad_norm": 0.6738231182098389, "learning_rate": 0.0004, "loss": 0.626, "step": 1060 }, { "epoch": 0.21281356437858936, "grad_norm": 0.6646963357925415, "learning_rate": 0.0004, "loss": 0.6714, "step": 1070 }, { "epoch": 0.2148024761952117, "grad_norm": 0.6372888088226318, "learning_rate": 0.0004, "loss": 0.6768, "step": 1080 }, { "epoch": 0.21679138801183404, "grad_norm": 0.7138890624046326, "learning_rate": 0.0004, "loss": 0.6949, "step": 1090 }, { "epoch": 0.21878029982845634, "grad_norm": 0.7249679565429688, "learning_rate": 0.0004, "loss": 0.6928, "step": 1100 }, { "epoch": 0.21878029982845634, "eval_loss": 0.08422956615686417, "eval_runtime": 30.1032, "eval_samples_per_second": 2.691, "eval_steps_per_second": 1.362, "step": 1100 }, { "epoch": 0.22076921164507868, "grad_norm": 0.6382346153259277, "learning_rate": 0.0004, "loss": 0.6619, "step": 1110 }, { "epoch": 0.222758123461701, "grad_norm": 0.6400596499443054, "learning_rate": 0.0004, "loss": 0.7103, "step": 1120 }, { "epoch": 0.22474703527832335, "grad_norm": 0.6994810700416565, "learning_rate": 0.0004, "loss": 0.6647, "step": 1130 }, { "epoch": 0.22673594709494568, "grad_norm": 0.76835036277771, "learning_rate": 0.0004, "loss": 0.6923, "step": 1140 }, { "epoch": 0.22872485891156802, "grad_norm": 0.6603644490242004, "learning_rate": 0.0004, "loss": 0.673, "step": 1150 }, { "epoch": 0.23071377072819033, "grad_norm": 0.7264408469200134, "learning_rate": 0.0004, "loss": 0.6828, "step": 1160 }, { "epoch": 0.23270268254481266, "grad_norm": 0.7072731852531433, "learning_rate": 0.0004, "loss": 0.6831, "step": 1170 }, { "epoch": 0.234691594361435, "grad_norm": 0.6494096517562866, "learning_rate": 0.0004, "loss": 0.6659, "step": 1180 }, { "epoch": 0.23668050617805733, "grad_norm": 0.6463006734848022, "learning_rate": 0.0004, "loss": 0.7155, "step": 1190 }, { "epoch": 0.23866941799467967, "grad_norm": 0.6508920192718506, "learning_rate": 0.0004, "loss": 0.6563, "step": 1200 }, { "epoch": 0.23866941799467967, "eval_loss": 0.08387701213359833, "eval_runtime": 30.0299, "eval_samples_per_second": 2.697, "eval_steps_per_second": 1.365, "step": 1200 }, { "epoch": 0.240658329811302, "grad_norm": 0.6701735258102417, "learning_rate": 0.0004, "loss": 0.6765, "step": 1210 }, { "epoch": 0.2426472416279243, "grad_norm": 0.5798119902610779, "learning_rate": 0.0004, "loss": 0.6501, "step": 1220 }, { "epoch": 0.24463615344454664, "grad_norm": 0.7210298776626587, "learning_rate": 0.0004, "loss": 0.6576, "step": 1230 }, { "epoch": 0.24662506526116898, "grad_norm": 0.7448759078979492, "learning_rate": 0.0004, "loss": 0.6918, "step": 1240 }, { "epoch": 0.24861397707779131, "grad_norm": 0.6556337475776672, "learning_rate": 0.0004, "loss": 0.6526, "step": 1250 }, { "epoch": 0.2506028888944136, "grad_norm": 0.6584301590919495, "learning_rate": 0.0004, "loss": 0.6736, "step": 1260 }, { "epoch": 0.252591800711036, "grad_norm": 0.6725241541862488, "learning_rate": 0.0004, "loss": 0.6772, "step": 1270 }, { "epoch": 0.2545807125276583, "grad_norm": 0.7188987731933594, "learning_rate": 0.0004, "loss": 0.6629, "step": 1280 }, { "epoch": 0.25656962434428066, "grad_norm": 0.8247680068016052, "learning_rate": 0.0004, "loss": 0.6844, "step": 1290 }, { "epoch": 0.25855853616090296, "grad_norm": 0.6960418224334717, "learning_rate": 0.0004, "loss": 0.6411, "step": 1300 }, { "epoch": 0.25855853616090296, "eval_loss": 0.0839666873216629, "eval_runtime": 30.0597, "eval_samples_per_second": 2.695, "eval_steps_per_second": 1.364, "step": 1300 }, { "epoch": 0.26054744797752527, "grad_norm": 0.6796591877937317, "learning_rate": 0.0004, "loss": 0.6707, "step": 1310 }, { "epoch": 0.26253635979414763, "grad_norm": 0.6542907357215881, "learning_rate": 0.0004, "loss": 0.6691, "step": 1320 }, { "epoch": 0.26452527161076994, "grad_norm": 0.6826708912849426, "learning_rate": 0.0004, "loss": 0.6696, "step": 1330 }, { "epoch": 0.2665141834273923, "grad_norm": 0.7088764905929565, "learning_rate": 0.0004, "loss": 0.6817, "step": 1340 }, { "epoch": 0.2685030952440146, "grad_norm": 0.7242617607116699, "learning_rate": 0.0004, "loss": 0.6759, "step": 1350 }, { "epoch": 0.270492007060637, "grad_norm": 0.7008711099624634, "learning_rate": 0.0004, "loss": 0.6592, "step": 1360 }, { "epoch": 0.2724809188772593, "grad_norm": 0.6489241719245911, "learning_rate": 0.0004, "loss": 0.6637, "step": 1370 }, { "epoch": 0.2744698306938816, "grad_norm": 0.7217922806739807, "learning_rate": 0.0004, "loss": 0.6539, "step": 1380 }, { "epoch": 0.27645874251050395, "grad_norm": 0.8037365078926086, "learning_rate": 0.0004, "loss": 0.6994, "step": 1390 }, { "epoch": 0.27844765432712626, "grad_norm": 0.659654974937439, "learning_rate": 0.0004, "loss": 0.6673, "step": 1400 }, { "epoch": 0.27844765432712626, "eval_loss": 0.0840681791305542, "eval_runtime": 30.0353, "eval_samples_per_second": 2.697, "eval_steps_per_second": 1.365, "step": 1400 }, { "epoch": 0.2804365661437486, "grad_norm": 0.7238272428512573, "learning_rate": 0.0004, "loss": 0.6974, "step": 1410 }, { "epoch": 0.28242547796037093, "grad_norm": 0.6564947366714478, "learning_rate": 0.0004, "loss": 0.6687, "step": 1420 }, { "epoch": 0.28441438977699324, "grad_norm": 0.7392669916152954, "learning_rate": 0.0004, "loss": 0.6546, "step": 1430 }, { "epoch": 0.2864033015936156, "grad_norm": 0.7504440546035767, "learning_rate": 0.0004, "loss": 0.6737, "step": 1440 }, { "epoch": 0.2883922134102379, "grad_norm": 0.7336270213127136, "learning_rate": 0.0004, "loss": 0.6803, "step": 1450 }, { "epoch": 0.29038112522686027, "grad_norm": 1.862186312675476, "learning_rate": 0.0004, "loss": 0.658, "step": 1460 }, { "epoch": 0.2923700370434826, "grad_norm": 0.7425276637077332, "learning_rate": 0.0004, "loss": 0.6383, "step": 1470 }, { "epoch": 0.29435894886010494, "grad_norm": 0.6604830622673035, "learning_rate": 0.0004, "loss": 0.7077, "step": 1480 }, { "epoch": 0.29634786067672725, "grad_norm": 0.7673712968826294, "learning_rate": 0.0004, "loss": 0.6722, "step": 1490 }, { "epoch": 0.29833677249334956, "grad_norm": 0.7889634370803833, "learning_rate": 0.0004, "loss": 0.6909, "step": 1500 }, { "epoch": 0.29833677249334956, "eval_loss": 0.08427305519580841, "eval_runtime": 30.0826, "eval_samples_per_second": 2.693, "eval_steps_per_second": 1.363, "step": 1500 }, { "epoch": 0.3003256843099719, "grad_norm": 0.8077505826950073, "learning_rate": 0.0004, "loss": 0.6976, "step": 1510 }, { "epoch": 0.3023145961265942, "grad_norm": 0.6837480068206787, "learning_rate": 0.0004, "loss": 0.675, "step": 1520 }, { "epoch": 0.3043035079432166, "grad_norm": 0.6629063487052917, "learning_rate": 0.0004, "loss": 0.6396, "step": 1530 }, { "epoch": 0.3062924197598389, "grad_norm": 0.7187213897705078, "learning_rate": 0.0004, "loss": 0.6898, "step": 1540 }, { "epoch": 0.3082813315764612, "grad_norm": 0.7269571423530579, "learning_rate": 0.0004, "loss": 0.7106, "step": 1550 }, { "epoch": 0.31027024339308357, "grad_norm": 0.6767787337303162, "learning_rate": 0.0004, "loss": 0.6836, "step": 1560 }, { "epoch": 0.3122591552097059, "grad_norm": 0.7046016454696655, "learning_rate": 0.0004, "loss": 0.7129, "step": 1570 }, { "epoch": 0.31424806702632824, "grad_norm": 0.6218843460083008, "learning_rate": 0.0004, "loss": 0.7129, "step": 1580 }, { "epoch": 0.31623697884295054, "grad_norm": 0.7410914897918701, "learning_rate": 0.0004, "loss": 0.6616, "step": 1590 }, { "epoch": 0.3182258906595729, "grad_norm": 0.5945529937744141, "learning_rate": 0.0004, "loss": 0.6878, "step": 1600 }, { "epoch": 0.3182258906595729, "eval_loss": 0.08433911204338074, "eval_runtime": 30.0269, "eval_samples_per_second": 2.698, "eval_steps_per_second": 1.365, "step": 1600 }, { "epoch": 0.3202148024761952, "grad_norm": 0.6479379534721375, "learning_rate": 4e-05, "loss": 0.6708, "step": 1610 }, { "epoch": 0.3222037142928175, "grad_norm": 0.6011672616004944, "learning_rate": 4e-05, "loss": 0.6511, "step": 1620 }, { "epoch": 0.3241926261094399, "grad_norm": 0.6457736492156982, "learning_rate": 4e-05, "loss": 0.6601, "step": 1630 }, { "epoch": 0.3261815379260622, "grad_norm": 0.6549608707427979, "learning_rate": 4e-05, "loss": 0.6643, "step": 1640 }, { "epoch": 0.32817044974268456, "grad_norm": 0.6869723200798035, "learning_rate": 4e-05, "loss": 0.6766, "step": 1650 }, { "epoch": 0.33015936155930686, "grad_norm": 0.6526493430137634, "learning_rate": 4e-05, "loss": 0.6625, "step": 1660 }, { "epoch": 0.33214827337592917, "grad_norm": 0.6106629967689514, "learning_rate": 4e-05, "loss": 0.6412, "step": 1670 }, { "epoch": 0.33413718519255153, "grad_norm": 0.6620826125144958, "learning_rate": 4e-05, "loss": 0.6314, "step": 1680 }, { "epoch": 0.33612609700917384, "grad_norm": 0.6487278938293457, "learning_rate": 4e-05, "loss": 0.643, "step": 1690 }, { "epoch": 0.3381150088257962, "grad_norm": 0.7703284621238708, "learning_rate": 4e-05, "loss": 0.6298, "step": 1700 }, { "epoch": 0.3381150088257962, "eval_loss": 0.08053209632635117, "eval_runtime": 30.0784, "eval_samples_per_second": 2.693, "eval_steps_per_second": 1.363, "step": 1700 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.460398769660756e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }