{ "best_metric": 0.038124363869428635, "best_model_checkpoint": "saves/psy-course/Llama3-OpenBioLLM-8B/train/fold6/checkpoint-1900", "epoch": 4.995305164319249, "eval_steps": 50, "global_step": 3325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015023474178403756, "grad_norm": 6.414540767669678, "learning_rate": 3.003003003003003e-06, "loss": 1.3292, "step": 10 }, { "epoch": 0.03004694835680751, "grad_norm": 5.740021228790283, "learning_rate": 6.006006006006006e-06, "loss": 1.3844, "step": 20 }, { "epoch": 0.04507042253521127, "grad_norm": 4.104672431945801, "learning_rate": 9.00900900900901e-06, "loss": 1.1953, "step": 30 }, { "epoch": 0.06009389671361502, "grad_norm": 1.9491015672683716, "learning_rate": 1.2012012012012012e-05, "loss": 0.8062, "step": 40 }, { "epoch": 0.07511737089201878, "grad_norm": 1.8273999691009521, "learning_rate": 1.5015015015015016e-05, "loss": 0.4971, "step": 50 }, { "epoch": 0.07511737089201878, "eval_loss": 0.3236617147922516, "eval_runtime": 215.6614, "eval_samples_per_second": 5.49, "eval_steps_per_second": 5.49, "step": 50 }, { "epoch": 0.09014084507042254, "grad_norm": 1.789884328842163, "learning_rate": 1.801801801801802e-05, "loss": 0.3049, "step": 60 }, { "epoch": 0.10516431924882629, "grad_norm": 1.026201844215393, "learning_rate": 2.102102102102102e-05, "loss": 0.1795, "step": 70 }, { "epoch": 0.12018779342723004, "grad_norm": 1.4471508264541626, "learning_rate": 2.4024024024024024e-05, "loss": 0.1482, "step": 80 }, { "epoch": 0.1352112676056338, "grad_norm": 1.0525811910629272, "learning_rate": 2.702702702702703e-05, "loss": 0.0928, "step": 90 }, { "epoch": 0.15023474178403756, "grad_norm": 1.223236322402954, "learning_rate": 3.0030030030030033e-05, "loss": 0.1179, "step": 100 }, { "epoch": 0.15023474178403756, "eval_loss": 0.09419180452823639, "eval_runtime": 224.392, "eval_samples_per_second": 5.276, "eval_steps_per_second": 5.276, "step": 100 }, { "epoch": 0.1652582159624413, "grad_norm": 1.3412930965423584, "learning_rate": 3.3033033033033035e-05, "loss": 0.1185, "step": 110 }, { "epoch": 0.18028169014084508, "grad_norm": 1.0879250764846802, "learning_rate": 3.603603603603604e-05, "loss": 0.0985, "step": 120 }, { "epoch": 0.19530516431924883, "grad_norm": 0.9736179709434509, "learning_rate": 3.903903903903904e-05, "loss": 0.0828, "step": 130 }, { "epoch": 0.21032863849765257, "grad_norm": 0.6209188103675842, "learning_rate": 4.204204204204204e-05, "loss": 0.1053, "step": 140 }, { "epoch": 0.22535211267605634, "grad_norm": 0.6934787631034851, "learning_rate": 4.5045045045045046e-05, "loss": 0.0891, "step": 150 }, { "epoch": 0.22535211267605634, "eval_loss": 0.07970932126045227, "eval_runtime": 225.0802, "eval_samples_per_second": 5.26, "eval_steps_per_second": 5.26, "step": 150 }, { "epoch": 0.2403755868544601, "grad_norm": 1.273955225944519, "learning_rate": 4.804804804804805e-05, "loss": 0.0755, "step": 160 }, { "epoch": 0.25539906103286386, "grad_norm": 1.2673016786575317, "learning_rate": 5.105105105105106e-05, "loss": 0.0569, "step": 170 }, { "epoch": 0.2704225352112676, "grad_norm": 1.060423493385315, "learning_rate": 5.405405405405406e-05, "loss": 0.0744, "step": 180 }, { "epoch": 0.28544600938967135, "grad_norm": 1.2299424409866333, "learning_rate": 5.705705705705706e-05, "loss": 0.0684, "step": 190 }, { "epoch": 0.3004694835680751, "grad_norm": 0.6046801805496216, "learning_rate": 6.0060060060060066e-05, "loss": 0.0539, "step": 200 }, { "epoch": 0.3004694835680751, "eval_loss": 0.05980731174349785, "eval_runtime": 226.7678, "eval_samples_per_second": 5.221, "eval_steps_per_second": 5.221, "step": 200 }, { "epoch": 0.3154929577464789, "grad_norm": 0.5380669236183167, "learning_rate": 6.306306306306306e-05, "loss": 0.054, "step": 210 }, { "epoch": 0.3305164319248826, "grad_norm": 0.42053088545799255, "learning_rate": 6.606606606606607e-05, "loss": 0.0645, "step": 220 }, { "epoch": 0.3455399061032864, "grad_norm": 0.6181445717811584, "learning_rate": 6.906906906906907e-05, "loss": 0.0527, "step": 230 }, { "epoch": 0.36056338028169016, "grad_norm": 0.8342611193656921, "learning_rate": 7.207207207207208e-05, "loss": 0.0622, "step": 240 }, { "epoch": 0.3755868544600939, "grad_norm": 0.7963096499443054, "learning_rate": 7.507507507507507e-05, "loss": 0.065, "step": 250 }, { "epoch": 0.3755868544600939, "eval_loss": 0.05752342566847801, "eval_runtime": 228.7801, "eval_samples_per_second": 5.175, "eval_steps_per_second": 5.175, "step": 250 }, { "epoch": 0.39061032863849765, "grad_norm": 0.5707411170005798, "learning_rate": 7.807807807807808e-05, "loss": 0.0526, "step": 260 }, { "epoch": 0.4056338028169014, "grad_norm": 0.8613728880882263, "learning_rate": 8.108108108108109e-05, "loss": 0.0638, "step": 270 }, { "epoch": 0.42065727699530514, "grad_norm": 0.20531082153320312, "learning_rate": 8.408408408408409e-05, "loss": 0.0646, "step": 280 }, { "epoch": 0.4356807511737089, "grad_norm": 1.4829570055007935, "learning_rate": 8.70870870870871e-05, "loss": 0.0644, "step": 290 }, { "epoch": 0.4507042253521127, "grad_norm": 0.5894226431846619, "learning_rate": 9.009009009009009e-05, "loss": 0.052, "step": 300 }, { "epoch": 0.4507042253521127, "eval_loss": 0.053021036088466644, "eval_runtime": 224.341, "eval_samples_per_second": 5.278, "eval_steps_per_second": 5.278, "step": 300 }, { "epoch": 0.46572769953051646, "grad_norm": 0.3916723132133484, "learning_rate": 9.30930930930931e-05, "loss": 0.0449, "step": 310 }, { "epoch": 0.4807511737089202, "grad_norm": 0.7790238857269287, "learning_rate": 9.60960960960961e-05, "loss": 0.0598, "step": 320 }, { "epoch": 0.49577464788732395, "grad_norm": 0.6429105997085571, "learning_rate": 9.90990990990991e-05, "loss": 0.0561, "step": 330 }, { "epoch": 0.5107981220657277, "grad_norm": 0.9011141657829285, "learning_rate": 9.999864944989638e-05, "loss": 0.0683, "step": 340 }, { "epoch": 0.5258215962441315, "grad_norm": 0.3162938058376312, "learning_rate": 9.999203468625017e-05, "loss": 0.0535, "step": 350 }, { "epoch": 0.5258215962441315, "eval_loss": 0.054493363946676254, "eval_runtime": 227.3472, "eval_samples_per_second": 5.208, "eval_steps_per_second": 5.208, "step": 350 }, { "epoch": 0.5408450704225352, "grad_norm": 0.7987211346626282, "learning_rate": 9.997990837719421e-05, "loss": 0.0634, "step": 360 }, { "epoch": 0.5558685446009389, "grad_norm": 0.30378398299217224, "learning_rate": 9.996227185963554e-05, "loss": 0.0451, "step": 370 }, { "epoch": 0.5708920187793427, "grad_norm": 0.7728825807571411, "learning_rate": 9.993912707797329e-05, "loss": 0.0467, "step": 380 }, { "epoch": 0.5859154929577465, "grad_norm": 0.3161417245864868, "learning_rate": 9.99104765838842e-05, "loss": 0.0493, "step": 390 }, { "epoch": 0.6009389671361502, "grad_norm": 0.26571208238601685, "learning_rate": 9.987632353604151e-05, "loss": 0.0512, "step": 400 }, { "epoch": 0.6009389671361502, "eval_loss": 0.04660526290535927, "eval_runtime": 226.5936, "eval_samples_per_second": 5.225, "eval_steps_per_second": 5.225, "step": 400 }, { "epoch": 0.615962441314554, "grad_norm": 0.7127376794815063, "learning_rate": 9.98366716997665e-05, "loss": 0.0459, "step": 410 }, { "epoch": 0.6309859154929578, "grad_norm": 1.0153098106384277, "learning_rate": 9.979152544661354e-05, "loss": 0.0478, "step": 420 }, { "epoch": 0.6460093896713615, "grad_norm": 0.6635463833808899, "learning_rate": 9.974088975388802e-05, "loss": 0.0498, "step": 430 }, { "epoch": 0.6610328638497652, "grad_norm": 0.5931793451309204, "learning_rate": 9.968477020409766e-05, "loss": 0.0489, "step": 440 }, { "epoch": 0.676056338028169, "grad_norm": 0.5385611057281494, "learning_rate": 9.962317298433705e-05, "loss": 0.0597, "step": 450 }, { "epoch": 0.676056338028169, "eval_loss": 0.049538787454366684, "eval_runtime": 226.4382, "eval_samples_per_second": 5.229, "eval_steps_per_second": 5.229, "step": 450 }, { "epoch": 0.6910798122065728, "grad_norm": 0.31567174196243286, "learning_rate": 9.955610488560551e-05, "loss": 0.0595, "step": 460 }, { "epoch": 0.7061032863849765, "grad_norm": 0.6918370127677917, "learning_rate": 9.948357330205842e-05, "loss": 0.061, "step": 470 }, { "epoch": 0.7211267605633803, "grad_norm": 0.8577624559402466, "learning_rate": 9.940558623019201e-05, "loss": 0.032, "step": 480 }, { "epoch": 0.7361502347417841, "grad_norm": 0.6278219819068909, "learning_rate": 9.932215226796172e-05, "loss": 0.0486, "step": 490 }, { "epoch": 0.7511737089201878, "grad_norm": 0.3588387072086334, "learning_rate": 9.923328061383435e-05, "loss": 0.0509, "step": 500 }, { "epoch": 0.7511737089201878, "eval_loss": 0.046869516372680664, "eval_runtime": 228.8856, "eval_samples_per_second": 5.173, "eval_steps_per_second": 5.173, "step": 500 }, { "epoch": 0.7661971830985915, "grad_norm": 0.24121642112731934, "learning_rate": 9.913898106577393e-05, "loss": 0.049, "step": 510 }, { "epoch": 0.7812206572769953, "grad_norm": 0.3446851670742035, "learning_rate": 9.903926402016153e-05, "loss": 0.0405, "step": 520 }, { "epoch": 0.7962441314553991, "grad_norm": 0.30128219723701477, "learning_rate": 9.893414047064897e-05, "loss": 0.0452, "step": 530 }, { "epoch": 0.8112676056338028, "grad_norm": 0.4867357313632965, "learning_rate": 9.88236220069469e-05, "loss": 0.0483, "step": 540 }, { "epoch": 0.8262910798122066, "grad_norm": 0.3773770034313202, "learning_rate": 9.870772081354705e-05, "loss": 0.0624, "step": 550 }, { "epoch": 0.8262910798122066, "eval_loss": 0.04311800375580788, "eval_runtime": 225.9134, "eval_samples_per_second": 5.241, "eval_steps_per_second": 5.241, "step": 550 }, { "epoch": 0.8413145539906103, "grad_norm": 0.3748703896999359, "learning_rate": 9.858644966837878e-05, "loss": 0.0493, "step": 560 }, { "epoch": 0.856338028169014, "grad_norm": 0.19429147243499756, "learning_rate": 9.845982194140051e-05, "loss": 0.038, "step": 570 }, { "epoch": 0.8713615023474178, "grad_norm": 0.24159827828407288, "learning_rate": 9.832785159312559e-05, "loss": 0.0435, "step": 580 }, { "epoch": 0.8863849765258216, "grad_norm": 0.3861151337623596, "learning_rate": 9.819055317308317e-05, "loss": 0.0389, "step": 590 }, { "epoch": 0.9014084507042254, "grad_norm": 0.2947847545146942, "learning_rate": 9.804794181821422e-05, "loss": 0.035, "step": 600 }, { "epoch": 0.9014084507042254, "eval_loss": 0.04581260681152344, "eval_runtime": 228.7478, "eval_samples_per_second": 5.176, "eval_steps_per_second": 5.176, "step": 600 }, { "epoch": 0.9164319248826291, "grad_norm": 0.9220906496047974, "learning_rate": 9.790003325120261e-05, "loss": 0.0477, "step": 610 }, { "epoch": 0.9314553990610329, "grad_norm": 0.585889995098114, "learning_rate": 9.774684377874178e-05, "loss": 0.0565, "step": 620 }, { "epoch": 0.9464788732394366, "grad_norm": 0.16347403824329376, "learning_rate": 9.758839028973692e-05, "loss": 0.0356, "step": 630 }, { "epoch": 0.9615023474178404, "grad_norm": 0.4304580092430115, "learning_rate": 9.742469025344298e-05, "loss": 0.0467, "step": 640 }, { "epoch": 0.9765258215962441, "grad_norm": 0.2779543697834015, "learning_rate": 9.725576171753874e-05, "loss": 0.0546, "step": 650 }, { "epoch": 0.9765258215962441, "eval_loss": 0.04435986280441284, "eval_runtime": 228.5445, "eval_samples_per_second": 5.181, "eval_steps_per_second": 5.181, "step": 650 }, { "epoch": 0.9915492957746479, "grad_norm": 0.8497869372367859, "learning_rate": 9.708162330613708e-05, "loss": 0.0396, "step": 660 }, { "epoch": 1.0065727699530516, "grad_norm": 0.2351168841123581, "learning_rate": 9.690229421773167e-05, "loss": 0.0434, "step": 670 }, { "epoch": 1.0215962441314554, "grad_norm": 0.4020308256149292, "learning_rate": 9.67177942230804e-05, "loss": 0.0369, "step": 680 }, { "epoch": 1.036619718309859, "grad_norm": 0.48213866353034973, "learning_rate": 9.652814366302568e-05, "loss": 0.0409, "step": 690 }, { "epoch": 1.051643192488263, "grad_norm": 0.30210596323013306, "learning_rate": 9.633336344625185e-05, "loss": 0.0404, "step": 700 }, { "epoch": 1.051643192488263, "eval_loss": 0.04425961896777153, "eval_runtime": 230.6178, "eval_samples_per_second": 5.134, "eval_steps_per_second": 5.134, "step": 700 }, { "epoch": 1.0666666666666667, "grad_norm": 0.3107031583786011, "learning_rate": 9.61334750469801e-05, "loss": 0.029, "step": 710 }, { "epoch": 1.0816901408450703, "grad_norm": 0.5843893885612488, "learning_rate": 9.592850050260089e-05, "loss": 0.0373, "step": 720 }, { "epoch": 1.0967136150234742, "grad_norm": 0.26578277349472046, "learning_rate": 9.571846241124446e-05, "loss": 0.0328, "step": 730 }, { "epoch": 1.1117370892018779, "grad_norm": 0.22206802666187286, "learning_rate": 9.55033839292893e-05, "loss": 0.028, "step": 740 }, { "epoch": 1.1267605633802817, "grad_norm": 0.5371259450912476, "learning_rate": 9.52832887688093e-05, "loss": 0.0342, "step": 750 }, { "epoch": 1.1267605633802817, "eval_loss": 0.043438900262117386, "eval_runtime": 230.5863, "eval_samples_per_second": 5.135, "eval_steps_per_second": 5.135, "step": 750 }, { "epoch": 1.1417840375586854, "grad_norm": 0.43538734316825867, "learning_rate": 9.50582011949595e-05, "loss": 0.0391, "step": 760 }, { "epoch": 1.1568075117370893, "grad_norm": 0.23159150779247284, "learning_rate": 9.482814602330084e-05, "loss": 0.0354, "step": 770 }, { "epoch": 1.171830985915493, "grad_norm": 0.5288726091384888, "learning_rate": 9.459314861706435e-05, "loss": 0.0386, "step": 780 }, { "epoch": 1.1868544600938966, "grad_norm": 0.3396289646625519, "learning_rate": 9.435323488435488e-05, "loss": 0.0291, "step": 790 }, { "epoch": 1.2018779342723005, "grad_norm": 0.5357405543327332, "learning_rate": 9.410843127529473e-05, "loss": 0.0298, "step": 800 }, { "epoch": 1.2018779342723005, "eval_loss": 0.04284869506955147, "eval_runtime": 228.5369, "eval_samples_per_second": 5.181, "eval_steps_per_second": 5.181, "step": 800 }, { "epoch": 1.2169014084507042, "grad_norm": 0.6564673781394958, "learning_rate": 9.385876477910765e-05, "loss": 0.0371, "step": 810 }, { "epoch": 1.231924882629108, "grad_norm": 0.15312528610229492, "learning_rate": 9.360426292114314e-05, "loss": 0.0379, "step": 820 }, { "epoch": 1.2469483568075117, "grad_norm": 0.32437649369239807, "learning_rate": 9.334495375984212e-05, "loss": 0.0334, "step": 830 }, { "epoch": 1.2619718309859156, "grad_norm": 0.33815252780914307, "learning_rate": 9.30808658836432e-05, "loss": 0.0305, "step": 840 }, { "epoch": 1.2769953051643192, "grad_norm": 0.43382740020751953, "learning_rate": 9.281202840783108e-05, "loss": 0.0348, "step": 850 }, { "epoch": 1.2769953051643192, "eval_loss": 0.04067714512348175, "eval_runtime": 232.2301, "eval_samples_per_second": 5.098, "eval_steps_per_second": 5.098, "step": 850 }, { "epoch": 1.292018779342723, "grad_norm": 0.3475792109966278, "learning_rate": 9.253847097132655e-05, "loss": 0.0285, "step": 860 }, { "epoch": 1.3070422535211268, "grad_norm": 0.24043916165828705, "learning_rate": 9.226022373341882e-05, "loss": 0.0285, "step": 870 }, { "epoch": 1.3220657276995305, "grad_norm": 0.2261325865983963, "learning_rate": 9.19773173704406e-05, "loss": 0.0263, "step": 880 }, { "epoch": 1.3370892018779343, "grad_norm": 0.673026978969574, "learning_rate": 9.168978307238594e-05, "loss": 0.0371, "step": 890 }, { "epoch": 1.352112676056338, "grad_norm": 0.1485815793275833, "learning_rate": 9.13976525394717e-05, "loss": 0.0291, "step": 900 }, { "epoch": 1.352112676056338, "eval_loss": 0.0397120825946331, "eval_runtime": 229.6421, "eval_samples_per_second": 5.156, "eval_steps_per_second": 5.156, "step": 900 }, { "epoch": 1.3671361502347419, "grad_norm": 0.20742298662662506, "learning_rate": 9.110095797864263e-05, "loss": 0.0295, "step": 910 }, { "epoch": 1.3821596244131455, "grad_norm": 0.1670617312192917, "learning_rate": 9.079973210002051e-05, "loss": 0.0317, "step": 920 }, { "epoch": 1.3971830985915492, "grad_norm": 0.4337206482887268, "learning_rate": 9.049400811329807e-05, "loss": 0.044, "step": 930 }, { "epoch": 1.412206572769953, "grad_norm": 0.36409109830856323, "learning_rate": 9.01838197240775e-05, "loss": 0.0267, "step": 940 }, { "epoch": 1.4272300469483568, "grad_norm": 0.45880192518234253, "learning_rate": 8.986920113015461e-05, "loss": 0.0373, "step": 950 }, { "epoch": 1.4272300469483568, "eval_loss": 0.03974553570151329, "eval_runtime": 232.2398, "eval_samples_per_second": 5.098, "eval_steps_per_second": 5.098, "step": 950 }, { "epoch": 1.4422535211267606, "grad_norm": 0.2591331899166107, "learning_rate": 8.955018701774846e-05, "loss": 0.0277, "step": 960 }, { "epoch": 1.4572769953051643, "grad_norm": 0.20547696948051453, "learning_rate": 8.922681255767731e-05, "loss": 0.0418, "step": 970 }, { "epoch": 1.4723004694835682, "grad_norm": 0.2559107542037964, "learning_rate": 8.889911340148112e-05, "loss": 0.035, "step": 980 }, { "epoch": 1.4873239436619718, "grad_norm": 0.20421642065048218, "learning_rate": 8.856712567749095e-05, "loss": 0.0315, "step": 990 }, { "epoch": 1.5023474178403755, "grad_norm": 0.35287970304489136, "learning_rate": 8.82308859868459e-05, "loss": 0.0289, "step": 1000 }, { "epoch": 1.5023474178403755, "eval_loss": 0.04059433564543724, "eval_runtime": 232.2328, "eval_samples_per_second": 5.098, "eval_steps_per_second": 5.098, "step": 1000 }, { "epoch": 1.5173708920187794, "grad_norm": 0.3293749690055847, "learning_rate": 8.789043139945795e-05, "loss": 0.0332, "step": 1010 }, { "epoch": 1.532394366197183, "grad_norm": 0.26048192381858826, "learning_rate": 8.754579944992491e-05, "loss": 0.0419, "step": 1020 }, { "epoch": 1.5474178403755867, "grad_norm": 0.2248084545135498, "learning_rate": 8.719702813339248e-05, "loss": 0.0327, "step": 1030 }, { "epoch": 1.5624413145539906, "grad_norm": 0.29436999559402466, "learning_rate": 8.684415590136518e-05, "loss": 0.0305, "step": 1040 }, { "epoch": 1.5774647887323945, "grad_norm": 0.2553965151309967, "learning_rate": 8.648722165746722e-05, "loss": 0.0339, "step": 1050 }, { "epoch": 1.5774647887323945, "eval_loss": 0.0449674129486084, "eval_runtime": 229.1439, "eval_samples_per_second": 5.167, "eval_steps_per_second": 5.167, "step": 1050 }, { "epoch": 1.5924882629107981, "grad_norm": 0.2621479034423828, "learning_rate": 8.61262647531534e-05, "loss": 0.0273, "step": 1060 }, { "epoch": 1.6075117370892018, "grad_norm": 0.29867053031921387, "learning_rate": 8.576132498337068e-05, "loss": 0.0349, "step": 1070 }, { "epoch": 1.6225352112676057, "grad_norm": 0.5712867379188538, "learning_rate": 8.539244258217088e-05, "loss": 0.0332, "step": 1080 }, { "epoch": 1.6375586854460094, "grad_norm": 0.2706688940525055, "learning_rate": 8.501965821827485e-05, "loss": 0.0408, "step": 1090 }, { "epoch": 1.652582159624413, "grad_norm": 0.2529209554195404, "learning_rate": 8.464301299058892e-05, "loss": 0.0244, "step": 1100 }, { "epoch": 1.652582159624413, "eval_loss": 0.04117948189377785, "eval_runtime": 232.3365, "eval_samples_per_second": 5.096, "eval_steps_per_second": 5.096, "step": 1100 }, { "epoch": 1.667605633802817, "grad_norm": 0.2566877007484436, "learning_rate": 8.426254842367374e-05, "loss": 0.0296, "step": 1110 }, { "epoch": 1.6826291079812208, "grad_norm": 0.28313955664634705, "learning_rate": 8.387830646316623e-05, "loss": 0.0382, "step": 1120 }, { "epoch": 1.6976525821596244, "grad_norm": 0.5802918672561646, "learning_rate": 8.349032947115525e-05, "loss": 0.0285, "step": 1130 }, { "epoch": 1.712676056338028, "grad_norm": 0.31035321950912476, "learning_rate": 8.309866022151107e-05, "loss": 0.0422, "step": 1140 }, { "epoch": 1.727699530516432, "grad_norm": 0.37052640318870544, "learning_rate": 8.270334189516983e-05, "loss": 0.0288, "step": 1150 }, { "epoch": 1.727699530516432, "eval_loss": 0.04018314927816391, "eval_runtime": 229.361, "eval_samples_per_second": 5.162, "eval_steps_per_second": 5.162, "step": 1150 }, { "epoch": 1.7427230046948357, "grad_norm": 0.13857990503311157, "learning_rate": 8.230441807537277e-05, "loss": 0.0418, "step": 1160 }, { "epoch": 1.7577464788732393, "grad_norm": 0.20266057550907135, "learning_rate": 8.190193274286122e-05, "loss": 0.0272, "step": 1170 }, { "epoch": 1.7727699530516432, "grad_norm": 0.26995351910591125, "learning_rate": 8.149593027102789e-05, "loss": 0.0336, "step": 1180 }, { "epoch": 1.787793427230047, "grad_norm": 0.5027073621749878, "learning_rate": 8.108645542102469e-05, "loss": 0.0282, "step": 1190 }, { "epoch": 1.8028169014084507, "grad_norm": 0.3197694420814514, "learning_rate": 8.067355333682798e-05, "loss": 0.036, "step": 1200 }, { "epoch": 1.8028169014084507, "eval_loss": 0.03951646015048027, "eval_runtime": 233.3104, "eval_samples_per_second": 5.075, "eval_steps_per_second": 5.075, "step": 1200 }, { "epoch": 1.8178403755868544, "grad_norm": 0.16591861844062805, "learning_rate": 8.025726954026138e-05, "loss": 0.0257, "step": 1210 }, { "epoch": 1.8328638497652583, "grad_norm": 0.5136527419090271, "learning_rate": 7.983764992597716e-05, "loss": 0.0305, "step": 1220 }, { "epoch": 1.847887323943662, "grad_norm": 0.4316839575767517, "learning_rate": 7.94147407563964e-05, "loss": 0.0321, "step": 1230 }, { "epoch": 1.8629107981220656, "grad_norm": 0.38824647665023804, "learning_rate": 7.89885886566086e-05, "loss": 0.0352, "step": 1240 }, { "epoch": 1.8779342723004695, "grad_norm": 0.4996744990348816, "learning_rate": 7.855924060923141e-05, "loss": 0.0334, "step": 1250 }, { "epoch": 1.8779342723004695, "eval_loss": 0.039211343973875046, "eval_runtime": 231.7629, "eval_samples_per_second": 5.109, "eval_steps_per_second": 5.109, "step": 1250 }, { "epoch": 1.8929577464788734, "grad_norm": 0.23325949907302856, "learning_rate": 7.812674394923077e-05, "loss": 0.0346, "step": 1260 }, { "epoch": 1.907981220657277, "grad_norm": 0.27697038650512695, "learning_rate": 7.769114635870231e-05, "loss": 0.0317, "step": 1270 }, { "epoch": 1.9230046948356807, "grad_norm": 0.4152398407459259, "learning_rate": 7.725249586161463e-05, "loss": 0.0341, "step": 1280 }, { "epoch": 1.9380281690140846, "grad_norm": 0.20453670620918274, "learning_rate": 7.68108408185145e-05, "loss": 0.0299, "step": 1290 }, { "epoch": 1.9530516431924883, "grad_norm": 0.38925862312316895, "learning_rate": 7.636622992119536e-05, "loss": 0.0568, "step": 1300 }, { "epoch": 1.9530516431924883, "eval_loss": 0.04275708645582199, "eval_runtime": 226.0867, "eval_samples_per_second": 5.237, "eval_steps_per_second": 5.237, "step": 1300 }, { "epoch": 1.968075117370892, "grad_norm": 0.254408597946167, "learning_rate": 7.591871218732902e-05, "loss": 0.0371, "step": 1310 }, { "epoch": 1.9830985915492958, "grad_norm": 0.279060959815979, "learning_rate": 7.54683369550616e-05, "loss": 0.0443, "step": 1320 }, { "epoch": 1.9981220657276997, "grad_norm": 0.1406373232603073, "learning_rate": 7.501515387757404e-05, "loss": 0.034, "step": 1330 }, { "epoch": 2.013145539906103, "grad_norm": 0.2928116023540497, "learning_rate": 7.455921291760796e-05, "loss": 0.0376, "step": 1340 }, { "epoch": 2.028169014084507, "grad_norm": 0.10005494207143784, "learning_rate": 7.410056434195725e-05, "loss": 0.0206, "step": 1350 }, { "epoch": 2.028169014084507, "eval_loss": 0.04088796302676201, "eval_runtime": 225.6026, "eval_samples_per_second": 5.248, "eval_steps_per_second": 5.248, "step": 1350 }, { "epoch": 2.043192488262911, "grad_norm": 0.28136786818504333, "learning_rate": 7.363925871592629e-05, "loss": 0.0218, "step": 1360 }, { "epoch": 2.0582159624413148, "grad_norm": 0.15816110372543335, "learning_rate": 7.317534689775528e-05, "loss": 0.019, "step": 1370 }, { "epoch": 2.073239436619718, "grad_norm": 0.15044282376766205, "learning_rate": 7.270888003301304e-05, "loss": 0.016, "step": 1380 }, { "epoch": 2.088262910798122, "grad_norm": 0.6464121341705322, "learning_rate": 7.22399095489584e-05, "loss": 0.0205, "step": 1390 }, { "epoch": 2.103286384976526, "grad_norm": 0.21453744173049927, "learning_rate": 7.176848714887042e-05, "loss": 0.0226, "step": 1400 }, { "epoch": 2.103286384976526, "eval_loss": 0.04083973169326782, "eval_runtime": 220.6924, "eval_samples_per_second": 5.365, "eval_steps_per_second": 5.365, "step": 1400 }, { "epoch": 2.1183098591549294, "grad_norm": 0.32658934593200684, "learning_rate": 7.129466480634806e-05, "loss": 0.021, "step": 1410 }, { "epoch": 2.1333333333333333, "grad_norm": 0.5312864184379578, "learning_rate": 7.081849475958042e-05, "loss": 0.0184, "step": 1420 }, { "epoch": 2.148356807511737, "grad_norm": 0.30401676893234253, "learning_rate": 7.034002950558723e-05, "loss": 0.0181, "step": 1430 }, { "epoch": 2.1633802816901406, "grad_norm": 0.2417522370815277, "learning_rate": 6.985932179443144e-05, "loss": 0.0225, "step": 1440 }, { "epoch": 2.1784037558685445, "grad_norm": 0.31011879444122314, "learning_rate": 6.937642462340342e-05, "loss": 0.0257, "step": 1450 }, { "epoch": 2.1784037558685445, "eval_loss": 0.04065319150686264, "eval_runtime": 222.471, "eval_samples_per_second": 5.322, "eval_steps_per_second": 5.322, "step": 1450 }, { "epoch": 2.1934272300469484, "grad_norm": 0.2239389568567276, "learning_rate": 6.889139123117817e-05, "loss": 0.0183, "step": 1460 }, { "epoch": 2.2084507042253523, "grad_norm": 0.22514520585536957, "learning_rate": 6.840427509194575e-05, "loss": 0.0242, "step": 1470 }, { "epoch": 2.2234741784037557, "grad_norm": 0.2953309416770935, "learning_rate": 6.791512990951597e-05, "loss": 0.015, "step": 1480 }, { "epoch": 2.2384976525821596, "grad_norm": 0.35825031995773315, "learning_rate": 6.74240096113975e-05, "loss": 0.0254, "step": 1490 }, { "epoch": 2.2535211267605635, "grad_norm": 0.15820668637752533, "learning_rate": 6.693096834285256e-05, "loss": 0.0138, "step": 1500 }, { "epoch": 2.2535211267605635, "eval_loss": 0.04137204959988594, "eval_runtime": 221.3626, "eval_samples_per_second": 5.349, "eval_steps_per_second": 5.349, "step": 1500 }, { "epoch": 2.2685446009389674, "grad_norm": 0.8325169086456299, "learning_rate": 6.643606046092732e-05, "loss": 0.0239, "step": 1510 }, { "epoch": 2.283568075117371, "grad_norm": 0.2664135694503784, "learning_rate": 6.593934052845929e-05, "loss": 0.0217, "step": 1520 }, { "epoch": 2.2985915492957747, "grad_norm": 0.3122292160987854, "learning_rate": 6.544086330806181e-05, "loss": 0.022, "step": 1530 }, { "epoch": 2.3136150234741786, "grad_norm": 0.24781066179275513, "learning_rate": 6.494068375608646e-05, "loss": 0.0218, "step": 1540 }, { "epoch": 2.328638497652582, "grad_norm": 0.32878419756889343, "learning_rate": 6.443885701656432e-05, "loss": 0.0167, "step": 1550 }, { "epoch": 2.328638497652582, "eval_loss": 0.040929365903139114, "eval_runtime": 219.4831, "eval_samples_per_second": 5.394, "eval_steps_per_second": 5.394, "step": 1550 }, { "epoch": 2.343661971830986, "grad_norm": 0.49727535247802734, "learning_rate": 6.393543841512632e-05, "loss": 0.0223, "step": 1560 }, { "epoch": 2.35868544600939, "grad_norm": 0.617466926574707, "learning_rate": 6.343048345290386e-05, "loss": 0.0159, "step": 1570 }, { "epoch": 2.3737089201877932, "grad_norm": 0.3165408968925476, "learning_rate": 6.292404780040961e-05, "loss": 0.0278, "step": 1580 }, { "epoch": 2.388732394366197, "grad_norm": 0.2089942991733551, "learning_rate": 6.241618729140018e-05, "loss": 0.0187, "step": 1590 }, { "epoch": 2.403755868544601, "grad_norm": 0.3965398669242859, "learning_rate": 6.190695791672042e-05, "loss": 0.0217, "step": 1600 }, { "epoch": 2.403755868544601, "eval_loss": 0.03883142024278641, "eval_runtime": 221.119, "eval_samples_per_second": 5.355, "eval_steps_per_second": 5.355, "step": 1600 }, { "epoch": 2.418779342723005, "grad_norm": 0.16717387735843658, "learning_rate": 6.139641581813052e-05, "loss": 0.0173, "step": 1610 }, { "epoch": 2.4338028169014083, "grad_norm": 0.1234317496418953, "learning_rate": 6.088461728211642e-05, "loss": 0.016, "step": 1620 }, { "epoch": 2.448826291079812, "grad_norm": 0.3614145815372467, "learning_rate": 6.0371618733684474e-05, "loss": 0.0122, "step": 1630 }, { "epoch": 2.463849765258216, "grad_norm": 0.14805041253566742, "learning_rate": 5.9857476730140485e-05, "loss": 0.0216, "step": 1640 }, { "epoch": 2.4788732394366195, "grad_norm": 0.24672311544418335, "learning_rate": 5.9342247954854466e-05, "loss": 0.0195, "step": 1650 }, { "epoch": 2.4788732394366195, "eval_loss": 0.042663708329200745, "eval_runtime": 218.9509, "eval_samples_per_second": 5.408, "eval_steps_per_second": 5.408, "step": 1650 }, { "epoch": 2.4938967136150234, "grad_norm": 0.2416851967573166, "learning_rate": 5.8825989211011335e-05, "loss": 0.0239, "step": 1660 }, { "epoch": 2.5089201877934273, "grad_norm": 0.2621941864490509, "learning_rate": 5.830875741534852e-05, "loss": 0.0166, "step": 1670 }, { "epoch": 2.523943661971831, "grad_norm": 0.38329723477363586, "learning_rate": 5.7790609591880826e-05, "loss": 0.0252, "step": 1680 }, { "epoch": 2.5389671361502346, "grad_norm": 0.25781792402267456, "learning_rate": 5.727160286561386e-05, "loss": 0.0203, "step": 1690 }, { "epoch": 2.5539906103286385, "grad_norm": 0.2038751244544983, "learning_rate": 5.675179445624581e-05, "loss": 0.0223, "step": 1700 }, { "epoch": 2.5539906103286385, "eval_loss": 0.04374394193291664, "eval_runtime": 219.3848, "eval_samples_per_second": 5.397, "eval_steps_per_second": 5.397, "step": 1700 }, { "epoch": 2.5690140845070424, "grad_norm": 0.1650547981262207, "learning_rate": 5.62312416718593e-05, "loss": 0.0183, "step": 1710 }, { "epoch": 2.584037558685446, "grad_norm": 0.06820981949567795, "learning_rate": 5.5710001902603116e-05, "loss": 0.0217, "step": 1720 }, { "epoch": 2.5990610328638497, "grad_norm": 0.1459759920835495, "learning_rate": 5.5188132614365094e-05, "loss": 0.0164, "step": 1730 }, { "epoch": 2.6140845070422536, "grad_norm": 0.18846748769283295, "learning_rate": 5.4665691342436565e-05, "loss": 0.0145, "step": 1740 }, { "epoch": 2.629107981220657, "grad_norm": 0.44601181149482727, "learning_rate": 5.414273568516919e-05, "loss": 0.0195, "step": 1750 }, { "epoch": 2.629107981220657, "eval_loss": 0.0427960604429245, "eval_runtime": 219.0798, "eval_samples_per_second": 5.404, "eval_steps_per_second": 5.404, "step": 1750 }, { "epoch": 2.644131455399061, "grad_norm": 0.5126632452011108, "learning_rate": 5.361932329762481e-05, "loss": 0.0215, "step": 1760 }, { "epoch": 2.659154929577465, "grad_norm": 0.2724759876728058, "learning_rate": 5.309551188521914e-05, "loss": 0.0232, "step": 1770 }, { "epoch": 2.6741784037558687, "grad_norm": 0.4603884816169739, "learning_rate": 5.2571359197359704e-05, "loss": 0.0168, "step": 1780 }, { "epoch": 2.6892018779342726, "grad_norm": 0.3353332579135895, "learning_rate": 5.2046923021079175e-05, "loss": 0.0144, "step": 1790 }, { "epoch": 2.704225352112676, "grad_norm": 0.2024739384651184, "learning_rate": 5.1522261174664346e-05, "loss": 0.0218, "step": 1800 }, { "epoch": 2.704225352112676, "eval_loss": 0.04094401001930237, "eval_runtime": 211.573, "eval_samples_per_second": 5.596, "eval_steps_per_second": 5.596, "step": 1800 }, { "epoch": 2.71924882629108, "grad_norm": 1.624040126800537, "learning_rate": 5.0997431501281835e-05, "loss": 0.0262, "step": 1810 }, { "epoch": 2.7342723004694838, "grad_norm": 0.3581642806529999, "learning_rate": 5.0472491862600915e-05, "loss": 0.019, "step": 1820 }, { "epoch": 2.749295774647887, "grad_norm": 0.44195398688316345, "learning_rate": 4.994750013241435e-05, "loss": 0.0248, "step": 1830 }, { "epoch": 2.764319248826291, "grad_norm": 0.22482897341251373, "learning_rate": 4.9422514190257974e-05, "loss": 0.0221, "step": 1840 }, { "epoch": 2.779342723004695, "grad_norm": 0.2989714741706848, "learning_rate": 4.88975919150294e-05, "loss": 0.0189, "step": 1850 }, { "epoch": 2.779342723004695, "eval_loss": 0.040074534714221954, "eval_runtime": 207.949, "eval_samples_per_second": 5.694, "eval_steps_per_second": 5.694, "step": 1850 }, { "epoch": 2.7943661971830984, "grad_norm": 0.3000681400299072, "learning_rate": 4.83727911786071e-05, "loss": 0.0254, "step": 1860 }, { "epoch": 2.8093896713615023, "grad_norm": 0.3129828870296478, "learning_rate": 4.7848169839470145e-05, "loss": 0.0221, "step": 1870 }, { "epoch": 2.824413145539906, "grad_norm": 0.5919480323791504, "learning_rate": 4.7323785736319244e-05, "loss": 0.0239, "step": 1880 }, { "epoch": 2.8394366197183096, "grad_norm": 0.23092535138130188, "learning_rate": 4.679969668170024e-05, "loss": 0.0227, "step": 1890 }, { "epoch": 2.8544600938967135, "grad_norm": 0.10704797506332397, "learning_rate": 4.627596045563031e-05, "loss": 0.0195, "step": 1900 }, { "epoch": 2.8544600938967135, "eval_loss": 0.038124363869428635, "eval_runtime": 204.7966, "eval_samples_per_second": 5.781, "eval_steps_per_second": 5.781, "step": 1900 }, { "epoch": 2.8694835680751174, "grad_norm": 0.3655933439731598, "learning_rate": 4.575263479922783e-05, "loss": 0.014, "step": 1910 }, { "epoch": 2.8845070422535213, "grad_norm": 0.13271917402744293, "learning_rate": 4.522977740834651e-05, "loss": 0.0112, "step": 1920 }, { "epoch": 2.8995305164319247, "grad_norm": 0.12882518768310547, "learning_rate": 4.4707445927214456e-05, "loss": 0.0124, "step": 1930 }, { "epoch": 2.9145539906103286, "grad_norm": 0.3468819558620453, "learning_rate": 4.4185697942079115e-05, "loss": 0.0219, "step": 1940 }, { "epoch": 2.9295774647887325, "grad_norm": 0.16386888921260834, "learning_rate": 4.366459097485832e-05, "loss": 0.0188, "step": 1950 }, { "epoch": 2.9295774647887325, "eval_loss": 0.039931606501340866, "eval_runtime": 202.6841, "eval_samples_per_second": 5.842, "eval_steps_per_second": 5.842, "step": 1950 }, { "epoch": 2.9446009389671364, "grad_norm": 0.29388657212257385, "learning_rate": 4.314418247679866e-05, "loss": 0.0205, "step": 1960 }, { "epoch": 2.95962441314554, "grad_norm": 0.19597427546977997, "learning_rate": 4.26245298221416e-05, "loss": 0.0137, "step": 1970 }, { "epoch": 2.9746478873239437, "grad_norm": 0.16075938940048218, "learning_rate": 4.2105690301798014e-05, "loss": 0.0162, "step": 1980 }, { "epoch": 2.9896713615023476, "grad_norm": 0.39698508381843567, "learning_rate": 4.158772111703194e-05, "loss": 0.0163, "step": 1990 }, { "epoch": 3.004694835680751, "grad_norm": 0.044948216527700424, "learning_rate": 4.107067937315429e-05, "loss": 0.0105, "step": 2000 }, { "epoch": 3.004694835680751, "eval_loss": 0.041156113147735596, "eval_runtime": 200.8348, "eval_samples_per_second": 5.895, "eval_steps_per_second": 5.895, "step": 2000 }, { "epoch": 3.019718309859155, "grad_norm": 0.14478544890880585, "learning_rate": 4.055462207322698e-05, "loss": 0.0076, "step": 2010 }, { "epoch": 3.034741784037559, "grad_norm": 0.05940256267786026, "learning_rate": 4.003960611177855e-05, "loss": 0.0115, "step": 2020 }, { "epoch": 3.0497652582159622, "grad_norm": 0.13706958293914795, "learning_rate": 3.952568826853152e-05, "loss": 0.0102, "step": 2030 }, { "epoch": 3.064788732394366, "grad_norm": 0.22277607023715973, "learning_rate": 3.901292520214256e-05, "loss": 0.0064, "step": 2040 }, { "epoch": 3.07981220657277, "grad_norm": 0.30167967081069946, "learning_rate": 3.850137344395598e-05, "loss": 0.0073, "step": 2050 }, { "epoch": 3.07981220657277, "eval_loss": 0.04430518299341202, "eval_runtime": 200.5966, "eval_samples_per_second": 5.902, "eval_steps_per_second": 5.902, "step": 2050 }, { "epoch": 3.094835680751174, "grad_norm": 0.6749436855316162, "learning_rate": 3.799108939177118e-05, "loss": 0.0064, "step": 2060 }, { "epoch": 3.1098591549295773, "grad_norm": 0.16386157274246216, "learning_rate": 3.7482129303624934e-05, "loss": 0.0077, "step": 2070 }, { "epoch": 3.124882629107981, "grad_norm": 0.35549381375312805, "learning_rate": 3.697454929158901e-05, "loss": 0.0089, "step": 2080 }, { "epoch": 3.139906103286385, "grad_norm": 0.2858954071998596, "learning_rate": 3.6468405315583854e-05, "loss": 0.0102, "step": 2090 }, { "epoch": 3.1549295774647885, "grad_norm": 0.43447062373161316, "learning_rate": 3.59637531772092e-05, "loss": 0.0082, "step": 2100 }, { "epoch": 3.1549295774647885, "eval_loss": 0.0465645007789135, "eval_runtime": 200.3422, "eval_samples_per_second": 5.91, "eval_steps_per_second": 5.91, "step": 2100 }, { "epoch": 3.1699530516431924, "grad_norm": 0.16099733114242554, "learning_rate": 3.546064851359192e-05, "loss": 0.0037, "step": 2110 }, { "epoch": 3.1849765258215963, "grad_norm": 0.33014243841171265, "learning_rate": 3.495914679125212e-05, "loss": 0.0068, "step": 2120 }, { "epoch": 3.2, "grad_norm": 0.04894431680440903, "learning_rate": 3.445930329998819e-05, "loss": 0.007, "step": 2130 }, { "epoch": 3.2150234741784036, "grad_norm": 0.33057722449302673, "learning_rate": 3.396117314678097e-05, "loss": 0.0137, "step": 2140 }, { "epoch": 3.2300469483568075, "grad_norm": 0.4633444845676422, "learning_rate": 3.3464811249718474e-05, "loss": 0.0107, "step": 2150 }, { "epoch": 3.2300469483568075, "eval_loss": 0.04904927685856819, "eval_runtime": 199.8955, "eval_samples_per_second": 5.923, "eval_steps_per_second": 5.923, "step": 2150 }, { "epoch": 3.2450704225352114, "grad_norm": 0.27093496918678284, "learning_rate": 3.297027233194114e-05, "loss": 0.0087, "step": 2160 }, { "epoch": 3.260093896713615, "grad_norm": 0.047191720455884933, "learning_rate": 3.2477610915608704e-05, "loss": 0.0066, "step": 2170 }, { "epoch": 3.2751173708920187, "grad_norm": 0.3168286383152008, "learning_rate": 3.1986881315889315e-05, "loss": 0.0125, "step": 2180 }, { "epoch": 3.2901408450704226, "grad_norm": 0.5179242491722107, "learning_rate": 3.149813763497124e-05, "loss": 0.0167, "step": 2190 }, { "epoch": 3.3051643192488265, "grad_norm": 0.2823299169540405, "learning_rate": 3.101143375609818e-05, "loss": 0.0143, "step": 2200 }, { "epoch": 3.3051643192488265, "eval_loss": 0.044749025255441666, "eval_runtime": 199.0042, "eval_samples_per_second": 5.95, "eval_steps_per_second": 5.95, "step": 2200 }, { "epoch": 3.32018779342723, "grad_norm": 0.27977901697158813, "learning_rate": 3.0526823337628915e-05, "loss": 0.012, "step": 2210 }, { "epoch": 3.335211267605634, "grad_norm": 0.1816035658121109, "learning_rate": 3.004435980712129e-05, "loss": 0.0071, "step": 2220 }, { "epoch": 3.3502347417840377, "grad_norm": 0.20612962543964386, "learning_rate": 2.9564096355442116e-05, "loss": 0.0109, "step": 2230 }, { "epoch": 3.365258215962441, "grad_norm": 0.0782838836312294, "learning_rate": 2.9086085930902824e-05, "loss": 0.0109, "step": 2240 }, { "epoch": 3.380281690140845, "grad_norm": 0.1939852386713028, "learning_rate": 2.8610381233422058e-05, "loss": 0.005, "step": 2250 }, { "epoch": 3.380281690140845, "eval_loss": 0.04711684584617615, "eval_runtime": 199.3048, "eval_samples_per_second": 5.941, "eval_steps_per_second": 5.941, "step": 2250 }, { "epoch": 3.395305164319249, "grad_norm": 0.15407223999500275, "learning_rate": 2.8137034708715592e-05, "loss": 0.0051, "step": 2260 }, { "epoch": 3.4103286384976528, "grad_norm": 0.0431525744497776, "learning_rate": 2.7666098542514273e-05, "loss": 0.0061, "step": 2270 }, { "epoch": 3.425352112676056, "grad_norm": 0.4643717408180237, "learning_rate": 2.719762465481055e-05, "loss": 0.0095, "step": 2280 }, { "epoch": 3.44037558685446, "grad_norm": 0.13977164030075073, "learning_rate": 2.6731664694134473e-05, "loss": 0.0114, "step": 2290 }, { "epoch": 3.455399061032864, "grad_norm": 0.34736713767051697, "learning_rate": 2.6268270031859476e-05, "loss": 0.0079, "step": 2300 }, { "epoch": 3.455399061032864, "eval_loss": 0.0480869896709919, "eval_runtime": 198.5254, "eval_samples_per_second": 5.964, "eval_steps_per_second": 5.964, "step": 2300 }, { "epoch": 3.4704225352112674, "grad_norm": 0.5037516355514526, "learning_rate": 2.580749175653877e-05, "loss": 0.0097, "step": 2310 }, { "epoch": 3.4854460093896713, "grad_norm": 0.5058691501617432, "learning_rate": 2.5349380668272905e-05, "loss": 0.011, "step": 2320 }, { "epoch": 3.500469483568075, "grad_norm": 0.3391149938106537, "learning_rate": 2.489398727310908e-05, "loss": 0.0105, "step": 2330 }, { "epoch": 3.5154929577464786, "grad_norm": 0.5301526188850403, "learning_rate": 2.4441361777473066e-05, "loss": 0.0097, "step": 2340 }, { "epoch": 3.5305164319248825, "grad_norm": 0.5552868247032166, "learning_rate": 2.3991554082633912e-05, "loss": 0.0085, "step": 2350 }, { "epoch": 3.5305164319248825, "eval_loss": 0.05010828375816345, "eval_runtime": 198.3007, "eval_samples_per_second": 5.971, "eval_steps_per_second": 5.971, "step": 2350 }, { "epoch": 3.5455399061032864, "grad_norm": 0.15667784214019775, "learning_rate": 2.354461377920239e-05, "loss": 0.0078, "step": 2360 }, { "epoch": 3.5605633802816903, "grad_norm": 0.485943466424942, "learning_rate": 2.3100590141663807e-05, "loss": 0.0108, "step": 2370 }, { "epoch": 3.575586854460094, "grad_norm": 0.49499568343162537, "learning_rate": 2.265953212294551e-05, "loss": 0.0115, "step": 2380 }, { "epoch": 3.5906103286384976, "grad_norm": 0.559502899646759, "learning_rate": 2.2221488349019903e-05, "loss": 0.017, "step": 2390 }, { "epoch": 3.6056338028169015, "grad_norm": 0.260917067527771, "learning_rate": 2.1786507113543457e-05, "loss": 0.0073, "step": 2400 }, { "epoch": 3.6056338028169015, "eval_loss": 0.046827565878629684, "eval_runtime": 192.641, "eval_samples_per_second": 6.146, "eval_steps_per_second": 6.146, "step": 2400 }, { "epoch": 3.6206572769953054, "grad_norm": 0.07891547679901123, "learning_rate": 2.1354636372532523e-05, "loss": 0.0125, "step": 2410 }, { "epoch": 3.635680751173709, "grad_norm": 0.23800870776176453, "learning_rate": 2.092592373907617e-05, "loss": 0.0082, "step": 2420 }, { "epoch": 3.6507042253521127, "grad_norm": 0.09424114972352982, "learning_rate": 2.0500416478086932e-05, "loss": 0.0077, "step": 2430 }, { "epoch": 3.6657276995305166, "grad_norm": 0.3062797784805298, "learning_rate": 2.0078161501089954e-05, "loss": 0.0075, "step": 2440 }, { "epoch": 3.68075117370892, "grad_norm": 0.06614656746387482, "learning_rate": 1.9659205361050982e-05, "loss": 0.0034, "step": 2450 }, { "epoch": 3.68075117370892, "eval_loss": 0.04812372848391533, "eval_runtime": 197.4077, "eval_samples_per_second": 5.998, "eval_steps_per_second": 5.998, "step": 2450 }, { "epoch": 3.695774647887324, "grad_norm": 0.16158843040466309, "learning_rate": 1.924359424724408e-05, "loss": 0.0169, "step": 2460 }, { "epoch": 3.710798122065728, "grad_norm": 0.3151260018348694, "learning_rate": 1.8831373980159296e-05, "loss": 0.0127, "step": 2470 }, { "epoch": 3.7258215962441312, "grad_norm": 0.09106522053480148, "learning_rate": 1.8422590006450947e-05, "loss": 0.0042, "step": 2480 }, { "epoch": 3.740845070422535, "grad_norm": 0.07405770570039749, "learning_rate": 1.801728739392731e-05, "loss": 0.0058, "step": 2490 }, { "epoch": 3.755868544600939, "grad_norm": 0.16569849848747253, "learning_rate": 1.7615510826581904e-05, "loss": 0.0053, "step": 2500 }, { "epoch": 3.755868544600939, "eval_loss": 0.04956234619021416, "eval_runtime": 197.5247, "eval_samples_per_second": 5.994, "eval_steps_per_second": 5.994, "step": 2500 }, { "epoch": 3.770892018779343, "grad_norm": 0.271708220243454, "learning_rate": 1.7217304599667146e-05, "loss": 0.0087, "step": 2510 }, { "epoch": 3.7859154929577463, "grad_norm": 0.03417084738612175, "learning_rate": 1.6822712614810893e-05, "loss": 0.008, "step": 2520 }, { "epoch": 3.80093896713615, "grad_norm": 0.10270609706640244, "learning_rate": 1.643177837517631e-05, "loss": 0.0078, "step": 2530 }, { "epoch": 3.815962441314554, "grad_norm": 0.21668383479118347, "learning_rate": 1.6044544980665767e-05, "loss": 0.01, "step": 2540 }, { "epoch": 3.830985915492958, "grad_norm": 0.7087284922599792, "learning_rate": 1.5661055123169126e-05, "loss": 0.0052, "step": 2550 }, { "epoch": 3.830985915492958, "eval_loss": 0.04982556402683258, "eval_runtime": 197.2, "eval_samples_per_second": 6.004, "eval_steps_per_second": 6.004, "step": 2550 }, { "epoch": 3.8460093896713614, "grad_norm": 0.09885504096746445, "learning_rate": 1.5281351081856974e-05, "loss": 0.0065, "step": 2560 }, { "epoch": 3.8610328638497653, "grad_norm": 0.1990659385919571, "learning_rate": 1.4905474718519491e-05, "loss": 0.0032, "step": 2570 }, { "epoch": 3.876056338028169, "grad_norm": 0.2634921371936798, "learning_rate": 1.453346747295119e-05, "loss": 0.0116, "step": 2580 }, { "epoch": 3.8910798122065726, "grad_norm": 0.045282598584890366, "learning_rate": 1.4165370358382274e-05, "loss": 0.007, "step": 2590 }, { "epoch": 3.9061032863849765, "grad_norm": 0.04671657830476761, "learning_rate": 1.3801223956956994e-05, "loss": 0.0117, "step": 2600 }, { "epoch": 3.9061032863849765, "eval_loss": 0.05007306486368179, "eval_runtime": 196.5935, "eval_samples_per_second": 6.023, "eval_steps_per_second": 6.023, "step": 2600 }, { "epoch": 3.9211267605633804, "grad_norm": 0.2485828697681427, "learning_rate": 1.344106841525946e-05, "loss": 0.0108, "step": 2610 }, { "epoch": 3.936150234741784, "grad_norm": 0.542384147644043, "learning_rate": 1.3084943439887659e-05, "loss": 0.0105, "step": 2620 }, { "epoch": 3.9511737089201877, "grad_norm": 0.11230123043060303, "learning_rate": 1.273288829307579e-05, "loss": 0.005, "step": 2630 }, { "epoch": 3.9661971830985916, "grad_norm": 0.0764995664358139, "learning_rate": 1.2384941788365622e-05, "loss": 0.0055, "step": 2640 }, { "epoch": 3.981220657276995, "grad_norm": 0.2927321493625641, "learning_rate": 1.2041142286327477e-05, "loss": 0.0082, "step": 2650 }, { "epoch": 3.981220657276995, "eval_loss": 0.05000728368759155, "eval_runtime": 196.7082, "eval_samples_per_second": 6.019, "eval_steps_per_second": 6.019, "step": 2650 }, { "epoch": 3.996244131455399, "grad_norm": 0.8166933059692383, "learning_rate": 1.170152769033095e-05, "loss": 0.0062, "step": 2660 }, { "epoch": 4.011267605633803, "grad_norm": 0.20836931467056274, "learning_rate": 1.1366135442366127e-05, "loss": 0.0056, "step": 2670 }, { "epoch": 4.026291079812206, "grad_norm": 0.11336184293031693, "learning_rate": 1.103500251891571e-05, "loss": 0.0047, "step": 2680 }, { "epoch": 4.041314553990611, "grad_norm": 0.03469737619161606, "learning_rate": 1.0708165426878325e-05, "loss": 0.0028, "step": 2690 }, { "epoch": 4.056338028169014, "grad_norm": 0.30168983340263367, "learning_rate": 1.0385660199543812e-05, "loss": 0.0054, "step": 2700 }, { "epoch": 4.056338028169014, "eval_loss": 0.0516323521733284, "eval_runtime": 196.8548, "eval_samples_per_second": 6.015, "eval_steps_per_second": 6.015, "step": 2700 }, { "epoch": 4.0713615023474174, "grad_norm": 0.045852866023778915, "learning_rate": 1.0067522392620537e-05, "loss": 0.0031, "step": 2710 }, { "epoch": 4.086384976525822, "grad_norm": 0.2861749529838562, "learning_rate": 9.753787080315385e-06, "loss": 0.0089, "step": 2720 }, { "epoch": 4.101408450704225, "grad_norm": 0.029442287981510162, "learning_rate": 9.444488851467042e-06, "loss": 0.0025, "step": 2730 }, { "epoch": 4.1164319248826295, "grad_norm": 0.16276951134204865, "learning_rate": 9.139661805732435e-06, "loss": 0.0021, "step": 2740 }, { "epoch": 4.131455399061033, "grad_norm": 0.2108067125082016, "learning_rate": 8.839339549827397e-06, "loss": 0.0019, "step": 2750 }, { "epoch": 4.131455399061033, "eval_loss": 0.054593775421381, "eval_runtime": 196.8035, "eval_samples_per_second": 6.016, "eval_steps_per_second": 6.016, "step": 2750 }, { "epoch": 4.146478873239436, "grad_norm": 0.02386028692126274, "learning_rate": 8.543555193821634e-06, "loss": 0.0046, "step": 2760 }, { "epoch": 4.161502347417841, "grad_norm": 0.24152906239032745, "learning_rate": 8.252341347488251e-06, "loss": 0.0025, "step": 2770 }, { "epoch": 4.176525821596244, "grad_norm": 0.12346199154853821, "learning_rate": 7.965730116708681e-06, "loss": 0.0038, "step": 2780 }, { "epoch": 4.191549295774648, "grad_norm": 0.0784059390425682, "learning_rate": 7.68375309993304e-06, "loss": 0.0019, "step": 2790 }, { "epoch": 4.206572769953052, "grad_norm": 0.08971384167671204, "learning_rate": 7.406441384696372e-06, "loss": 0.0048, "step": 2800 }, { "epoch": 4.206572769953052, "eval_loss": 0.05651959031820297, "eval_runtime": 196.0515, "eval_samples_per_second": 6.039, "eval_steps_per_second": 6.039, "step": 2800 }, { "epoch": 4.221596244131455, "grad_norm": 0.06048822030425072, "learning_rate": 7.133825544191464e-06, "loss": 0.0053, "step": 2810 }, { "epoch": 4.236619718309859, "grad_norm": 0.19797635078430176, "learning_rate": 6.865935633897996e-06, "loss": 0.002, "step": 2820 }, { "epoch": 4.251643192488263, "grad_norm": 0.13878192007541656, "learning_rate": 6.602801188269081e-06, "loss": 0.0043, "step": 2830 }, { "epoch": 4.266666666666667, "grad_norm": 0.12489641457796097, "learning_rate": 6.344451217475183e-06, "loss": 0.0015, "step": 2840 }, { "epoch": 4.28169014084507, "grad_norm": 0.11764870584011078, "learning_rate": 6.090914204205655e-06, "loss": 0.0026, "step": 2850 }, { "epoch": 4.28169014084507, "eval_loss": 0.05827711895108223, "eval_runtime": 195.8018, "eval_samples_per_second": 6.047, "eval_steps_per_second": 6.047, "step": 2850 }, { "epoch": 4.296713615023474, "grad_norm": 0.07948636263608932, "learning_rate": 5.842218100528679e-06, "loss": 0.0018, "step": 2860 }, { "epoch": 4.311737089201878, "grad_norm": 0.11953713744878769, "learning_rate": 5.598390324809555e-06, "loss": 0.0026, "step": 2870 }, { "epoch": 4.326760563380281, "grad_norm": 0.22272926568984985, "learning_rate": 5.359457758687841e-06, "loss": 0.0028, "step": 2880 }, { "epoch": 4.341784037558686, "grad_norm": 0.5969500541687012, "learning_rate": 5.125446744113743e-06, "loss": 0.0028, "step": 2890 }, { "epoch": 4.356807511737089, "grad_norm": 0.025193296372890472, "learning_rate": 4.896383080443934e-06, "loss": 0.001, "step": 2900 }, { "epoch": 4.356807511737089, "eval_loss": 0.06079353392124176, "eval_runtime": 196.1461, "eval_samples_per_second": 6.036, "eval_steps_per_second": 6.036, "step": 2900 }, { "epoch": 4.371830985915493, "grad_norm": 0.2489527016878128, "learning_rate": 4.672292021597174e-06, "loss": 0.004, "step": 2910 }, { "epoch": 4.386854460093897, "grad_norm": 0.10525476932525635, "learning_rate": 4.4531982732702145e-06, "loss": 0.0037, "step": 2920 }, { "epoch": 4.4018779342723, "grad_norm": 0.08666111528873444, "learning_rate": 4.239125990213883e-06, "loss": 0.0019, "step": 2930 }, { "epoch": 4.416901408450705, "grad_norm": 0.15897853672504425, "learning_rate": 4.030098773570174e-06, "loss": 0.0006, "step": 2940 }, { "epoch": 4.431924882629108, "grad_norm": 0.05585940182209015, "learning_rate": 3.826139668270234e-06, "loss": 0.0022, "step": 2950 }, { "epoch": 4.431924882629108, "eval_loss": 0.06087281182408333, "eval_runtime": 195.9838, "eval_samples_per_second": 6.041, "eval_steps_per_second": 6.041, "step": 2950 }, { "epoch": 4.446948356807511, "grad_norm": 0.27004674077033997, "learning_rate": 3.6272711604936504e-06, "loss": 0.0022, "step": 2960 }, { "epoch": 4.461971830985916, "grad_norm": 0.02100216969847679, "learning_rate": 3.433515175189428e-06, "loss": 0.0021, "step": 2970 }, { "epoch": 4.476995305164319, "grad_norm": 0.023518303409218788, "learning_rate": 3.2448930736588e-06, "loss": 0.0063, "step": 2980 }, { "epoch": 4.492018779342723, "grad_norm": 0.04742588847875595, "learning_rate": 3.061425651200117e-06, "loss": 0.0021, "step": 2990 }, { "epoch": 4.507042253521127, "grad_norm": 0.19590170681476593, "learning_rate": 2.883133134816296e-06, "loss": 0.0036, "step": 3000 }, { "epoch": 4.507042253521127, "eval_loss": 0.06154530867934227, "eval_runtime": 196.0968, "eval_samples_per_second": 6.038, "eval_steps_per_second": 6.038, "step": 3000 }, { "epoch": 4.52206572769953, "grad_norm": 0.02329372987151146, "learning_rate": 2.7100351809847326e-06, "loss": 0.0018, "step": 3010 }, { "epoch": 4.537089201877935, "grad_norm": 0.034720126539468765, "learning_rate": 2.542150873490251e-06, "loss": 0.0013, "step": 3020 }, { "epoch": 4.552112676056338, "grad_norm": 0.1508730798959732, "learning_rate": 2.3794987213211383e-06, "loss": 0.0025, "step": 3030 }, { "epoch": 4.567136150234742, "grad_norm": 0.06496744602918625, "learning_rate": 2.222096656628547e-06, "loss": 0.0036, "step": 3040 }, { "epoch": 4.582159624413146, "grad_norm": 0.024426713585853577, "learning_rate": 2.0699620327495174e-06, "loss": 0.0006, "step": 3050 }, { "epoch": 4.582159624413146, "eval_loss": 0.06214800477027893, "eval_runtime": 194.9732, "eval_samples_per_second": 6.073, "eval_steps_per_second": 6.073, "step": 3050 }, { "epoch": 4.597183098591549, "grad_norm": 0.006804203614592552, "learning_rate": 1.9231116222937996e-06, "loss": 0.0021, "step": 3060 }, { "epoch": 4.612206572769953, "grad_norm": 0.043833401054143906, "learning_rate": 1.7815616152946523e-06, "loss": 0.0024, "step": 3070 }, { "epoch": 4.627230046948357, "grad_norm": 0.04456528648734093, "learning_rate": 1.6453276174240195e-06, "loss": 0.0018, "step": 3080 }, { "epoch": 4.642253521126761, "grad_norm": 0.06189090386033058, "learning_rate": 1.5144246482719114e-06, "loss": 0.0032, "step": 3090 }, { "epoch": 4.657276995305164, "grad_norm": 0.009907769970595837, "learning_rate": 1.3888671396905805e-06, "loss": 0.0019, "step": 3100 }, { "epoch": 4.657276995305164, "eval_loss": 0.06242601200938225, "eval_runtime": 195.4706, "eval_samples_per_second": 6.057, "eval_steps_per_second": 6.057, "step": 3100 }, { "epoch": 4.672300469483568, "grad_norm": 0.3474283516407013, "learning_rate": 1.2686689342034431e-06, "loss": 0.006, "step": 3110 }, { "epoch": 4.687323943661972, "grad_norm": 0.6320505738258362, "learning_rate": 1.1538432834789227e-06, "loss": 0.0032, "step": 3120 }, { "epoch": 4.702347417840375, "grad_norm": 0.22293131053447723, "learning_rate": 1.044402846869491e-06, "loss": 0.0021, "step": 3130 }, { "epoch": 4.71737089201878, "grad_norm": 0.3564072549343109, "learning_rate": 9.403596900160073e-07, "loss": 0.0013, "step": 3140 }, { "epoch": 4.732394366197183, "grad_norm": 0.2784675359725952, "learning_rate": 8.417252835174749e-07, "loss": 0.0027, "step": 3150 }, { "epoch": 4.732394366197183, "eval_loss": 0.06278736144304276, "eval_runtime": 195.5233, "eval_samples_per_second": 6.056, "eval_steps_per_second": 6.056, "step": 3150 }, { "epoch": 4.7474178403755865, "grad_norm": 0.29395076632499695, "learning_rate": 7.48510501666455e-07, "loss": 0.0023, "step": 3160 }, { "epoch": 4.762441314553991, "grad_norm": 0.17587997019290924, "learning_rate": 6.607256212501578e-07, "loss": 0.002, "step": 3170 }, { "epoch": 4.777464788732394, "grad_norm": 0.40092355012893677, "learning_rate": 5.783803204174654e-07, "loss": 0.0043, "step": 3180 }, { "epoch": 4.792488262910798, "grad_norm": 0.10170930624008179, "learning_rate": 5.014836776119358e-07, "loss": 0.0026, "step": 3190 }, { "epoch": 4.807511737089202, "grad_norm": 0.29708781838417053, "learning_rate": 4.300441705708924e-07, "loss": 0.0022, "step": 3200 }, { "epoch": 4.807511737089202, "eval_loss": 0.06290177255868912, "eval_runtime": 195.5195, "eval_samples_per_second": 6.056, "eval_steps_per_second": 6.056, "step": 3200 }, { "epoch": 4.822535211267605, "grad_norm": 0.205259770154953, "learning_rate": 3.6406967539078796e-07, "loss": 0.0017, "step": 3210 }, { "epoch": 4.83755868544601, "grad_norm": 0.03722928464412689, "learning_rate": 3.0356746565887715e-07, "loss": 0.0038, "step": 3220 }, { "epoch": 4.852582159624413, "grad_norm": 0.17495347559452057, "learning_rate": 2.485442116513026e-07, "loss": 0.0015, "step": 3230 }, { "epoch": 4.867605633802817, "grad_norm": 0.0666784942150116, "learning_rate": 1.9900597959770507e-07, "loss": 0.005, "step": 3240 }, { "epoch": 4.882629107981221, "grad_norm": 0.014443851076066494, "learning_rate": 1.5495823101245866e-07, "loss": 0.002, "step": 3250 }, { "epoch": 4.882629107981221, "eval_loss": 0.06296313554048538, "eval_runtime": 194.8864, "eval_samples_per_second": 6.075, "eval_steps_per_second": 6.075, "step": 3250 }, { "epoch": 4.897652582159624, "grad_norm": 0.10799351334571838, "learning_rate": 1.164058220925135e-07, "loss": 0.0024, "step": 3260 }, { "epoch": 4.912676056338028, "grad_norm": 0.12190629541873932, "learning_rate": 8.335300318201844e-08, "loss": 0.0054, "step": 3270 }, { "epoch": 4.927699530516432, "grad_norm": 0.4779601991176605, "learning_rate": 5.5803418303745917e-08, "loss": 0.0051, "step": 3280 }, { "epoch": 4.942723004694836, "grad_norm": 0.051619164645671844, "learning_rate": 3.3760104757313284e-08, "loss": 0.0009, "step": 3290 }, { "epoch": 4.957746478873239, "grad_norm": 0.03362768143415451, "learning_rate": 1.7225492784345156e-08, "loss": 0.0063, "step": 3300 }, { "epoch": 4.957746478873239, "eval_loss": 0.06280913203954697, "eval_runtime": 194.8282, "eval_samples_per_second": 6.077, "eval_steps_per_second": 6.077, "step": 3300 }, { "epoch": 4.972769953051643, "grad_norm": 0.059018392115831375, "learning_rate": 6.201405300532148e-09, "loss": 0.0035, "step": 3310 }, { "epoch": 4.987793427230047, "grad_norm": 0.08097545057535172, "learning_rate": 6.890576946805282e-10, "loss": 0.0008, "step": 3320 }, { "epoch": 4.995305164319249, "step": 3325, "total_flos": 8.345060490986127e+17, "train_loss": 0.041049694813843955, "train_runtime": 47877.1291, "train_samples_per_second": 1.112, "train_steps_per_second": 0.069 } ], "logging_steps": 10, "max_steps": 3325, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.345060490986127e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }