{ "best_global_step": 1666, "best_metric": 0.7958001448225923, "best_model_checkpoint": "dinov2-base-finetuned-dermnet-lr3-5-0.05wd-csr/checkpoint-1666", "epoch": 19.802056555269925, "eval_steps": 500, "global_step": 1940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10282776349614396, "grad_norm": 32.612945556640625, "learning_rate": 3.092783505154639e-06, "loss": 4.9827, "step": 10 }, { "epoch": 0.20565552699228792, "grad_norm": 31.24657440185547, "learning_rate": 6.185567010309278e-06, "loss": 4.2857, "step": 20 }, { "epoch": 0.30848329048843187, "grad_norm": 33.468929290771484, "learning_rate": 9.278350515463918e-06, "loss": 3.8351, "step": 30 }, { "epoch": 0.41131105398457585, "grad_norm": 34.168548583984375, "learning_rate": 1.2371134020618556e-05, "loss": 3.4705, "step": 40 }, { "epoch": 0.5141388174807198, "grad_norm": 42.5950813293457, "learning_rate": 1.5463917525773194e-05, "loss": 3.349, "step": 50 }, { "epoch": 0.6169665809768637, "grad_norm": 34.48703384399414, "learning_rate": 1.8556701030927837e-05, "loss": 3.147, "step": 60 }, { "epoch": 0.7197943444730077, "grad_norm": 41.95218276977539, "learning_rate": 2.1649484536082473e-05, "loss": 2.9307, "step": 70 }, { "epoch": 0.8226221079691517, "grad_norm": 37.5980110168457, "learning_rate": 2.4742268041237112e-05, "loss": 2.8797, "step": 80 }, { "epoch": 0.9254498714652957, "grad_norm": 37.91151809692383, "learning_rate": 2.7835051546391755e-05, "loss": 2.7488, "step": 90 }, { "epoch": 1.0, "eval_accuracy": 0.3721940622737147, "eval_loss": 2.51580810546875, "eval_runtime": 26.315, "eval_samples_per_second": 52.48, "eval_steps_per_second": 1.672, "step": 98 }, { "epoch": 1.0205655526992288, "grad_norm": 55.495548248291016, "learning_rate": 2.999980386644447e-05, "loss": 2.3089, "step": 100 }, { "epoch": 1.1233933161953726, "grad_norm": 40.1395378112793, "learning_rate": 2.9996317190364217e-05, "loss": 2.3888, "step": 110 }, { "epoch": 1.2262210796915167, "grad_norm": 36.68174743652344, "learning_rate": 2.9988473156950084e-05, "loss": 2.3248, "step": 120 }, { "epoch": 1.3290488431876606, "grad_norm": 30.71067237854004, "learning_rate": 2.9976274045378772e-05, "loss": 2.3207, "step": 130 }, { "epoch": 1.4318766066838047, "grad_norm": 29.980504989624023, "learning_rate": 2.9959723400246436e-05, "loss": 2.0967, "step": 140 }, { "epoch": 1.5347043701799485, "grad_norm": 37.19841766357422, "learning_rate": 2.993882603053877e-05, "loss": 2.0943, "step": 150 }, { "epoch": 1.6375321336760926, "grad_norm": 28.6318302154541, "learning_rate": 2.991358800823372e-05, "loss": 2.2165, "step": 160 }, { "epoch": 1.7403598971722365, "grad_norm": 27.183704376220703, "learning_rate": 2.9884016666537176e-05, "loss": 2.0141, "step": 170 }, { "epoch": 1.8431876606683804, "grad_norm": 29.224483489990234, "learning_rate": 2.9850120597752234e-05, "loss": 1.8937, "step": 180 }, { "epoch": 1.9460154241645244, "grad_norm": 32.00334167480469, "learning_rate": 2.98119096507826e-05, "loss": 1.9402, "step": 190 }, { "epoch": 2.0, "eval_accuracy": 0.5170166545981173, "eval_loss": 1.7710092067718506, "eval_runtime": 26.3331, "eval_samples_per_second": 52.444, "eval_steps_per_second": 1.671, "step": 196 }, { "epoch": 2.0411311053984575, "grad_norm": 27.400991439819336, "learning_rate": 2.976939492827086e-05, "loss": 1.6206, "step": 200 }, { "epoch": 2.1439588688946016, "grad_norm": 36.989017486572266, "learning_rate": 2.972258878337251e-05, "loss": 1.6252, "step": 210 }, { "epoch": 2.2467866323907453, "grad_norm": 27.719881057739258, "learning_rate": 2.9671504816166558e-05, "loss": 1.6608, "step": 220 }, { "epoch": 2.3496143958868894, "grad_norm": 25.621631622314453, "learning_rate": 2.961615786970389e-05, "loss": 1.579, "step": 230 }, { "epoch": 2.4524421593830334, "grad_norm": 31.112272262573242, "learning_rate": 2.955656402569444e-05, "loss": 1.4814, "step": 240 }, { "epoch": 2.5552699228791775, "grad_norm": 34.653717041015625, "learning_rate": 2.9492740599834448e-05, "loss": 1.6241, "step": 250 }, { "epoch": 2.658097686375321, "grad_norm": 25.508359909057617, "learning_rate": 2.9424706136775197e-05, "loss": 1.5262, "step": 260 }, { "epoch": 2.7609254498714653, "grad_norm": 35.96934509277344, "learning_rate": 2.9352480404734622e-05, "loss": 1.4771, "step": 270 }, { "epoch": 2.8637532133676094, "grad_norm": 25.368663787841797, "learning_rate": 2.9276084389753448e-05, "loss": 1.5244, "step": 280 }, { "epoch": 2.966580976863753, "grad_norm": 28.23850440979004, "learning_rate": 2.9195540289597432e-05, "loss": 1.4938, "step": 290 }, { "epoch": 3.0, "eval_accuracy": 0.5995655322230268, "eval_loss": 1.4938641786575317, "eval_runtime": 26.2761, "eval_samples_per_second": 52.557, "eval_steps_per_second": 1.675, "step": 294 }, { "epoch": 3.0616966580976865, "grad_norm": 24.054737091064453, "learning_rate": 2.9110871507307534e-05, "loss": 1.2121, "step": 300 }, { "epoch": 3.16452442159383, "grad_norm": 30.993751525878906, "learning_rate": 2.902210264439989e-05, "loss": 1.1347, "step": 310 }, { "epoch": 3.2673521850899743, "grad_norm": 25.852996826171875, "learning_rate": 2.892925949371758e-05, "loss": 1.2519, "step": 320 }, { "epoch": 3.3701799485861184, "grad_norm": 25.922534942626953, "learning_rate": 2.883236903193615e-05, "loss": 1.1958, "step": 330 }, { "epoch": 3.4730077120822624, "grad_norm": 23.32210350036621, "learning_rate": 2.8731459411725296e-05, "loss": 1.3409, "step": 340 }, { "epoch": 3.575835475578406, "grad_norm": 27.312992095947266, "learning_rate": 2.8626559953568705e-05, "loss": 1.1584, "step": 350 }, { "epoch": 3.67866323907455, "grad_norm": 25.632417678833008, "learning_rate": 2.8517701137244673e-05, "loss": 1.2443, "step": 360 }, { "epoch": 3.781491002570694, "grad_norm": 30.503786087036133, "learning_rate": 2.840491459296986e-05, "loss": 1.1815, "step": 370 }, { "epoch": 3.884318766066838, "grad_norm": 23.747785568237305, "learning_rate": 2.8288233092208723e-05, "loss": 1.1836, "step": 380 }, { "epoch": 3.987146529562982, "grad_norm": 32.35744094848633, "learning_rate": 2.8167690538151403e-05, "loss": 1.1226, "step": 390 }, { "epoch": 4.0, "eval_accuracy": 0.6256335988414192, "eval_loss": 1.3168108463287354, "eval_runtime": 26.5015, "eval_samples_per_second": 52.11, "eval_steps_per_second": 1.66, "step": 392 }, { "epoch": 4.082262210796915, "grad_norm": 22.52780532836914, "learning_rate": 2.8043321955862732e-05, "loss": 0.9153, "step": 400 }, { "epoch": 4.185089974293059, "grad_norm": 27.999103546142578, "learning_rate": 2.7915163482105318e-05, "loss": 0.9045, "step": 410 }, { "epoch": 4.287917737789203, "grad_norm": 33.024505615234375, "learning_rate": 2.778325235483954e-05, "loss": 0.9925, "step": 420 }, { "epoch": 4.390745501285347, "grad_norm": 26.51447868347168, "learning_rate": 2.7647626902403654e-05, "loss": 0.9905, "step": 430 }, { "epoch": 4.4935732647814906, "grad_norm": 28.593852996826172, "learning_rate": 2.7508326532377033e-05, "loss": 1.0245, "step": 440 }, { "epoch": 4.596401028277635, "grad_norm": 21.522586822509766, "learning_rate": 2.7365391720129796e-05, "loss": 0.9811, "step": 450 }, { "epoch": 4.699228791773779, "grad_norm": 30.715417861938477, "learning_rate": 2.7218863997062262e-05, "loss": 0.9917, "step": 460 }, { "epoch": 4.802056555269923, "grad_norm": 22.439016342163086, "learning_rate": 2.706878593853748e-05, "loss": 1.0038, "step": 470 }, { "epoch": 4.904884318766067, "grad_norm": 27.715469360351562, "learning_rate": 2.6915201151510446e-05, "loss": 1.0389, "step": 480 }, { "epoch": 5.0, "grad_norm": 28.68787956237793, "learning_rate": 2.675815426185761e-05, "loss": 0.9329, "step": 490 }, { "epoch": 5.0, "eval_accuracy": 0.6705286024619841, "eval_loss": 1.1905739307403564, "eval_runtime": 26.5085, "eval_samples_per_second": 52.096, "eval_steps_per_second": 1.66, "step": 490 }, { "epoch": 5.102827763496144, "grad_norm": 29.815004348754883, "learning_rate": 2.6597690901410298e-05, "loss": 0.7627, "step": 500 }, { "epoch": 5.205655526992288, "grad_norm": 26.553739547729492, "learning_rate": 2.6433857694695817e-05, "loss": 0.7796, "step": 510 }, { "epoch": 5.308483290488432, "grad_norm": 27.53105354309082, "learning_rate": 2.626670224539017e-05, "loss": 0.758, "step": 520 }, { "epoch": 5.4113110539845755, "grad_norm": 25.38495635986328, "learning_rate": 2.6096273122486252e-05, "loss": 0.8178, "step": 530 }, { "epoch": 5.5141388174807195, "grad_norm": 22.48053550720215, "learning_rate": 2.592261984618152e-05, "loss": 0.7668, "step": 540 }, { "epoch": 5.616966580976864, "grad_norm": 23.422449111938477, "learning_rate": 2.574579287348936e-05, "loss": 0.7643, "step": 550 }, { "epoch": 5.719794344473008, "grad_norm": 27.490188598632812, "learning_rate": 2.5565843583578168e-05, "loss": 0.7943, "step": 560 }, { "epoch": 5.822622107969152, "grad_norm": 27.94679069519043, "learning_rate": 2.538282426284254e-05, "loss": 0.8702, "step": 570 }, { "epoch": 5.925449871465296, "grad_norm": 25.224809646606445, "learning_rate": 2.519678808971084e-05, "loss": 0.8039, "step": 580 }, { "epoch": 6.0, "eval_accuracy": 0.7067342505430847, "eval_loss": 1.0881775617599487, "eval_runtime": 26.4057, "eval_samples_per_second": 52.299, "eval_steps_per_second": 1.666, "step": 588 }, { "epoch": 6.020565552699229, "grad_norm": 18.887771606445312, "learning_rate": 2.5007789119193587e-05, "loss": 0.7444, "step": 590 }, { "epoch": 6.123393316195373, "grad_norm": 23.249862670898438, "learning_rate": 2.4815882267177104e-05, "loss": 0.6462, "step": 600 }, { "epoch": 6.226221079691516, "grad_norm": 26.49795913696289, "learning_rate": 2.46211232944671e-05, "loss": 0.6117, "step": 610 }, { "epoch": 6.32904884318766, "grad_norm": 23.819791793823242, "learning_rate": 2.4423568790586672e-05, "loss": 0.6539, "step": 620 }, { "epoch": 6.4318766066838045, "grad_norm": 21.4884090423584, "learning_rate": 2.422327615733357e-05, "loss": 0.706, "step": 630 }, { "epoch": 6.5347043701799485, "grad_norm": 22.128145217895508, "learning_rate": 2.4020303592101413e-05, "loss": 0.6729, "step": 640 }, { "epoch": 6.637532133676093, "grad_norm": 23.40184211730957, "learning_rate": 2.3814710070969765e-05, "loss": 0.5978, "step": 650 }, { "epoch": 6.740359897172237, "grad_norm": 22.87993621826172, "learning_rate": 2.3606555331567915e-05, "loss": 0.6987, "step": 660 }, { "epoch": 6.843187660668381, "grad_norm": 24.94782066345215, "learning_rate": 2.339589985571742e-05, "loss": 0.6858, "step": 670 }, { "epoch": 6.946015424164525, "grad_norm": 24.335712432861328, "learning_rate": 2.3182804851858394e-05, "loss": 0.6426, "step": 680 }, { "epoch": 7.0, "eval_accuracy": 0.6929761042722665, "eval_loss": 1.1060941219329834, "eval_runtime": 26.4299, "eval_samples_per_second": 52.251, "eval_steps_per_second": 1.665, "step": 686 }, { "epoch": 7.041131105398458, "grad_norm": 21.920074462890625, "learning_rate": 2.2967332237264662e-05, "loss": 0.5733, "step": 690 }, { "epoch": 7.143958868894601, "grad_norm": 24.461130142211914, "learning_rate": 2.274954462005297e-05, "loss": 0.5654, "step": 700 }, { "epoch": 7.246786632390745, "grad_norm": 23.279918670654297, "learning_rate": 2.2529505280991463e-05, "loss": 0.5236, "step": 710 }, { "epoch": 7.349614395886889, "grad_norm": 23.701425552368164, "learning_rate": 2.230727815511269e-05, "loss": 0.545, "step": 720 }, { "epoch": 7.4524421593830334, "grad_norm": 25.40784454345703, "learning_rate": 2.2082927813136554e-05, "loss": 0.5451, "step": 730 }, { "epoch": 7.5552699228791775, "grad_norm": 24.78341293334961, "learning_rate": 2.185651944270849e-05, "loss": 0.543, "step": 740 }, { "epoch": 7.658097686375322, "grad_norm": 21.24177360534668, "learning_rate": 2.162811882945846e-05, "loss": 0.5667, "step": 750 }, { "epoch": 7.760925449871465, "grad_norm": 21.494333267211914, "learning_rate": 2.1397792337886123e-05, "loss": 0.5617, "step": 760 }, { "epoch": 7.863753213367609, "grad_norm": 16.520523071289062, "learning_rate": 2.116560689207788e-05, "loss": 0.5535, "step": 770 }, { "epoch": 7.966580976863753, "grad_norm": 23.4096622467041, "learning_rate": 2.0931629956261258e-05, "loss": 0.5777, "step": 780 }, { "epoch": 8.0, "eval_accuracy": 0.722664735698769, "eval_loss": 1.0133363008499146, "eval_runtime": 26.1041, "eval_samples_per_second": 52.904, "eval_steps_per_second": 1.686, "step": 784 }, { "epoch": 8.061696658097686, "grad_norm": 16.616352081298828, "learning_rate": 2.0695929515202412e-05, "loss": 0.4645, "step": 790 }, { "epoch": 8.16452442159383, "grad_norm": 19.899398803710938, "learning_rate": 2.0458574054452316e-05, "loss": 0.4847, "step": 800 }, { "epoch": 8.267352185089974, "grad_norm": 15.741543769836426, "learning_rate": 2.021963254044749e-05, "loss": 0.4867, "step": 810 }, { "epoch": 8.370179948586118, "grad_norm": 19.081022262573242, "learning_rate": 1.9979174400471007e-05, "loss": 0.448, "step": 820 }, { "epoch": 8.473007712082262, "grad_norm": 24.90814208984375, "learning_rate": 1.973726950247953e-05, "loss": 0.5156, "step": 830 }, { "epoch": 8.575835475578407, "grad_norm": 21.650203704833984, "learning_rate": 1.949398813480239e-05, "loss": 0.4697, "step": 840 }, { "epoch": 8.67866323907455, "grad_norm": 17.731460571289062, "learning_rate": 1.9249400985718454e-05, "loss": 0.5156, "step": 850 }, { "epoch": 8.781491002570695, "grad_norm": 16.667068481445312, "learning_rate": 1.9003579122916845e-05, "loss": 0.4639, "step": 860 }, { "epoch": 8.884318766066839, "grad_norm": 20.13938331604004, "learning_rate": 1.8756593972847325e-05, "loss": 0.521, "step": 870 }, { "epoch": 8.987146529562981, "grad_norm": 17.58572769165039, "learning_rate": 1.850851729996656e-05, "loss": 0.477, "step": 880 }, { "epoch": 9.0, "eval_accuracy": 0.7364228819695873, "eval_loss": 0.9681397080421448, "eval_runtime": 26.1222, "eval_samples_per_second": 52.867, "eval_steps_per_second": 1.684, "step": 882 }, { "epoch": 9.082262210796916, "grad_norm": 16.83856201171875, "learning_rate": 1.8259421185886074e-05, "loss": 0.4023, "step": 890 }, { "epoch": 9.185089974293058, "grad_norm": 19.466602325439453, "learning_rate": 1.8009378008428077e-05, "loss": 0.3975, "step": 900 }, { "epoch": 9.287917737789202, "grad_norm": 22.649280548095703, "learning_rate": 1.7758460420595254e-05, "loss": 0.3872, "step": 910 }, { "epoch": 9.390745501285346, "grad_norm": 21.412601470947266, "learning_rate": 1.7506741329460565e-05, "loss": 0.4247, "step": 920 }, { "epoch": 9.49357326478149, "grad_norm": 18.724817276000977, "learning_rate": 1.7254293874983248e-05, "loss": 0.4287, "step": 930 }, { "epoch": 9.596401028277635, "grad_norm": 21.37314796447754, "learning_rate": 1.7001191408757095e-05, "loss": 0.4351, "step": 940 }, { "epoch": 9.699228791773779, "grad_norm": 17.494699478149414, "learning_rate": 1.6747507472697332e-05, "loss": 0.3724, "step": 950 }, { "epoch": 9.802056555269923, "grad_norm": 22.221105575561523, "learning_rate": 1.6493315777672117e-05, "loss": 0.3974, "step": 960 }, { "epoch": 9.904884318766067, "grad_norm": 19.51470947265625, "learning_rate": 1.623869018208499e-05, "loss": 0.461, "step": 970 }, { "epoch": 10.0, "grad_norm": 27.976699829101562, "learning_rate": 1.598370467041441e-05, "loss": 0.3961, "step": 980 }, { "epoch": 10.0, "eval_accuracy": 0.7581462708182476, "eval_loss": 0.9401519298553467, "eval_runtime": 26.3136, "eval_samples_per_second": 52.482, "eval_steps_per_second": 1.672, "step": 980 }, { "epoch": 10.102827763496144, "grad_norm": 16.193754196166992, "learning_rate": 1.5728433331716727e-05, "loss": 0.3237, "step": 990 }, { "epoch": 10.205655526992288, "grad_norm": 14.301216125488281, "learning_rate": 1.547295033809871e-05, "loss": 0.2887, "step": 1000 }, { "epoch": 10.308483290488432, "grad_norm": 17.22450828552246, "learning_rate": 1.5217329923165976e-05, "loss": 0.3241, "step": 1010 }, { "epoch": 10.411311053984576, "grad_norm": 13.385015487670898, "learning_rate": 1.496164636045352e-05, "loss": 0.3521, "step": 1020 }, { "epoch": 10.51413881748072, "grad_norm": 19.334781646728516, "learning_rate": 1.470597394184468e-05, "loss": 0.3667, "step": 1030 }, { "epoch": 10.616966580976865, "grad_norm": 20.484798431396484, "learning_rate": 1.4450386955984728e-05, "loss": 0.3679, "step": 1040 }, { "epoch": 10.719794344473009, "grad_norm": 17.106210708618164, "learning_rate": 1.4194959666695457e-05, "loss": 0.3714, "step": 1050 }, { "epoch": 10.822622107969151, "grad_norm": 15.57706356048584, "learning_rate": 1.3939766291396927e-05, "loss": 0.3258, "step": 1060 }, { "epoch": 10.925449871465295, "grad_norm": 23.57295799255371, "learning_rate": 1.3684880979542712e-05, "loss": 0.3451, "step": 1070 }, { "epoch": 11.0, "eval_accuracy": 0.7509051412020276, "eval_loss": 0.9310904741287231, "eval_runtime": 26.2357, "eval_samples_per_second": 52.638, "eval_steps_per_second": 1.677, "step": 1078 }, { "epoch": 11.020565552699228, "grad_norm": 17.988672256469727, "learning_rate": 1.3430377791074928e-05, "loss": 0.2902, "step": 1080 }, { "epoch": 11.123393316195372, "grad_norm": 18.770322799682617, "learning_rate": 1.3176330674905207e-05, "loss": 0.3051, "step": 1090 }, { "epoch": 11.226221079691516, "grad_norm": 13.406723022460938, "learning_rate": 1.292281344742798e-05, "loss": 0.2818, "step": 1100 }, { "epoch": 11.32904884318766, "grad_norm": 18.024940490722656, "learning_rate": 1.2669899771072257e-05, "loss": 0.2732, "step": 1110 }, { "epoch": 11.431876606683804, "grad_norm": 16.603891372680664, "learning_rate": 1.2417663132898128e-05, "loss": 0.2773, "step": 1120 }, { "epoch": 11.534704370179949, "grad_norm": 18.97417449951172, "learning_rate": 1.216617682324423e-05, "loss": 0.3026, "step": 1130 }, { "epoch": 11.637532133676093, "grad_norm": 22.57927894592285, "learning_rate": 1.191551391443238e-05, "loss": 0.258, "step": 1140 }, { "epoch": 11.740359897172237, "grad_norm": 20.432846069335938, "learning_rate": 1.1665747239535528e-05, "loss": 0.3085, "step": 1150 }, { "epoch": 11.84318766066838, "grad_norm": 15.402393341064453, "learning_rate": 1.141694937121528e-05, "loss": 0.2961, "step": 1160 }, { "epoch": 11.946015424164525, "grad_norm": 18.094825744628906, "learning_rate": 1.1169192600635021e-05, "loss": 0.337, "step": 1170 }, { "epoch": 12.0, "eval_accuracy": 0.7661115133960897, "eval_loss": 0.8897334933280945, "eval_runtime": 25.9718, "eval_samples_per_second": 53.173, "eval_steps_per_second": 1.694, "step": 1176 }, { "epoch": 12.041131105398458, "grad_norm": 14.735356330871582, "learning_rate": 1.0922548916454857e-05, "loss": 0.2553, "step": 1180 }, { "epoch": 12.143958868894602, "grad_norm": 12.189945220947266, "learning_rate": 1.067708998391451e-05, "loss": 0.2287, "step": 1190 }, { "epoch": 12.246786632390746, "grad_norm": 14.356574058532715, "learning_rate": 1.043288712401007e-05, "loss": 0.2511, "step": 1200 }, { "epoch": 12.34961439588689, "grad_norm": 14.989773750305176, "learning_rate": 1.0190011292770883e-05, "loss": 0.2551, "step": 1210 }, { "epoch": 12.452442159383033, "grad_norm": 32.5069694519043, "learning_rate": 9.94853306064241e-06, "loss": 0.2278, "step": 1220 }, { "epoch": 12.555269922879177, "grad_norm": 17.35454559326172, "learning_rate": 9.708522591981165e-06, "loss": 0.2962, "step": 1230 }, { "epoch": 12.65809768637532, "grad_norm": 13.15945053100586, "learning_rate": 9.470049624667632e-06, "loss": 0.2255, "step": 1240 }, { "epoch": 12.760925449871465, "grad_norm": 17.534391403198242, "learning_rate": 9.233183449843086e-06, "loss": 0.2376, "step": 1250 }, { "epoch": 12.863753213367609, "grad_norm": 13.61146354675293, "learning_rate": 8.997992891776227e-06, "loss": 0.2463, "step": 1260 }, { "epoch": 12.966580976863753, "grad_norm": 15.458810806274414, "learning_rate": 8.764546287865517e-06, "loss": 0.2348, "step": 1270 }, { "epoch": 13.0, "eval_accuracy": 0.776249094858798, "eval_loss": 0.8616446852684021, "eval_runtime": 25.9396, "eval_samples_per_second": 53.239, "eval_steps_per_second": 1.696, "step": 1274 }, { "epoch": 13.061696658097686, "grad_norm": 16.166946411132812, "learning_rate": 8.532911468782881e-06, "loss": 0.2166, "step": 1280 }, { "epoch": 13.16452442159383, "grad_norm": 17.747920989990234, "learning_rate": 8.303155738764756e-06, "loss": 0.2355, "step": 1290 }, { "epoch": 13.267352185089974, "grad_norm": 15.442453384399414, "learning_rate": 8.075345856056013e-06, "loss": 0.216, "step": 1300 }, { "epoch": 13.370179948586118, "grad_norm": 17.77224349975586, "learning_rate": 7.84954801351257e-06, "loss": 0.2375, "step": 1310 }, { "epoch": 13.473007712082262, "grad_norm": 18.008956909179688, "learning_rate": 7.6258278193682875e-06, "loss": 0.1905, "step": 1320 }, { "epoch": 13.575835475578407, "grad_norm": 14.17025089263916, "learning_rate": 7.404250278171692e-06, "loss": 0.2078, "step": 1330 }, { "epoch": 13.67866323907455, "grad_norm": 16.15502166748047, "learning_rate": 7.184879771898176e-06, "loss": 0.2307, "step": 1340 }, { "epoch": 13.781491002570695, "grad_norm": 7.362473487854004, "learning_rate": 6.967780041243015e-06, "loss": 0.2113, "step": 1350 }, { "epoch": 13.884318766066839, "grad_norm": 9.562990188598633, "learning_rate": 6.75301416710077e-06, "loss": 0.1878, "step": 1360 }, { "epoch": 13.987146529562981, "grad_norm": 16.989004135131836, "learning_rate": 6.540644552236401e-06, "loss": 0.1992, "step": 1370 }, { "epoch": 14.0, "eval_accuracy": 0.7950760318609703, "eval_loss": 0.8241131901741028, "eval_runtime": 26.153, "eval_samples_per_second": 52.805, "eval_steps_per_second": 1.682, "step": 1372 }, { "epoch": 14.082262210796916, "grad_norm": 12.781423568725586, "learning_rate": 6.330732903153384e-06, "loss": 0.1503, "step": 1380 }, { "epoch": 14.185089974293058, "grad_norm": 13.374728202819824, "learning_rate": 6.123340212164146e-06, "loss": 0.1858, "step": 1390 }, { "epoch": 14.287917737789202, "grad_norm": 15.162919044494629, "learning_rate": 5.918526739668083e-06, "loss": 0.1991, "step": 1400 }, { "epoch": 14.390745501285346, "grad_norm": 12.247350692749023, "learning_rate": 5.716351996642117e-06, "loss": 0.1912, "step": 1410 }, { "epoch": 14.49357326478149, "grad_norm": 14.756021499633789, "learning_rate": 5.516874727349141e-06, "loss": 0.1866, "step": 1420 }, { "epoch": 14.596401028277635, "grad_norm": 18.17853546142578, "learning_rate": 5.320152892269129e-06, "loss": 0.1869, "step": 1430 }, { "epoch": 14.699228791773779, "grad_norm": 14.249058723449707, "learning_rate": 5.126243651258055e-06, "loss": 0.2376, "step": 1440 }, { "epoch": 14.802056555269923, "grad_norm": 12.407234191894531, "learning_rate": 4.9352033469394415e-06, "loss": 0.1928, "step": 1450 }, { "epoch": 14.904884318766067, "grad_norm": 12.956231117248535, "learning_rate": 4.747087488333281e-06, "loss": 0.1479, "step": 1460 }, { "epoch": 15.0, "grad_norm": 7.936241626739502, "learning_rate": 4.561950734727316e-06, "loss": 0.182, "step": 1470 }, { "epoch": 15.0, "eval_accuracy": 0.7878349022447502, "eval_loss": 0.8311899304389954, "eval_runtime": 28.981, "eval_samples_per_second": 47.652, "eval_steps_per_second": 1.518, "step": 1470 }, { "epoch": 15.102827763496144, "grad_norm": 9.052491188049316, "learning_rate": 4.379846879795069e-06, "loss": 0.1661, "step": 1480 }, { "epoch": 15.205655526992288, "grad_norm": 16.475849151611328, "learning_rate": 4.2008288359654864e-06, "loss": 0.1934, "step": 1490 }, { "epoch": 15.308483290488432, "grad_norm": 19.091060638427734, "learning_rate": 4.0249486190486075e-06, "loss": 0.1804, "step": 1500 }, { "epoch": 15.411311053984576, "grad_norm": 17.119796752929688, "learning_rate": 3.85225733312174e-06, "loss": 0.1448, "step": 1510 }, { "epoch": 15.51413881748072, "grad_norm": 15.315802574157715, "learning_rate": 3.6828051556805735e-06, "loss": 0.2012, "step": 1520 }, { "epoch": 15.616966580976865, "grad_norm": 18.2623348236084, "learning_rate": 3.5166413230595308e-06, "loss": 0.1906, "step": 1530 }, { "epoch": 15.719794344473009, "grad_norm": 11.490309715270996, "learning_rate": 3.353814116125521e-06, "loss": 0.1736, "step": 1540 }, { "epoch": 15.822622107969151, "grad_norm": 11.701955795288086, "learning_rate": 3.194370846249432e-06, "loss": 0.1627, "step": 1550 }, { "epoch": 15.925449871465295, "grad_norm": 9.927034378051758, "learning_rate": 3.038357841559191e-06, "loss": 0.1556, "step": 1560 }, { "epoch": 16.0, "eval_accuracy": 0.7856625633598842, "eval_loss": 0.8245254755020142, "eval_runtime": 26.2299, "eval_samples_per_second": 52.65, "eval_steps_per_second": 1.677, "step": 1568 }, { "epoch": 16.020565552699228, "grad_norm": 9.595512390136719, "learning_rate": 2.885820433478605e-06, "loss": 0.1227, "step": 1570 }, { "epoch": 16.123393316195372, "grad_norm": 14.648282051086426, "learning_rate": 2.7368029435557895e-06, "loss": 0.1805, "step": 1580 }, { "epoch": 16.226221079691516, "grad_norm": 7.373770713806152, "learning_rate": 2.591348670584994e-06, "loss": 0.1519, "step": 1590 }, { "epoch": 16.32904884318766, "grad_norm": 15.068218231201172, "learning_rate": 2.449499878025648e-06, "loss": 0.1684, "step": 1600 }, { "epoch": 16.431876606683804, "grad_norm": 16.052967071533203, "learning_rate": 2.3112977817221876e-06, "loss": 0.1806, "step": 1610 }, { "epoch": 16.53470437017995, "grad_norm": 10.597209930419922, "learning_rate": 2.1767825379283266e-06, "loss": 0.1712, "step": 1620 }, { "epoch": 16.637532133676093, "grad_norm": 10.147574424743652, "learning_rate": 2.0459932316391843e-06, "loss": 0.162, "step": 1630 }, { "epoch": 16.740359897172237, "grad_norm": 10.345613479614258, "learning_rate": 1.918967865234653e-06, "loss": 0.1487, "step": 1640 }, { "epoch": 16.84318766066838, "grad_norm": 12.96136474609375, "learning_rate": 1.7957433474373797e-06, "loss": 0.1459, "step": 1650 }, { "epoch": 16.946015424164525, "grad_norm": 10.343660354614258, "learning_rate": 1.6763554825884959e-06, "loss": 0.1516, "step": 1660 }, { "epoch": 17.0, "eval_accuracy": 0.7958001448225923, "eval_loss": 0.8169917464256287, "eval_runtime": 27.2569, "eval_samples_per_second": 50.666, "eval_steps_per_second": 1.614, "step": 1666 }, { "epoch": 17.041131105398456, "grad_norm": 10.28726863861084, "learning_rate": 1.5608389602442308e-06, "loss": 0.1576, "step": 1670 }, { "epoch": 17.1439588688946, "grad_norm": 9.736518859863281, "learning_rate": 1.4492273450964654e-06, "loss": 0.1522, "step": 1680 }, { "epoch": 17.246786632390744, "grad_norm": 15.01709270477295, "learning_rate": 1.3415530672201054e-06, "loss": 0.1432, "step": 1690 }, { "epoch": 17.34961439588689, "grad_norm": 13.26907730102539, "learning_rate": 1.2378474126501415e-06, "loss": 0.1263, "step": 1700 }, { "epoch": 17.452442159383033, "grad_norm": 16.161386489868164, "learning_rate": 1.1381405142911305e-06, "loss": 0.1774, "step": 1710 }, { "epoch": 17.555269922879177, "grad_norm": 13.573271751403809, "learning_rate": 1.0424613431617014e-06, "loss": 0.153, "step": 1720 }, { "epoch": 17.65809768637532, "grad_norm": 10.271599769592285, "learning_rate": 9.50837699976711e-07, "loss": 0.1148, "step": 1730 }, { "epoch": 17.760925449871465, "grad_norm": 10.907147407531738, "learning_rate": 8.63296207069394e-07, "loss": 0.1278, "step": 1740 }, { "epoch": 17.86375321336761, "grad_norm": 9.800398826599121, "learning_rate": 7.798623006559436e-07, "loss": 0.1592, "step": 1750 }, { "epoch": 17.966580976863753, "grad_norm": 11.684863090515137, "learning_rate": 7.005602234447122e-07, "loss": 0.1569, "step": 1760 }, { "epoch": 18.0, "eval_accuracy": 0.7878349022447502, "eval_loss": 0.8201833367347717, "eval_runtime": 26.1387, "eval_samples_per_second": 52.834, "eval_steps_per_second": 1.683, "step": 1764 }, { "epoch": 18.061696658097688, "grad_norm": 11.532236099243164, "learning_rate": 6.254130175922062e-07, "loss": 0.1098, "step": 1770 }, { "epoch": 18.164524421593832, "grad_norm": 16.26333236694336, "learning_rate": 5.544425180079144e-07, "loss": 0.1543, "step": 1780 }, { "epoch": 18.267352185089976, "grad_norm": 14.378057479858398, "learning_rate": 4.876693460099213e-07, "loss": 0.1647, "step": 1790 }, { "epoch": 18.370179948586117, "grad_norm": 10.328337669372559, "learning_rate": 4.251129033331341e-07, "loss": 0.1485, "step": 1800 }, { "epoch": 18.47300771208226, "grad_norm": 12.40806770324707, "learning_rate": 3.667913664918815e-07, "loss": 0.1472, "step": 1810 }, { "epoch": 18.575835475578405, "grad_norm": 12.186408042907715, "learning_rate": 3.127216814985118e-07, "loss": 0.1372, "step": 1820 }, { "epoch": 18.67866323907455, "grad_norm": 13.199813842773438, "learning_rate": 2.6291955893952504e-07, "loss": 0.1543, "step": 1830 }, { "epoch": 18.781491002570693, "grad_norm": 13.735506057739258, "learning_rate": 2.1739946941068457e-07, "loss": 0.16, "step": 1840 }, { "epoch": 18.884318766066837, "grad_norm": 9.509590148925781, "learning_rate": 1.7617463931240697e-07, "loss": 0.1481, "step": 1850 }, { "epoch": 18.98714652956298, "grad_norm": 9.935129165649414, "learning_rate": 1.3925704700667952e-07, "loss": 0.1364, "step": 1860 }, { "epoch": 19.0, "eval_accuracy": 0.7950760318609703, "eval_loss": 0.8116512298583984, "eval_runtime": 26.0801, "eval_samples_per_second": 52.952, "eval_steps_per_second": 1.687, "step": 1862 }, { "epoch": 19.082262210796916, "grad_norm": 9.694613456726074, "learning_rate": 1.0665741933660267e-07, "loss": 0.1266, "step": 1870 }, { "epoch": 19.18508997429306, "grad_norm": 11.092907905578613, "learning_rate": 7.838522850957819e-08, "loss": 0.1127, "step": 1880 }, { "epoch": 19.287917737789204, "grad_norm": 14.780961036682129, "learning_rate": 5.444868934505076e-08, "loss": 0.1418, "step": 1890 }, { "epoch": 19.39074550128535, "grad_norm": 11.33579158782959, "learning_rate": 3.485475688759232e-08, "loss": 0.1376, "step": 1900 }, { "epoch": 19.493573264781492, "grad_norm": 12.346114158630371, "learning_rate": 1.9609124386033572e-08, "loss": 0.1212, "step": 1910 }, { "epoch": 19.596401028277636, "grad_norm": 17.068004608154297, "learning_rate": 8.71622163922059e-09, "loss": 0.1705, "step": 1920 }, { "epoch": 19.69922879177378, "grad_norm": 12.213850021362305, "learning_rate": 2.179213708884409e-09, "loss": 0.1737, "step": 1930 }, { "epoch": 19.802056555269925, "grad_norm": 11.974803924560547, "learning_rate": 0.0, "loss": 0.1427, "step": 1940 }, { "epoch": 19.802056555269925, "eval_accuracy": 0.7958001448225923, "eval_loss": 0.8118759393692017, "eval_runtime": 26.3909, "eval_samples_per_second": 52.329, "eval_steps_per_second": 1.667, "step": 1940 }, { "epoch": 19.802056555269925, "step": 1940, "total_flos": 2.516976501822849e+19, "train_loss": 0.7143215845540627, "train_runtime": 8041.4021, "train_samples_per_second": 30.9, "train_steps_per_second": 0.241 } ], "logging_steps": 10, "max_steps": 1940, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.516976501822849e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }