{ "best_metric": 3.070533275604248, "best_model_checkpoint": "./distilled3/checkpoint-46000", "epoch": 1.7583705765990183, "eval_steps": 2000, "global_step": 48000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "combined_loss": 13.355602264404297, "distill_loss": 1.4010732173919678, "epoch": 0, "step": 0, "student_mlm_loss": 25.310131072998047 }, { "epoch": 0.003663272034581288, "grad_norm": 11.128765106201172, "learning_rate": 1e-05, "loss": 17.4544, "step": 100 }, { "combined_loss": 9.379831314086914, "distill_loss": 1.5227235555648804, "epoch": 0.003663272034581288, "step": 100, "student_mlm_loss": 17.2369384765625 }, { "epoch": 0.007326544069162576, "grad_norm": 14.151921272277832, "learning_rate": 2e-05, "loss": 16.0099, "step": 200 }, { "combined_loss": 28.136512756347656, "distill_loss": 1.571045160293579, "epoch": 0.007326544069162576, "step": 200, "student_mlm_loss": 54.70198059082031 }, { "epoch": 0.010989816103743864, "grad_norm": 11.68195915222168, "learning_rate": 3e-05, "loss": 18.8223, "step": 300 }, { "combined_loss": 15.699158668518066, "distill_loss": 1.5519400835037231, "epoch": 0.010989816103743864, "step": 300, "student_mlm_loss": 29.846376419067383 }, { "epoch": 0.014653088138325152, "grad_norm": 8.982569694519043, "learning_rate": 4e-05, "loss": 16.9008, "step": 400 }, { "combined_loss": 3.035900592803955, "distill_loss": 1.4880340099334717, "epoch": 0.014653088138325152, "step": 400, "student_mlm_loss": 4.583766937255859 }, { "epoch": 0.01831636017290644, "grad_norm": 7.045658111572266, "learning_rate": 5e-05, "loss": 8.812, "step": 500 }, { "combined_loss": 7.002770900726318, "distill_loss": 1.351847529411316, "epoch": 0.01831636017290644, "step": 500, "student_mlm_loss": 12.653694152832031 }, { "epoch": 0.021979632207487727, "grad_norm": 4.265043258666992, "learning_rate": 4.9938570410595373e-05, "loss": 16.8853, "step": 600 }, { "combined_loss": 3.2060928344726562, "distill_loss": 1.2962806224822998, "epoch": 0.021979632207487727, "step": 600, "student_mlm_loss": 5.115904808044434 }, { "epoch": 0.025642904242069015, "grad_norm": 7.744924545288086, "learning_rate": 4.987714082119075e-05, "loss": 7.1609, "step": 700 }, { "combined_loss": 2.2816712856292725, "distill_loss": 1.5105196237564087, "epoch": 0.025642904242069015, "step": 700, "student_mlm_loss": 3.052823066711426 }, { "epoch": 0.029306176276650303, "grad_norm": 12.44052791595459, "learning_rate": 4.981571123178613e-05, "loss": 13.0471, "step": 800 }, { "combined_loss": 3.225351095199585, "distill_loss": 1.5753816366195679, "epoch": 0.029306176276650303, "step": 800, "student_mlm_loss": 4.8753204345703125 }, { "epoch": 0.032969448311231594, "grad_norm": 6.2059645652771, "learning_rate": 4.975428164238151e-05, "loss": 6.2833, "step": 900 }, { "combined_loss": 8.580605506896973, "distill_loss": 1.530474066734314, "epoch": 0.032969448311231594, "step": 900, "student_mlm_loss": 15.630736351013184 }, { "epoch": 0.03663272034581288, "grad_norm": 14.731459617614746, "learning_rate": 4.969285205297688e-05, "loss": 5.8549, "step": 1000 }, { "combined_loss": 3.7085845470428467, "distill_loss": 1.4659323692321777, "epoch": 0.03663272034581288, "step": 1000, "student_mlm_loss": 5.951236724853516 }, { "epoch": 0.04029599238039417, "grad_norm": 9.745060920715332, "learning_rate": 4.9631422463572256e-05, "loss": 5.174, "step": 1100 }, { "combined_loss": 4.752764701843262, "distill_loss": 1.4000483751296997, "epoch": 0.04029599238039417, "step": 1100, "student_mlm_loss": 8.105481147766113 }, { "epoch": 0.043959264414975455, "grad_norm": 13.801424026489258, "learning_rate": 4.9569992874167634e-05, "loss": 19.8368, "step": 1200 }, { "combined_loss": 3.1324005126953125, "distill_loss": 1.404078483581543, "epoch": 0.043959264414975455, "step": 1200, "student_mlm_loss": 4.860722541809082 }, { "epoch": 0.047622536449556746, "grad_norm": 52.244632720947266, "learning_rate": 4.9508563284763005e-05, "loss": 5.547, "step": 1300 }, { "combined_loss": 3.1176328659057617, "distill_loss": 1.3057805299758911, "epoch": 0.047622536449556746, "step": 1300, "student_mlm_loss": 4.929485321044922 }, { "epoch": 0.05128580848413803, "grad_norm": 47.002349853515625, "learning_rate": 4.944713369535838e-05, "loss": 4.7784, "step": 1400 }, { "combined_loss": 3.871903657913208, "distill_loss": 1.5537463426589966, "epoch": 0.05128580848413803, "step": 1400, "student_mlm_loss": 6.190061092376709 }, { "epoch": 0.05494908051871932, "grad_norm": 11.417911529541016, "learning_rate": 4.9385704105953754e-05, "loss": 5.9593, "step": 1500 }, { "combined_loss": 6.293668270111084, "distill_loss": 1.3082151412963867, "epoch": 0.05494908051871932, "step": 1500, "student_mlm_loss": 11.279121398925781 }, { "epoch": 0.058612352553300606, "grad_norm": 24.519105911254883, "learning_rate": 4.932427451654914e-05, "loss": 7.2762, "step": 1600 }, { "combined_loss": 3.350501775741577, "distill_loss": 1.4593900442123413, "epoch": 0.058612352553300606, "step": 1600, "student_mlm_loss": 5.241613388061523 }, { "epoch": 0.0622756245878819, "grad_norm": 42.58499526977539, "learning_rate": 4.926284492714451e-05, "loss": 7.1364, "step": 1700 }, { "combined_loss": 10.976073265075684, "distill_loss": 1.594639539718628, "epoch": 0.0622756245878819, "step": 1700, "student_mlm_loss": 20.357507705688477 }, { "epoch": 0.06593889662246319, "grad_norm": 105.27689361572266, "learning_rate": 4.920141533773989e-05, "loss": 5.7662, "step": 1800 }, { "combined_loss": 4.272126197814941, "distill_loss": 1.4649100303649902, "epoch": 0.06593889662246319, "step": 1800, "student_mlm_loss": 7.079341888427734 }, { "epoch": 0.06960216865704447, "grad_norm": 9.272991180419922, "learning_rate": 4.913998574833526e-05, "loss": 4.9898, "step": 1900 }, { "combined_loss": 2.2884514331817627, "distill_loss": 1.5105092525482178, "epoch": 0.06960216865704447, "step": 1900, "student_mlm_loss": 3.0663936138153076 }, { "epoch": 0.07326544069162576, "grad_norm": 15.299578666687012, "learning_rate": 4.9078556158930636e-05, "loss": 6.8909, "step": 2000 }, { "epoch": 0.07326544069162576, "eval_loss": 6.166979789733887, "eval_runtime": 2.1158, "eval_samples_per_second": 3306.616, "eval_steps_per_second": 13.234, "step": 2000 }, { "combined_loss": 5.612101078033447, "distill_loss": 1.332657814025879, "epoch": 0.07326544069162576, "step": 2000, "student_mlm_loss": 9.891544342041016 }, { "epoch": 0.07692871272620705, "grad_norm": 12.242279052734375, "learning_rate": 4.9017126569526014e-05, "loss": 8.6608, "step": 2100 }, { "combined_loss": 2.035828113555908, "distill_loss": 1.3731106519699097, "epoch": 0.07692871272620705, "step": 2100, "student_mlm_loss": 2.6985456943511963 }, { "epoch": 0.08059198476078834, "grad_norm": 27.212379455566406, "learning_rate": 4.8955696980121385e-05, "loss": 9.4649, "step": 2200 }, { "combined_loss": 2.5593996047973633, "distill_loss": 1.5456775426864624, "epoch": 0.08059198476078834, "step": 2200, "student_mlm_loss": 3.5731217861175537 }, { "epoch": 0.08425525679536962, "grad_norm": 9.444129943847656, "learning_rate": 4.889426739071676e-05, "loss": 12.6304, "step": 2300 }, { "combined_loss": 3.0112435817718506, "distill_loss": 1.268593192100525, "epoch": 0.08425525679536962, "step": 2300, "student_mlm_loss": 4.753893852233887 }, { "epoch": 0.08791852882995091, "grad_norm": 6.72172212600708, "learning_rate": 4.8832837801312134e-05, "loss": 4.2453, "step": 2400 }, { "combined_loss": 2.3823843002319336, "distill_loss": 1.3674836158752441, "epoch": 0.08791852882995091, "step": 2400, "student_mlm_loss": 3.397284984588623 }, { "epoch": 0.0915818008645322, "grad_norm": 88.5478744506836, "learning_rate": 4.877140821190752e-05, "loss": 4.6849, "step": 2500 }, { "combined_loss": 3.8919034004211426, "distill_loss": 1.523806095123291, "epoch": 0.0915818008645322, "step": 2500, "student_mlm_loss": 6.260000705718994 }, { "epoch": 0.09524507289911349, "grad_norm": 11.671692848205566, "learning_rate": 4.870997862250289e-05, "loss": 4.8686, "step": 2600 }, { "combined_loss": 2.8186635971069336, "distill_loss": 1.313085913658142, "epoch": 0.09524507289911349, "step": 2600, "student_mlm_loss": 4.3242411613464355 }, { "epoch": 0.09890834493369477, "grad_norm": 7.681136131286621, "learning_rate": 4.864854903309827e-05, "loss": 14.7468, "step": 2700 }, { "combined_loss": 2.6350021362304688, "distill_loss": 1.5300695896148682, "epoch": 0.09890834493369477, "step": 2700, "student_mlm_loss": 3.7399346828460693 }, { "epoch": 0.10257161696827606, "grad_norm": 10.245522499084473, "learning_rate": 4.858711944369364e-05, "loss": 4.7465, "step": 2800 }, { "combined_loss": 1.9805179834365845, "distill_loss": 1.3671844005584717, "epoch": 0.10257161696827606, "step": 2800, "student_mlm_loss": 2.5938515663146973 }, { "epoch": 0.10623488900285735, "grad_norm": 51.705352783203125, "learning_rate": 4.8525689854289016e-05, "loss": 3.8985, "step": 2900 }, { "combined_loss": 1.9335501194000244, "distill_loss": 1.3294615745544434, "epoch": 0.10623488900285735, "step": 2900, "student_mlm_loss": 2.5376386642456055 }, { "epoch": 0.10989816103743864, "grad_norm": 7.661074161529541, "learning_rate": 4.8464260264884394e-05, "loss": 3.9846, "step": 3000 }, { "combined_loss": 2.815329074859619, "distill_loss": 1.5120948553085327, "epoch": 0.10989816103743864, "step": 3000, "student_mlm_loss": 4.118563175201416 }, { "epoch": 0.11356143307201993, "grad_norm": 3.9512596130371094, "learning_rate": 4.8402830675479765e-05, "loss": 5.6509, "step": 3100 }, { "combined_loss": 5.329846382141113, "distill_loss": 1.5839005708694458, "epoch": 0.11356143307201993, "step": 3100, "student_mlm_loss": 9.07579231262207 }, { "epoch": 0.11722470510660121, "grad_norm": 21.47922134399414, "learning_rate": 4.834140108607514e-05, "loss": 4.5437, "step": 3200 }, { "combined_loss": 3.32517147064209, "distill_loss": 1.4834882020950317, "epoch": 0.11722470510660121, "step": 3200, "student_mlm_loss": 5.1668548583984375 }, { "epoch": 0.1208879771411825, "grad_norm": 11.865033149719238, "learning_rate": 4.827997149667052e-05, "loss": 5.0218, "step": 3300 }, { "combined_loss": 2.84318208694458, "distill_loss": 1.302217960357666, "epoch": 0.1208879771411825, "step": 3300, "student_mlm_loss": 4.384146213531494 }, { "epoch": 0.1245512491757638, "grad_norm": 13.824487686157227, "learning_rate": 4.82185419072659e-05, "loss": 33.2949, "step": 3400 }, { "combined_loss": 2.065192937850952, "distill_loss": 1.3474924564361572, "epoch": 0.1245512491757638, "step": 3400, "student_mlm_loss": 2.782893419265747 }, { "epoch": 0.12821452121034507, "grad_norm": 34.21382522583008, "learning_rate": 4.815711231786127e-05, "loss": 12.5775, "step": 3500 }, { "combined_loss": 2.2148988246917725, "distill_loss": 1.616875171661377, "epoch": 0.12821452121034507, "step": 3500, "student_mlm_loss": 2.812922477722168 }, { "epoch": 0.13187779324492638, "grad_norm": 8.859841346740723, "learning_rate": 4.809568272845665e-05, "loss": 4.6975, "step": 3600 }, { "combined_loss": 4.478976726531982, "distill_loss": 1.3554083108901978, "epoch": 0.13187779324492638, "step": 3600, "student_mlm_loss": 7.602544784545898 }, { "epoch": 0.13554106527950766, "grad_norm": 12.680179595947266, "learning_rate": 4.803425313905202e-05, "loss": 4.5414, "step": 3700 }, { "combined_loss": 6.908867835998535, "distill_loss": 1.3570021390914917, "epoch": 0.13554106527950766, "step": 3700, "student_mlm_loss": 12.460733413696289 }, { "epoch": 0.13920433731408893, "grad_norm": 18.478200912475586, "learning_rate": 4.7972823549647396e-05, "loss": 35.1443, "step": 3800 }, { "combined_loss": 13.97608757019043, "distill_loss": 1.418832778930664, "epoch": 0.13920433731408893, "step": 3800, "student_mlm_loss": 26.533342361450195 }, { "epoch": 0.14286760934867024, "grad_norm": 10.53610897064209, "learning_rate": 4.7911393960242774e-05, "loss": 13.766, "step": 3900 }, { "combined_loss": 2.1997413635253906, "distill_loss": 1.4529953002929688, "epoch": 0.14286760934867024, "step": 3900, "student_mlm_loss": 2.9464874267578125 }, { "epoch": 0.14653088138325152, "grad_norm": 42.095558166503906, "learning_rate": 4.7849964370838145e-05, "loss": 3.297, "step": 4000 }, { "epoch": 0.14653088138325152, "eval_loss": 4.568027496337891, "eval_runtime": 2.0693, "eval_samples_per_second": 3380.818, "eval_steps_per_second": 13.531, "step": 4000 }, { "combined_loss": 2.278163433074951, "distill_loss": 1.5395259857177734, "epoch": 0.14653088138325152, "step": 4000, "student_mlm_loss": 3.016800880432129 }, { "epoch": 0.15019415341783282, "grad_norm": 15.655592918395996, "learning_rate": 4.778853478143352e-05, "loss": 4.5795, "step": 4100 }, { "combined_loss": 2.117962598800659, "distill_loss": 1.5073814392089844, "epoch": 0.15019415341783282, "step": 4100, "student_mlm_loss": 2.728543758392334 }, { "epoch": 0.1538574254524141, "grad_norm": 9.47999382019043, "learning_rate": 4.77271051920289e-05, "loss": 4.6384, "step": 4200 }, { "combined_loss": 2.2614216804504395, "distill_loss": 1.3999947309494019, "epoch": 0.1538574254524141, "step": 4200, "student_mlm_loss": 3.1228485107421875 }, { "epoch": 0.15752069748699538, "grad_norm": 12.137129783630371, "learning_rate": 4.766567560262428e-05, "loss": 3.6101, "step": 4300 }, { "combined_loss": 1.9776763916015625, "distill_loss": 1.4785245656967163, "epoch": 0.15752069748699538, "step": 4300, "student_mlm_loss": 2.476828098297119 }, { "epoch": 0.16118396952157668, "grad_norm": 74.8094253540039, "learning_rate": 4.760424601321965e-05, "loss": 4.9111, "step": 4400 }, { "combined_loss": 3.0158274173736572, "distill_loss": 1.2940564155578613, "epoch": 0.16118396952157668, "step": 4400, "student_mlm_loss": 4.737598419189453 }, { "epoch": 0.16484724155615796, "grad_norm": 5.339694499969482, "learning_rate": 4.754281642381502e-05, "loss": 3.4013, "step": 4500 }, { "combined_loss": 2.176065683364868, "distill_loss": 1.5688632726669312, "epoch": 0.16484724155615796, "step": 4500, "student_mlm_loss": 2.7832682132720947 }, { "epoch": 0.16851051359073924, "grad_norm": 12.745500564575195, "learning_rate": 4.74813868344104e-05, "loss": 3.1244, "step": 4600 }, { "combined_loss": 2.4230682849884033, "distill_loss": 1.46636962890625, "epoch": 0.16851051359073924, "step": 4600, "student_mlm_loss": 3.3797669410705566 }, { "epoch": 0.17217378562532054, "grad_norm": 14.515507698059082, "learning_rate": 4.7419957245005777e-05, "loss": 4.9862, "step": 4700 }, { "combined_loss": 6.772428512573242, "distill_loss": 1.6445391178131104, "epoch": 0.17217378562532054, "step": 4700, "student_mlm_loss": 11.900318145751953 }, { "epoch": 0.17583705765990182, "grad_norm": 10.036664962768555, "learning_rate": 4.7358527655601154e-05, "loss": 3.72, "step": 4800 }, { "combined_loss": 27.606048583984375, "distill_loss": 1.4302338361740112, "epoch": 0.17583705765990182, "step": 4800, "student_mlm_loss": 53.781864166259766 }, { "epoch": 0.17950032969448312, "grad_norm": 14.220582008361816, "learning_rate": 4.7297098066196525e-05, "loss": 9.0684, "step": 4900 }, { "combined_loss": 7.97739839553833, "distill_loss": 1.4764257669448853, "epoch": 0.17950032969448312, "step": 4900, "student_mlm_loss": 14.478370666503906 }, { "epoch": 0.1831636017290644, "grad_norm": 8.734748840332031, "learning_rate": 4.72356684767919e-05, "loss": 13.2974, "step": 5000 }, { "combined_loss": 3.3007736206054688, "distill_loss": 1.5111989974975586, "epoch": 0.1831636017290644, "step": 5000, "student_mlm_loss": 5.090348243713379 }, { "epoch": 0.18682687376364568, "grad_norm": 23.457653045654297, "learning_rate": 4.717423888738728e-05, "loss": 4.4811, "step": 5100 }, { "combined_loss": 2.695789337158203, "distill_loss": 1.4495799541473389, "epoch": 0.18682687376364568, "step": 5100, "student_mlm_loss": 3.9419989585876465 }, { "epoch": 0.19049014579822698, "grad_norm": 11.504470825195312, "learning_rate": 4.711280929798265e-05, "loss": 3.2576, "step": 5200 }, { "combined_loss": 3.5765743255615234, "distill_loss": 1.3500127792358398, "epoch": 0.19049014579822698, "step": 5200, "student_mlm_loss": 5.803135871887207 }, { "epoch": 0.19415341783280826, "grad_norm": 34.68207550048828, "learning_rate": 4.705137970857803e-05, "loss": 5.8403, "step": 5300 }, { "combined_loss": 4.304483413696289, "distill_loss": 1.4075747728347778, "epoch": 0.19415341783280826, "step": 5300, "student_mlm_loss": 7.20139217376709 }, { "epoch": 0.19781668986738954, "grad_norm": 22.416582107543945, "learning_rate": 4.69899501191734e-05, "loss": 4.045, "step": 5400 }, { "combined_loss": 1.9111289978027344, "distill_loss": 1.321276307106018, "epoch": 0.19781668986738954, "step": 5400, "student_mlm_loss": 2.500981569290161 }, { "epoch": 0.20147996190197084, "grad_norm": 27.66775894165039, "learning_rate": 4.6928520529768786e-05, "loss": 3.8896, "step": 5500 }, { "combined_loss": 2.142390251159668, "distill_loss": 1.4025957584381104, "epoch": 0.20147996190197084, "step": 5500, "student_mlm_loss": 2.8821845054626465 }, { "epoch": 0.20514323393655212, "grad_norm": 35.84339141845703, "learning_rate": 4.686709094036416e-05, "loss": 4.94, "step": 5600 }, { "combined_loss": 2.1642816066741943, "distill_loss": 1.392912745475769, "epoch": 0.20514323393655212, "step": 5600, "student_mlm_loss": 2.935650587081909 }, { "epoch": 0.20880650597113343, "grad_norm": 18.43452262878418, "learning_rate": 4.6805661350959535e-05, "loss": 7.4575, "step": 5700 }, { "combined_loss": 2.354356288909912, "distill_loss": 1.3411612510681152, "epoch": 0.20880650597113343, "step": 5700, "student_mlm_loss": 3.36755108833313 }, { "epoch": 0.2124697780057147, "grad_norm": 5.364467144012451, "learning_rate": 4.6744231761554906e-05, "loss": 3.2172, "step": 5800 }, { "combined_loss": 2.129748821258545, "distill_loss": 1.4555408954620361, "epoch": 0.2124697780057147, "step": 5800, "student_mlm_loss": 2.8039567470550537 }, { "epoch": 0.21613305004029598, "grad_norm": 12.704414367675781, "learning_rate": 4.6682802172150283e-05, "loss": 9.9214, "step": 5900 }, { "combined_loss": 5.396609783172607, "distill_loss": 1.3954136371612549, "epoch": 0.21613305004029598, "step": 5900, "student_mlm_loss": 9.397806167602539 }, { "epoch": 0.2197963220748773, "grad_norm": 9.411243438720703, "learning_rate": 4.662137258274566e-05, "loss": 4.6268, "step": 6000 }, { "epoch": 0.2197963220748773, "eval_loss": 4.474331855773926, "eval_runtime": 2.0765, "eval_samples_per_second": 3369.116, "eval_steps_per_second": 13.484, "step": 6000 }, { "combined_loss": 2.3863794803619385, "distill_loss": 1.4665789604187012, "epoch": 0.2197963220748773, "step": 6000, "student_mlm_loss": 3.306180000305176 }, { "epoch": 0.22345959410945856, "grad_norm": 15.34604263305664, "learning_rate": 4.655994299334103e-05, "loss": 3.586, "step": 6100 }, { "combined_loss": 2.5740702152252197, "distill_loss": 1.5186127424240112, "epoch": 0.22345959410945856, "step": 6100, "student_mlm_loss": 3.6295275688171387 }, { "epoch": 0.22712286614403987, "grad_norm": 10.821826934814453, "learning_rate": 4.649851340393641e-05, "loss": 5.516, "step": 6200 }, { "combined_loss": 4.770940780639648, "distill_loss": 1.5328683853149414, "epoch": 0.22712286614403987, "step": 6200, "student_mlm_loss": 8.009013175964355 }, { "epoch": 0.23078613817862115, "grad_norm": 45.33203887939453, "learning_rate": 4.643708381453178e-05, "loss": 6.4937, "step": 6300 }, { "combined_loss": 2.257235050201416, "distill_loss": 1.4594223499298096, "epoch": 0.23078613817862115, "step": 6300, "student_mlm_loss": 3.0550475120544434 }, { "epoch": 0.23444941021320242, "grad_norm": 24.137001037597656, "learning_rate": 4.6375654225127166e-05, "loss": 2.8761, "step": 6400 }, { "combined_loss": 3.673408031463623, "distill_loss": 1.5113860368728638, "epoch": 0.23444941021320242, "step": 6400, "student_mlm_loss": 5.835430145263672 }, { "epoch": 0.23811268224778373, "grad_norm": 89.53437042236328, "learning_rate": 4.631422463572254e-05, "loss": 4.9469, "step": 6500 }, { "combined_loss": 2.289175271987915, "distill_loss": 1.6255369186401367, "epoch": 0.23811268224778373, "step": 6500, "student_mlm_loss": 2.9528136253356934 }, { "epoch": 0.241775954282365, "grad_norm": 29.47341537475586, "learning_rate": 4.6252795046317915e-05, "loss": 3.2857, "step": 6600 }, { "combined_loss": 2.986036777496338, "distill_loss": 1.3628634214401245, "epoch": 0.241775954282365, "step": 6600, "student_mlm_loss": 4.609210014343262 }, { "epoch": 0.24543922631694629, "grad_norm": 8.413643836975098, "learning_rate": 4.6191365456913286e-05, "loss": 4.1874, "step": 6700 }, { "combined_loss": 4.9381103515625, "distill_loss": 1.5604116916656494, "epoch": 0.24543922631694629, "step": 6700, "student_mlm_loss": 8.31580924987793 }, { "epoch": 0.2491024983515276, "grad_norm": 19.279678344726562, "learning_rate": 4.6129935867508664e-05, "loss": 5.5581, "step": 6800 }, { "combined_loss": 4.7175493240356445, "distill_loss": 1.5657355785369873, "epoch": 0.2491024983515276, "step": 6800, "student_mlm_loss": 7.869362831115723 }, { "epoch": 0.25276577038610887, "grad_norm": 14.9283447265625, "learning_rate": 4.606850627810404e-05, "loss": 4.6319, "step": 6900 }, { "combined_loss": 5.707411766052246, "distill_loss": 1.566019058227539, "epoch": 0.25276577038610887, "step": 6900, "student_mlm_loss": 9.848804473876953 }, { "epoch": 0.25642904242069015, "grad_norm": 5.006555557250977, "learning_rate": 4.600707668869941e-05, "loss": 6.1192, "step": 7000 }, { "combined_loss": 4.373297691345215, "distill_loss": 1.4654217958450317, "epoch": 0.25642904242069015, "step": 7000, "student_mlm_loss": 7.281173229217529 }, { "epoch": 0.2600923144552714, "grad_norm": 15.025683403015137, "learning_rate": 4.594564709929479e-05, "loss": 3.472, "step": 7100 }, { "combined_loss": 5.1388630867004395, "distill_loss": 1.5254905223846436, "epoch": 0.2600923144552714, "step": 7100, "student_mlm_loss": 8.752235412597656 }, { "epoch": 0.26375558648985276, "grad_norm": 44.157169342041016, "learning_rate": 4.588421750989017e-05, "loss": 8.8482, "step": 7200 }, { "combined_loss": 2.1565892696380615, "distill_loss": 1.2985585927963257, "epoch": 0.26375558648985276, "step": 7200, "student_mlm_loss": 3.014619827270508 }, { "epoch": 0.26741885852443403, "grad_norm": 5.755523204803467, "learning_rate": 4.5822787920485546e-05, "loss": 5.7829, "step": 7300 }, { "combined_loss": 2.5404441356658936, "distill_loss": 1.5058717727661133, "epoch": 0.26741885852443403, "step": 7300, "student_mlm_loss": 3.575016498565674 }, { "epoch": 0.2710821305590153, "grad_norm": 15.252013206481934, "learning_rate": 4.576135833108092e-05, "loss": 7.9361, "step": 7400 }, { "combined_loss": 2.5752511024475098, "distill_loss": 1.5916697978973389, "epoch": 0.2710821305590153, "step": 7400, "student_mlm_loss": 3.5588326454162598 }, { "epoch": 0.2747454025935966, "grad_norm": 26.218740463256836, "learning_rate": 4.5699928741676295e-05, "loss": 4.8534, "step": 7500 }, { "combined_loss": 2.1656486988067627, "distill_loss": 1.4179739952087402, "epoch": 0.2747454025935966, "step": 7500, "student_mlm_loss": 2.913323402404785 }, { "epoch": 0.27840867462817787, "grad_norm": 6.031148910522461, "learning_rate": 4.5638499152271666e-05, "loss": 6.4535, "step": 7600 }, { "combined_loss": 2.8603813648223877, "distill_loss": 1.5837383270263672, "epoch": 0.27840867462817787, "step": 7600, "student_mlm_loss": 4.137024402618408 }, { "epoch": 0.2820719466627592, "grad_norm": 107.95591735839844, "learning_rate": 4.5577069562867044e-05, "loss": 3.2702, "step": 7700 }, { "combined_loss": 1.8474111557006836, "distill_loss": 1.437280297279358, "epoch": 0.2820719466627592, "step": 7700, "student_mlm_loss": 2.257542133331299 }, { "epoch": 0.2857352186973405, "grad_norm": 5.394913673400879, "learning_rate": 4.551563997346242e-05, "loss": 2.8998, "step": 7800 }, { "combined_loss": 4.77987813949585, "distill_loss": 1.5358555316925049, "epoch": 0.2857352186973405, "step": 7800, "student_mlm_loss": 8.023900985717773 }, { "epoch": 0.28939849073192175, "grad_norm": 7.790286540985107, "learning_rate": 4.545421038405779e-05, "loss": 2.9018, "step": 7900 }, { "combined_loss": 3.34071946144104, "distill_loss": 1.3893283605575562, "epoch": 0.28939849073192175, "step": 7900, "student_mlm_loss": 5.292110443115234 }, { "epoch": 0.29306176276650303, "grad_norm": 10.3685941696167, "learning_rate": 4.539278079465317e-05, "loss": 3.5884, "step": 8000 }, { "epoch": 0.29306176276650303, "eval_loss": 3.7581117153167725, "eval_runtime": 2.0302, "eval_samples_per_second": 3446.049, "eval_steps_per_second": 13.792, "step": 8000 }, { "combined_loss": 2.8955559730529785, "distill_loss": 1.3627426624298096, "epoch": 0.29306176276650303, "step": 8000, "student_mlm_loss": 4.428369522094727 }, { "epoch": 0.2967250348010843, "grad_norm": 49.06619644165039, "learning_rate": 4.533135120524855e-05, "loss": 3.5788, "step": 8100 }, { "combined_loss": 4.52724552154541, "distill_loss": 1.3924285173416138, "epoch": 0.2967250348010843, "step": 8100, "student_mlm_loss": 7.662062644958496 }, { "epoch": 0.30038830683566564, "grad_norm": 27.40319061279297, "learning_rate": 4.5269921615843926e-05, "loss": 3.9229, "step": 8200 }, { "combined_loss": 3.3075461387634277, "distill_loss": 1.5311795473098755, "epoch": 0.30038830683566564, "step": 8200, "student_mlm_loss": 5.0839128494262695 }, { "epoch": 0.3040515788702469, "grad_norm": 31.07562255859375, "learning_rate": 4.52084920264393e-05, "loss": 3.9566, "step": 8300 }, { "combined_loss": 1.9784274101257324, "distill_loss": 1.41036057472229, "epoch": 0.3040515788702469, "step": 8300, "student_mlm_loss": 2.546494245529175 }, { "epoch": 0.3077148509048282, "grad_norm": 4.548298358917236, "learning_rate": 4.514706243703467e-05, "loss": 5.1591, "step": 8400 }, { "combined_loss": 1.9796760082244873, "distill_loss": 1.408158302307129, "epoch": 0.3077148509048282, "step": 8400, "student_mlm_loss": 2.5511937141418457 }, { "epoch": 0.3113781229394095, "grad_norm": 8.897561073303223, "learning_rate": 4.5085632847630046e-05, "loss": 5.7057, "step": 8500 }, { "combined_loss": 2.080671548843384, "distill_loss": 1.4321857690811157, "epoch": 0.3113781229394095, "step": 8500, "student_mlm_loss": 2.7291574478149414 }, { "epoch": 0.31504139497399075, "grad_norm": 10.005053520202637, "learning_rate": 4.5024203258225424e-05, "loss": 7.7928, "step": 8600 }, { "combined_loss": 2.6395342350006104, "distill_loss": 1.5675503015518188, "epoch": 0.31504139497399075, "step": 8600, "student_mlm_loss": 3.7115182876586914 }, { "epoch": 0.31870466700857203, "grad_norm": 5.425146579742432, "learning_rate": 4.49627736688208e-05, "loss": 3.7716, "step": 8700 }, { "combined_loss": 2.9848690032958984, "distill_loss": 1.592170000076294, "epoch": 0.31870466700857203, "step": 8700, "student_mlm_loss": 4.377568244934082 }, { "epoch": 0.32236793904315336, "grad_norm": 5.64302396774292, "learning_rate": 4.490134407941617e-05, "loss": 6.8888, "step": 8800 }, { "combined_loss": 4.167844772338867, "distill_loss": 1.4308810234069824, "epoch": 0.32236793904315336, "step": 8800, "student_mlm_loss": 6.904808044433594 }, { "epoch": 0.32603121107773464, "grad_norm": 99.88166809082031, "learning_rate": 4.483991449001155e-05, "loss": 3.988, "step": 8900 }, { "combined_loss": 2.484290599822998, "distill_loss": 1.3509743213653564, "epoch": 0.32603121107773464, "step": 8900, "student_mlm_loss": 3.6176071166992188 }, { "epoch": 0.3296944831123159, "grad_norm": 74.52608489990234, "learning_rate": 4.477848490060693e-05, "loss": 7.0959, "step": 9000 }, { "combined_loss": 3.0457074642181396, "distill_loss": 1.3116565942764282, "epoch": 0.3296944831123159, "step": 9000, "student_mlm_loss": 4.779758453369141 }, { "epoch": 0.3333577551468972, "grad_norm": 11.735849380493164, "learning_rate": 4.47170553112023e-05, "loss": 3.3274, "step": 9100 }, { "combined_loss": 4.452191352844238, "distill_loss": 1.3943032026290894, "epoch": 0.3333577551468972, "step": 9100, "student_mlm_loss": 7.510079860687256 }, { "epoch": 0.33702102718147847, "grad_norm": 9.601778030395508, "learning_rate": 4.465562572179768e-05, "loss": 3.8928, "step": 9200 }, { "combined_loss": 4.875356197357178, "distill_loss": 1.4536867141723633, "epoch": 0.33702102718147847, "step": 9200, "student_mlm_loss": 8.297025680541992 }, { "epoch": 0.3406842992160598, "grad_norm": 9.49219799041748, "learning_rate": 4.459419613239305e-05, "loss": 3.7362, "step": 9300 }, { "combined_loss": 2.9027719497680664, "distill_loss": 1.3480241298675537, "epoch": 0.3406842992160598, "step": 9300, "student_mlm_loss": 4.45751953125 }, { "epoch": 0.3443475712506411, "grad_norm": 7.6804728507995605, "learning_rate": 4.453276654298843e-05, "loss": 4.4018, "step": 9400 }, { "combined_loss": 2.7022647857666016, "distill_loss": 1.3614214658737183, "epoch": 0.3443475712506411, "step": 9400, "student_mlm_loss": 4.043107986450195 }, { "epoch": 0.34801084328522236, "grad_norm": 38.41388702392578, "learning_rate": 4.4471336953583804e-05, "loss": 3.0632, "step": 9500 }, { "combined_loss": 1.9494025707244873, "distill_loss": 1.3876396417617798, "epoch": 0.34801084328522236, "step": 9500, "student_mlm_loss": 2.5111656188964844 }, { "epoch": 0.35167411531980364, "grad_norm": 37.10932540893555, "learning_rate": 4.440990736417918e-05, "loss": 3.3258, "step": 9600 }, { "combined_loss": 2.6435036659240723, "distill_loss": 1.3941702842712402, "epoch": 0.35167411531980364, "step": 9600, "student_mlm_loss": 3.8928370475769043 }, { "epoch": 0.3553373873543849, "grad_norm": 17.652099609375, "learning_rate": 4.434847777477455e-05, "loss": 8.3854, "step": 9700 }, { "combined_loss": 2.336359977722168, "distill_loss": 1.5497583150863647, "epoch": 0.3553373873543849, "step": 9700, "student_mlm_loss": 3.1229615211486816 }, { "epoch": 0.35900065938896625, "grad_norm": 58.41902160644531, "learning_rate": 4.428704818536993e-05, "loss": 6.9624, "step": 9800 }, { "combined_loss": 2.6561923027038574, "distill_loss": 1.5154696702957153, "epoch": 0.35900065938896625, "step": 9800, "student_mlm_loss": 3.796915054321289 }, { "epoch": 0.3626639314235475, "grad_norm": 23.230680465698242, "learning_rate": 4.422561859596531e-05, "loss": 3.4226, "step": 9900 }, { "combined_loss": 1.9643871784210205, "distill_loss": 1.3770619630813599, "epoch": 0.3626639314235475, "step": 9900, "student_mlm_loss": 2.5517125129699707 }, { "epoch": 0.3663272034581288, "grad_norm": 11.580951690673828, "learning_rate": 4.416418900656068e-05, "loss": 4.7414, "step": 10000 }, { "epoch": 0.3663272034581288, "eval_loss": 3.8432743549346924, "eval_runtime": 2.2879, "eval_samples_per_second": 3057.772, "eval_steps_per_second": 12.238, "step": 10000 }, { "combined_loss": 2.395519971847534, "distill_loss": 1.382614254951477, "epoch": 0.3663272034581288, "step": 10000, "student_mlm_loss": 3.408425807952881 }, { "epoch": 0.3699904754927101, "grad_norm": 19.014955520629883, "learning_rate": 4.410275941715606e-05, "loss": 6.6365, "step": 10100 }, { "combined_loss": 2.1697921752929688, "distill_loss": 1.5128508806228638, "epoch": 0.3699904754927101, "step": 10100, "student_mlm_loss": 2.8267335891723633 }, { "epoch": 0.37365374752729136, "grad_norm": 6.532296180725098, "learning_rate": 4.404132982775143e-05, "loss": 3.199, "step": 10200 }, { "combined_loss": 1.8516874313354492, "distill_loss": 1.413927674293518, "epoch": 0.37365374752729136, "step": 10200, "student_mlm_loss": 2.289447069168091 }, { "epoch": 0.3773170195618727, "grad_norm": 25.607181549072266, "learning_rate": 4.397990023834681e-05, "loss": 3.822, "step": 10300 }, { "combined_loss": 3.3827946186065674, "distill_loss": 1.4635933637619019, "epoch": 0.3773170195618727, "step": 10300, "student_mlm_loss": 5.301995754241943 }, { "epoch": 0.38098029159645397, "grad_norm": 12.52314567565918, "learning_rate": 4.3918470648942184e-05, "loss": 6.9491, "step": 10400 }, { "combined_loss": 1.9748457670211792, "distill_loss": 1.445707082748413, "epoch": 0.38098029159645397, "step": 10400, "student_mlm_loss": 2.5039844512939453 }, { "epoch": 0.38464356363103525, "grad_norm": 12.69713306427002, "learning_rate": 4.385704105953756e-05, "loss": 9.4794, "step": 10500 }, { "combined_loss": 3.5582261085510254, "distill_loss": 1.4324952363967896, "epoch": 0.38464356363103525, "step": 10500, "student_mlm_loss": 5.683957099914551 }, { "epoch": 0.3883068356656165, "grad_norm": 9.131495475769043, "learning_rate": 4.379561147013293e-05, "loss": 7.1932, "step": 10600 }, { "combined_loss": 6.080216407775879, "distill_loss": 1.477283000946045, "epoch": 0.3883068356656165, "step": 10600, "student_mlm_loss": 10.683149337768555 }, { "epoch": 0.3919701077001978, "grad_norm": 24.739810943603516, "learning_rate": 4.373418188072831e-05, "loss": 5.6399, "step": 10700 }, { "combined_loss": 3.7993698120117188, "distill_loss": 1.452317476272583, "epoch": 0.3919701077001978, "step": 10700, "student_mlm_loss": 6.146422386169434 }, { "epoch": 0.3956333797347791, "grad_norm": 42.44218063354492, "learning_rate": 4.367275229132369e-05, "loss": 4.2291, "step": 10800 }, { "combined_loss": 2.037079095840454, "distill_loss": 1.4349570274353027, "epoch": 0.3956333797347791, "step": 10800, "student_mlm_loss": 2.6392011642456055 }, { "epoch": 0.3992966517693604, "grad_norm": 231.26116943359375, "learning_rate": 4.361132270191906e-05, "loss": 4.6188, "step": 10900 }, { "combined_loss": 182.1781768798828, "distill_loss": 1.4427307844161987, "epoch": 0.3992966517693604, "step": 10900, "student_mlm_loss": 362.91363525390625 }, { "epoch": 0.4029599238039417, "grad_norm": 16.01262092590332, "learning_rate": 4.354989311251444e-05, "loss": 4.8535, "step": 11000 }, { "combined_loss": 3.2922308444976807, "distill_loss": 1.7308834791183472, "epoch": 0.4029599238039417, "step": 11000, "student_mlm_loss": 4.853578090667725 }, { "epoch": 0.40662319583852297, "grad_norm": 23.69573974609375, "learning_rate": 4.3488463523109816e-05, "loss": 2.8692, "step": 11100 }, { "combined_loss": 2.1010890007019043, "distill_loss": 1.3140019178390503, "epoch": 0.40662319583852297, "step": 11100, "student_mlm_loss": 2.888176202774048 }, { "epoch": 0.41028646787310424, "grad_norm": 9.695125579833984, "learning_rate": 4.3427033933705193e-05, "loss": 7.6829, "step": 11200 }, { "combined_loss": 2.24194598197937, "distill_loss": 1.560063362121582, "epoch": 0.41028646787310424, "step": 11200, "student_mlm_loss": 2.923828601837158 }, { "epoch": 0.4139497399076855, "grad_norm": 37.06310272216797, "learning_rate": 4.3365604344300565e-05, "loss": 3.5562, "step": 11300 }, { "combined_loss": 9.297407150268555, "distill_loss": 1.2328678369522095, "epoch": 0.4139497399076855, "step": 11300, "student_mlm_loss": 17.36194610595703 }, { "epoch": 0.41761301194226685, "grad_norm": 6.411166667938232, "learning_rate": 4.330417475489594e-05, "loss": 4.0543, "step": 11400 }, { "combined_loss": 2.141500949859619, "distill_loss": 1.467064380645752, "epoch": 0.41761301194226685, "step": 11400, "student_mlm_loss": 2.8159377574920654 }, { "epoch": 0.42127628397684813, "grad_norm": 5.802677154541016, "learning_rate": 4.3242745165491313e-05, "loss": 14.3215, "step": 11500 }, { "combined_loss": 6.576130390167236, "distill_loss": 1.46802818775177, "epoch": 0.42127628397684813, "step": 11500, "student_mlm_loss": 11.684232711791992 }, { "epoch": 0.4249395560114294, "grad_norm": 15.660844802856445, "learning_rate": 4.318131557608669e-05, "loss": 30.5877, "step": 11600 }, { "combined_loss": 1.9305293560028076, "distill_loss": 1.405720591545105, "epoch": 0.4249395560114294, "step": 11600, "student_mlm_loss": 2.4553380012512207 }, { "epoch": 0.4286028280460107, "grad_norm": 3.041947603225708, "learning_rate": 4.311988598668207e-05, "loss": 3.7156, "step": 11700 }, { "combined_loss": 2.78572940826416, "distill_loss": 1.45219886302948, "epoch": 0.4286028280460107, "step": 11700, "student_mlm_loss": 4.119259834289551 }, { "epoch": 0.43226610008059196, "grad_norm": 20.6744384765625, "learning_rate": 4.305845639727744e-05, "loss": 3.3939, "step": 11800 }, { "combined_loss": 2.0835349559783936, "distill_loss": 1.4508671760559082, "epoch": 0.43226610008059196, "step": 11800, "student_mlm_loss": 2.716202735900879 }, { "epoch": 0.4359293721151733, "grad_norm": 5.804731369018555, "learning_rate": 4.299702680787282e-05, "loss": 6.1951, "step": 11900 }, { "combined_loss": 3.1048030853271484, "distill_loss": 1.455564260482788, "epoch": 0.4359293721151733, "step": 11900, "student_mlm_loss": 4.75404167175293 }, { "epoch": 0.4395926441497546, "grad_norm": 33.689720153808594, "learning_rate": 4.2935597218468196e-05, "loss": 3.6583, "step": 12000 }, { "epoch": 0.4395926441497546, "eval_loss": 3.919630527496338, "eval_runtime": 2.0425, "eval_samples_per_second": 3425.261, "eval_steps_per_second": 13.709, "step": 12000 }, { "combined_loss": 2.315965175628662, "distill_loss": 1.3009124994277954, "epoch": 0.4395926441497546, "step": 12000, "student_mlm_loss": 3.3310179710388184 }, { "epoch": 0.44325591618433585, "grad_norm": 24.73545265197754, "learning_rate": 4.2874167629063574e-05, "loss": 2.9828, "step": 12100 }, { "combined_loss": 5.060952186584473, "distill_loss": 1.3712559938430786, "epoch": 0.44325591618433585, "step": 12100, "student_mlm_loss": 8.750648498535156 }, { "epoch": 0.44691918821891713, "grad_norm": 19.548921585083008, "learning_rate": 4.2812738039658945e-05, "loss": 3.1716, "step": 12200 }, { "combined_loss": 2.3697307109832764, "distill_loss": 1.480096459388733, "epoch": 0.44691918821891713, "step": 12200, "student_mlm_loss": 3.2593650817871094 }, { "epoch": 0.4505824602534984, "grad_norm": 6.217925548553467, "learning_rate": 4.2751308450254316e-05, "loss": 5.1037, "step": 12300 }, { "combined_loss": 1.9682085514068604, "distill_loss": 1.3534774780273438, "epoch": 0.4505824602534984, "step": 12300, "student_mlm_loss": 2.582939624786377 }, { "epoch": 0.45424573228807974, "grad_norm": 53.592735290527344, "learning_rate": 4.2689878860849694e-05, "loss": 5.3409, "step": 12400 }, { "combined_loss": 2.413550853729248, "distill_loss": 1.3951433897018433, "epoch": 0.45424573228807974, "step": 12400, "student_mlm_loss": 3.4319584369659424 }, { "epoch": 0.457909004322661, "grad_norm": 13.716507911682129, "learning_rate": 4.262844927144507e-05, "loss": 3.2261, "step": 12500 }, { "combined_loss": 3.6318020820617676, "distill_loss": 1.3529082536697388, "epoch": 0.457909004322661, "step": 12500, "student_mlm_loss": 5.910696029663086 }, { "epoch": 0.4615722763572423, "grad_norm": 16.206933975219727, "learning_rate": 4.256701968204045e-05, "loss": 3.1534, "step": 12600 }, { "combined_loss": 15.371432304382324, "distill_loss": 1.4290032386779785, "epoch": 0.4615722763572423, "step": 12600, "student_mlm_loss": 29.313861846923828 }, { "epoch": 0.4652355483918236, "grad_norm": 8.626960754394531, "learning_rate": 4.250559009263582e-05, "loss": 3.0824, "step": 12700 }, { "combined_loss": 2.0715112686157227, "distill_loss": 1.3553932905197144, "epoch": 0.4652355483918236, "step": 12700, "student_mlm_loss": 2.7876293659210205 }, { "epoch": 0.46889882042640485, "grad_norm": 8.153878211975098, "learning_rate": 4.24441605032312e-05, "loss": 3.8805, "step": 12800 }, { "combined_loss": 2.0972392559051514, "distill_loss": 1.2276250123977661, "epoch": 0.46889882042640485, "step": 12800, "student_mlm_loss": 2.966853618621826 }, { "epoch": 0.4725620924609861, "grad_norm": 12.068700790405273, "learning_rate": 4.2382730913826576e-05, "loss": 2.8937, "step": 12900 }, { "combined_loss": 2.9497852325439453, "distill_loss": 1.314728021621704, "epoch": 0.4725620924609861, "step": 12900, "student_mlm_loss": 4.584842681884766 }, { "epoch": 0.47622536449556746, "grad_norm": 12.260379791259766, "learning_rate": 4.232130132442195e-05, "loss": 5.581, "step": 13000 }, { "combined_loss": 1.8658246994018555, "distill_loss": 1.2703187465667725, "epoch": 0.47622536449556746, "step": 13000, "student_mlm_loss": 2.4613306522369385 }, { "epoch": 0.47988863653014874, "grad_norm": 22.688852310180664, "learning_rate": 4.2259871735017325e-05, "loss": 7.0059, "step": 13100 }, { "combined_loss": 3.673346519470215, "distill_loss": 1.397099256515503, "epoch": 0.47988863653014874, "step": 13100, "student_mlm_loss": 5.949593544006348 }, { "epoch": 0.48355190856473, "grad_norm": 28.811817169189453, "learning_rate": 4.2198442145612696e-05, "loss": 9.6395, "step": 13200 }, { "combined_loss": 2.036362409591675, "distill_loss": 1.3239866495132446, "epoch": 0.48355190856473, "step": 13200, "student_mlm_loss": 2.7487380504608154 }, { "epoch": 0.4872151805993113, "grad_norm": 6.380947589874268, "learning_rate": 4.213701255620808e-05, "loss": 2.7095, "step": 13300 }, { "combined_loss": 2.2547478675842285, "distill_loss": 1.4122509956359863, "epoch": 0.4872151805993113, "step": 13300, "student_mlm_loss": 3.09724497795105 }, { "epoch": 0.49087845263389257, "grad_norm": 83.60982513427734, "learning_rate": 4.207558296680345e-05, "loss": 3.2917, "step": 13400 }, { "combined_loss": 2.009040355682373, "distill_loss": 1.4236946105957031, "epoch": 0.49087845263389257, "step": 13400, "student_mlm_loss": 2.594385862350464 }, { "epoch": 0.4945417246684739, "grad_norm": 10.06588077545166, "learning_rate": 4.201415337739883e-05, "loss": 12.3205, "step": 13500 }, { "combined_loss": 2.9317073822021484, "distill_loss": 1.4229042530059814, "epoch": 0.4945417246684739, "step": 13500, "student_mlm_loss": 4.440510272979736 }, { "epoch": 0.4982049967030552, "grad_norm": 4.126479625701904, "learning_rate": 4.19527237879942e-05, "loss": 3.8077, "step": 13600 }, { "combined_loss": 1.9033926725387573, "distill_loss": 1.357490062713623, "epoch": 0.4982049967030552, "step": 13600, "student_mlm_loss": 2.4492952823638916 }, { "epoch": 0.5018682687376365, "grad_norm": 18.483203887939453, "learning_rate": 4.189129419858958e-05, "loss": 11.6361, "step": 13700 }, { "combined_loss": 3.165005683898926, "distill_loss": 1.3812006711959839, "epoch": 0.5018682687376365, "step": 13700, "student_mlm_loss": 4.948810577392578 }, { "epoch": 0.5055315407722177, "grad_norm": 7.388655662536621, "learning_rate": 4.1829864609184956e-05, "loss": 3.875, "step": 13800 }, { "combined_loss": 1.8155145645141602, "distill_loss": 1.3641600608825684, "epoch": 0.5055315407722177, "step": 13800, "student_mlm_loss": 2.266869068145752 }, { "epoch": 0.509194812806799, "grad_norm": 9.352982521057129, "learning_rate": 4.176843501978033e-05, "loss": 9.268, "step": 13900 }, { "combined_loss": 2.3618173599243164, "distill_loss": 1.3162891864776611, "epoch": 0.509194812806799, "step": 13900, "student_mlm_loss": 3.4073452949523926 }, { "epoch": 0.5128580848413803, "grad_norm": 8.513871192932129, "learning_rate": 4.1707005430375705e-05, "loss": 3.3999, "step": 14000 }, { "epoch": 0.5128580848413803, "eval_loss": 3.5987370014190674, "eval_runtime": 2.2869, "eval_samples_per_second": 3059.222, "eval_steps_per_second": 12.244, "step": 14000 }, { "combined_loss": 2.6841559410095215, "distill_loss": 1.401199460029602, "epoch": 0.5128580848413803, "step": 14000, "student_mlm_loss": 3.9671125411987305 }, { "epoch": 0.5165213568759616, "grad_norm": 30.661813735961914, "learning_rate": 4.1645575840971076e-05, "loss": 18.3341, "step": 14100 }, { "combined_loss": 4.752758026123047, "distill_loss": 1.247560977935791, "epoch": 0.5165213568759616, "step": 14100, "student_mlm_loss": 8.257954597473145 }, { "epoch": 0.5201846289105428, "grad_norm": 40.303707122802734, "learning_rate": 4.158414625156646e-05, "loss": 3.1057, "step": 14200 }, { "combined_loss": 1.988144874572754, "distill_loss": 1.2577546834945679, "epoch": 0.5201846289105428, "step": 14200, "student_mlm_loss": 2.7185349464416504 }, { "epoch": 0.5238479009451242, "grad_norm": 19.77947235107422, "learning_rate": 4.152271666216183e-05, "loss": 7.3457, "step": 14300 }, { "combined_loss": 4.299380779266357, "distill_loss": 1.2770593166351318, "epoch": 0.5238479009451242, "step": 14300, "student_mlm_loss": 7.321702480316162 }, { "epoch": 0.5275111729797055, "grad_norm": 7.412100315093994, "learning_rate": 4.146128707275721e-05, "loss": 4.8104, "step": 14400 }, { "combined_loss": 10.650766372680664, "distill_loss": 1.3233892917633057, "epoch": 0.5275111729797055, "step": 14400, "student_mlm_loss": 19.9781436920166 }, { "epoch": 0.5311744450142868, "grad_norm": 5.799710750579834, "learning_rate": 4.139985748335258e-05, "loss": 3.4765, "step": 14500 }, { "combined_loss": 2.4540774822235107, "distill_loss": 1.319036841392517, "epoch": 0.5311744450142868, "step": 14500, "student_mlm_loss": 3.589118003845215 }, { "epoch": 0.5348377170488681, "grad_norm": 7.147758483886719, "learning_rate": 4.133842789394796e-05, "loss": 3.12, "step": 14600 }, { "combined_loss": 1.8580541610717773, "distill_loss": 1.3114832639694214, "epoch": 0.5348377170488681, "step": 14600, "student_mlm_loss": 2.4046249389648438 }, { "epoch": 0.5385009890834493, "grad_norm": 5.120487213134766, "learning_rate": 4.1276998304543336e-05, "loss": 6.7029, "step": 14700 }, { "combined_loss": 1.9685258865356445, "distill_loss": 1.2455390691757202, "epoch": 0.5385009890834493, "step": 14700, "student_mlm_loss": 2.6915125846862793 }, { "epoch": 0.5421642611180306, "grad_norm": 6.225675106048584, "learning_rate": 4.121556871513871e-05, "loss": 7.1336, "step": 14800 }, { "combined_loss": 1.8886613845825195, "distill_loss": 1.2913726568222046, "epoch": 0.5421642611180306, "step": 14800, "student_mlm_loss": 2.485949993133545 }, { "epoch": 0.5458275331526119, "grad_norm": 11.508244514465332, "learning_rate": 4.1154139125734085e-05, "loss": 11.8719, "step": 14900 }, { "combined_loss": 2.1455585956573486, "distill_loss": 1.3711117506027222, "epoch": 0.5458275331526119, "step": 14900, "student_mlm_loss": 2.9200053215026855 }, { "epoch": 0.5494908051871932, "grad_norm": 17.030780792236328, "learning_rate": 4.109270953632946e-05, "loss": 3.091, "step": 15000 }, { "combined_loss": 1.9433504343032837, "distill_loss": 1.538583517074585, "epoch": 0.5494908051871932, "step": 15000, "student_mlm_loss": 2.3481173515319824 }, { "epoch": 0.5531540772217745, "grad_norm": 4.692992687225342, "learning_rate": 4.103127994692484e-05, "loss": 3.2488, "step": 15100 }, { "combined_loss": 2.820077657699585, "distill_loss": 1.2906769514083862, "epoch": 0.5531540772217745, "step": 15100, "student_mlm_loss": 4.349478244781494 }, { "epoch": 0.5568173492563557, "grad_norm": 49.70892333984375, "learning_rate": 4.096985035752021e-05, "loss": 10.6593, "step": 15200 }, { "combined_loss": 1.857104778289795, "distill_loss": 1.4106833934783936, "epoch": 0.5568173492563557, "step": 15200, "student_mlm_loss": 2.3035261631011963 }, { "epoch": 0.5604806212909371, "grad_norm": 7.913967609405518, "learning_rate": 4.090842076811558e-05, "loss": 3.3056, "step": 15300 }, { "combined_loss": 3.2144076824188232, "distill_loss": 1.3917032480239868, "epoch": 0.5604806212909371, "step": 15300, "student_mlm_loss": 5.037112236022949 }, { "epoch": 0.5641438933255184, "grad_norm": 10.575057983398438, "learning_rate": 4.084699117871096e-05, "loss": 10.0757, "step": 15400 }, { "combined_loss": 5.352452754974365, "distill_loss": 1.3542910814285278, "epoch": 0.5641438933255184, "step": 15400, "student_mlm_loss": 9.350614547729492 }, { "epoch": 0.5678071653600997, "grad_norm": 119.92784118652344, "learning_rate": 4.078556158930634e-05, "loss": 3.4463, "step": 15500 }, { "combined_loss": 1.7753610610961914, "distill_loss": 1.3875095844268799, "epoch": 0.5678071653600997, "step": 15500, "student_mlm_loss": 2.163212537765503 }, { "epoch": 0.571470437394681, "grad_norm": 4.203140735626221, "learning_rate": 4.0724131999901717e-05, "loss": 4.8205, "step": 15600 }, { "combined_loss": 1.8941802978515625, "distill_loss": 1.3584777116775513, "epoch": 0.571470437394681, "step": 15600, "student_mlm_loss": 2.4298830032348633 }, { "epoch": 0.5751337094292622, "grad_norm": 16.848825454711914, "learning_rate": 4.066270241049709e-05, "loss": 7.7339, "step": 15700 }, { "combined_loss": 1.9499808549880981, "distill_loss": 1.3122260570526123, "epoch": 0.5751337094292622, "step": 15700, "student_mlm_loss": 2.587735652923584 }, { "epoch": 0.5787969814638435, "grad_norm": 2.9838955402374268, "learning_rate": 4.0601272821092465e-05, "loss": 3.4354, "step": 15800 }, { "combined_loss": 1.9672229290008545, "distill_loss": 1.3119910955429077, "epoch": 0.5787969814638435, "step": 15800, "student_mlm_loss": 2.622454881668091 }, { "epoch": 0.5824602534984248, "grad_norm": 6.6938676834106445, "learning_rate": 4.053984323168784e-05, "loss": 5.2244, "step": 15900 }, { "combined_loss": 2.8469321727752686, "distill_loss": 1.361178994178772, "epoch": 0.5824602534984248, "step": 15900, "student_mlm_loss": 4.332685470581055 }, { "epoch": 0.5861235255330061, "grad_norm": 31.440717697143555, "learning_rate": 4.047841364228322e-05, "loss": 8.7168, "step": 16000 }, { "epoch": 0.5861235255330061, "eval_loss": 3.480536937713623, "eval_runtime": 2.1572, "eval_samples_per_second": 3243.154, "eval_steps_per_second": 12.98, "step": 16000 }, { "combined_loss": 2.0847339630126953, "distill_loss": 1.4640412330627441, "epoch": 0.5861235255330061, "step": 16000, "student_mlm_loss": 2.7054266929626465 }, { "epoch": 0.5897867975675873, "grad_norm": 6.238570690155029, "learning_rate": 4.041698405287859e-05, "loss": 3.2375, "step": 16100 }, { "combined_loss": 2.2635374069213867, "distill_loss": 1.5188945531845093, "epoch": 0.5897867975675873, "step": 16100, "student_mlm_loss": 3.0081801414489746 }, { "epoch": 0.5934500696021686, "grad_norm": 11.832098960876465, "learning_rate": 4.035555446347396e-05, "loss": 3.3115, "step": 16200 }, { "combined_loss": 2.2285714149475098, "distill_loss": 1.4724992513656616, "epoch": 0.5934500696021686, "step": 16200, "student_mlm_loss": 2.9846436977386475 }, { "epoch": 0.5971133416367499, "grad_norm": 8.876389503479004, "learning_rate": 4.029412487406934e-05, "loss": 4.1388, "step": 16300 }, { "combined_loss": 2.0907256603240967, "distill_loss": 1.2955131530761719, "epoch": 0.5971133416367499, "step": 16300, "student_mlm_loss": 2.8859381675720215 }, { "epoch": 0.6007766136713313, "grad_norm": 4.118688106536865, "learning_rate": 4.023269528466472e-05, "loss": 5.4036, "step": 16400 }, { "combined_loss": 5.190587997436523, "distill_loss": 1.502519965171814, "epoch": 0.6007766136713313, "step": 16400, "student_mlm_loss": 8.878656387329102 }, { "epoch": 0.6044398857059126, "grad_norm": 17.806203842163086, "learning_rate": 4.01712656952601e-05, "loss": 3.4529, "step": 16500 }, { "combined_loss": 2.0771563053131104, "distill_loss": 1.5032036304473877, "epoch": 0.6044398857059126, "step": 16500, "student_mlm_loss": 2.651108980178833 }, { "epoch": 0.6081031577404938, "grad_norm": 11.406692504882812, "learning_rate": 4.010983610585547e-05, "loss": 2.9157, "step": 16600 }, { "combined_loss": 2.0262105464935303, "distill_loss": 1.406888723373413, "epoch": 0.6081031577404938, "step": 16600, "student_mlm_loss": 2.6455323696136475 }, { "epoch": 0.6117664297750751, "grad_norm": 9.248611450195312, "learning_rate": 4.0048406516450846e-05, "loss": 3.7273, "step": 16700 }, { "combined_loss": 9.912755966186523, "distill_loss": 1.3654385805130005, "epoch": 0.6117664297750751, "step": 16700, "student_mlm_loss": 18.460073471069336 }, { "epoch": 0.6154297018096564, "grad_norm": 7.337488651275635, "learning_rate": 3.9986976927046223e-05, "loss": 3.5316, "step": 16800 }, { "combined_loss": 2.2111759185791016, "distill_loss": 1.410059928894043, "epoch": 0.6154297018096564, "step": 16800, "student_mlm_loss": 3.012291669845581 }, { "epoch": 0.6190929738442377, "grad_norm": 3.7927513122558594, "learning_rate": 3.9925547337641595e-05, "loss": 2.942, "step": 16900 }, { "combined_loss": 1.9941096305847168, "distill_loss": 1.3353883028030396, "epoch": 0.6190929738442377, "step": 16900, "student_mlm_loss": 2.6528310775756836 }, { "epoch": 0.622756245878819, "grad_norm": 8.092863082885742, "learning_rate": 3.986411774823697e-05, "loss": 8.3194, "step": 17000 }, { "combined_loss": 1.8197941780090332, "distill_loss": 1.2830308675765991, "epoch": 0.622756245878819, "step": 17000, "student_mlm_loss": 2.356557607650757 }, { "epoch": 0.6264195179134002, "grad_norm": 21.95607566833496, "learning_rate": 3.9802688158832343e-05, "loss": 3.6842, "step": 17100 }, { "combined_loss": 1.967858076095581, "distill_loss": 1.3744505643844604, "epoch": 0.6264195179134002, "step": 17100, "student_mlm_loss": 2.561265707015991 }, { "epoch": 0.6300827899479815, "grad_norm": 17.734630584716797, "learning_rate": 3.974125856942773e-05, "loss": 3.4446, "step": 17200 }, { "combined_loss": 3.56831955909729, "distill_loss": 1.4127169847488403, "epoch": 0.6300827899479815, "step": 17200, "student_mlm_loss": 5.723922252655029 }, { "epoch": 0.6337460619825628, "grad_norm": 14.227143287658691, "learning_rate": 3.96798289800231e-05, "loss": 4.3058, "step": 17300 }, { "combined_loss": 6.485238552093506, "distill_loss": 1.3285768032073975, "epoch": 0.6337460619825628, "step": 17300, "student_mlm_loss": 11.641900062561035 }, { "epoch": 0.6374093340171441, "grad_norm": 27.379819869995117, "learning_rate": 3.961839939061848e-05, "loss": 3.3666, "step": 17400 }, { "combined_loss": 3.212083339691162, "distill_loss": 1.3358004093170166, "epoch": 0.6374093340171441, "step": 17400, "student_mlm_loss": 5.088366508483887 }, { "epoch": 0.6410726060517254, "grad_norm": 6.261890411376953, "learning_rate": 3.955696980121385e-05, "loss": 6.3216, "step": 17500 }, { "combined_loss": 1.8787257671356201, "distill_loss": 1.3068917989730835, "epoch": 0.6410726060517254, "step": 17500, "student_mlm_loss": 2.4505598545074463 }, { "epoch": 0.6447358780863067, "grad_norm": 4.643723011016846, "learning_rate": 3.9495540211809226e-05, "loss": 6.3659, "step": 17600 }, { "combined_loss": 1.9111711978912354, "distill_loss": 1.315952181816101, "epoch": 0.6447358780863067, "step": 17600, "student_mlm_loss": 2.506390333175659 }, { "epoch": 0.648399150120888, "grad_norm": 209.94358825683594, "learning_rate": 3.9434110622404604e-05, "loss": 3.1778, "step": 17700 }, { "combined_loss": 2.7990779876708984, "distill_loss": 1.360758662223816, "epoch": 0.648399150120888, "step": 17700, "student_mlm_loss": 4.237397193908691 }, { "epoch": 0.6520624221554693, "grad_norm": 25.861230850219727, "learning_rate": 3.9372681032999975e-05, "loss": 6.5636, "step": 17800 }, { "combined_loss": 3.8194119930267334, "distill_loss": 1.45068359375, "epoch": 0.6520624221554693, "step": 17800, "student_mlm_loss": 6.188140392303467 }, { "epoch": 0.6557256941900506, "grad_norm": 46.81015396118164, "learning_rate": 3.931125144359535e-05, "loss": 6.4281, "step": 17900 }, { "combined_loss": 1.8790740966796875, "distill_loss": 1.2603598833084106, "epoch": 0.6557256941900506, "step": 17900, "student_mlm_loss": 2.497788429260254 }, { "epoch": 0.6593889662246318, "grad_norm": 3.634798049926758, "learning_rate": 3.924982185419073e-05, "loss": 3.7705, "step": 18000 }, { "epoch": 0.6593889662246318, "eval_loss": 3.4686477184295654, "eval_runtime": 2.0476, "eval_samples_per_second": 3416.619, "eval_steps_per_second": 13.674, "step": 18000 }, { "combined_loss": 1.8001245260238647, "distill_loss": 1.358407735824585, "epoch": 0.6593889662246318, "step": 18000, "student_mlm_loss": 2.2418413162231445 }, { "epoch": 0.6630522382592131, "grad_norm": 14.09543514251709, "learning_rate": 3.918839226478611e-05, "loss": 7.2198, "step": 18100 }, { "combined_loss": 2.165346622467041, "distill_loss": 1.3290469646453857, "epoch": 0.6630522382592131, "step": 18100, "student_mlm_loss": 3.0016462802886963 }, { "epoch": 0.6667155102937944, "grad_norm": 4.29142951965332, "learning_rate": 3.912696267538148e-05, "loss": 4.3053, "step": 18200 }, { "combined_loss": 1.8569279909133911, "distill_loss": 1.355130910873413, "epoch": 0.6667155102937944, "step": 18200, "student_mlm_loss": 2.358725070953369 }, { "epoch": 0.6703787823283757, "grad_norm": 4.424899101257324, "learning_rate": 3.906553308597686e-05, "loss": 3.2385, "step": 18300 }, { "combined_loss": 2.083707094192505, "distill_loss": 1.307104229927063, "epoch": 0.6703787823283757, "step": 18300, "student_mlm_loss": 2.8603098392486572 }, { "epoch": 0.6740420543629569, "grad_norm": 8.061409950256348, "learning_rate": 3.900410349657223e-05, "loss": 2.9075, "step": 18400 }, { "combined_loss": 1.9213597774505615, "distill_loss": 1.434320330619812, "epoch": 0.6740420543629569, "step": 18400, "student_mlm_loss": 2.4083993434906006 }, { "epoch": 0.6777053263975383, "grad_norm": 55.50898361206055, "learning_rate": 3.8942673907167606e-05, "loss": 13.4077, "step": 18500 }, { "combined_loss": 2.01340389251709, "distill_loss": 1.3991159200668335, "epoch": 0.6777053263975383, "step": 18500, "student_mlm_loss": 2.6276917457580566 }, { "epoch": 0.6813685984321196, "grad_norm": 5.348477840423584, "learning_rate": 3.8881244317762984e-05, "loss": 6.8559, "step": 18600 }, { "combined_loss": 2.5955307483673096, "distill_loss": 1.4375801086425781, "epoch": 0.6813685984321196, "step": 18600, "student_mlm_loss": 3.753481388092041 }, { "epoch": 0.6850318704667009, "grad_norm": 26.911954879760742, "learning_rate": 3.8819814728358355e-05, "loss": 9.8471, "step": 18700 }, { "combined_loss": 2.3086562156677246, "distill_loss": 1.4082762002944946, "epoch": 0.6850318704667009, "step": 18700, "student_mlm_loss": 3.209036350250244 }, { "epoch": 0.6886951425012822, "grad_norm": 8.086039543151855, "learning_rate": 3.875838513895373e-05, "loss": 3.841, "step": 18800 }, { "combined_loss": 4.487699031829834, "distill_loss": 1.4052667617797852, "epoch": 0.6886951425012822, "step": 18800, "student_mlm_loss": 7.570131301879883 }, { "epoch": 0.6923584145358634, "grad_norm": 10.749812126159668, "learning_rate": 3.869695554954911e-05, "loss": 9.7279, "step": 18900 }, { "combined_loss": 3.3014779090881348, "distill_loss": 1.246164083480835, "epoch": 0.6923584145358634, "step": 18900, "student_mlm_loss": 5.3567914962768555 }, { "epoch": 0.6960216865704447, "grad_norm": 11.313789367675781, "learning_rate": 3.863552596014449e-05, "loss": 28.0849, "step": 19000 }, { "combined_loss": 4.825923919677734, "distill_loss": 1.377113938331604, "epoch": 0.6960216865704447, "step": 19000, "student_mlm_loss": 8.274733543395996 }, { "epoch": 0.699684958605026, "grad_norm": 3.8648459911346436, "learning_rate": 3.857409637073986e-05, "loss": 5.8981, "step": 19100 }, { "combined_loss": 3.4921586513519287, "distill_loss": 1.4171725511550903, "epoch": 0.699684958605026, "step": 19100, "student_mlm_loss": 5.567144870758057 }, { "epoch": 0.7033482306396073, "grad_norm": 18.98455238342285, "learning_rate": 3.851266678133523e-05, "loss": 2.5944, "step": 19200 }, { "combined_loss": 1.8949182033538818, "distill_loss": 1.3743678331375122, "epoch": 0.7033482306396073, "step": 19200, "student_mlm_loss": 2.415468692779541 }, { "epoch": 0.7070115026741886, "grad_norm": 27.53456687927246, "learning_rate": 3.845123719193061e-05, "loss": 2.8462, "step": 19300 }, { "combined_loss": 1.8077284097671509, "distill_loss": 1.2764451503753662, "epoch": 0.7070115026741886, "step": 19300, "student_mlm_loss": 2.3390116691589355 }, { "epoch": 0.7106747747087698, "grad_norm": 8.815896987915039, "learning_rate": 3.8389807602525986e-05, "loss": 3.403, "step": 19400 }, { "combined_loss": 2.2496674060821533, "distill_loss": 1.408218264579773, "epoch": 0.7106747747087698, "step": 19400, "student_mlm_loss": 3.091116428375244 }, { "epoch": 0.7143380467433511, "grad_norm": 20.02590560913086, "learning_rate": 3.8328378013121364e-05, "loss": 3.7767, "step": 19500 }, { "combined_loss": 2.6540353298187256, "distill_loss": 1.451707124710083, "epoch": 0.7143380467433511, "step": 19500, "student_mlm_loss": 3.856363534927368 }, { "epoch": 0.7180013187779325, "grad_norm": 48.139583587646484, "learning_rate": 3.8266948423716735e-05, "loss": 3.4148, "step": 19600 }, { "combined_loss": 3.5710411071777344, "distill_loss": 1.2874888181686401, "epoch": 0.7180013187779325, "step": 19600, "student_mlm_loss": 5.854593276977539 }, { "epoch": 0.7216645908125138, "grad_norm": 5.810763835906982, "learning_rate": 3.820551883431211e-05, "loss": 11.1815, "step": 19700 }, { "combined_loss": 2.022658586502075, "distill_loss": 1.408826231956482, "epoch": 0.7216645908125138, "step": 19700, "student_mlm_loss": 2.636491060256958 }, { "epoch": 0.725327862847095, "grad_norm": 5.03505277633667, "learning_rate": 3.814408924490749e-05, "loss": 3.5792, "step": 19800 }, { "combined_loss": 2.450950860977173, "distill_loss": 1.3786026239395142, "epoch": 0.725327862847095, "step": 19800, "student_mlm_loss": 3.523299217224121 }, { "epoch": 0.7289911348816763, "grad_norm": 44.703548431396484, "learning_rate": 3.808265965550287e-05, "loss": 14.0822, "step": 19900 }, { "combined_loss": 1.8448269367218018, "distill_loss": 1.3061137199401855, "epoch": 0.7289911348816763, "step": 19900, "student_mlm_loss": 2.383540153503418 }, { "epoch": 0.7326544069162576, "grad_norm": 73.46593475341797, "learning_rate": 3.802123006609824e-05, "loss": 3.5648, "step": 20000 }, { "epoch": 0.7326544069162576, "eval_loss": 3.689605474472046, "eval_runtime": 2.2951, "eval_samples_per_second": 3048.261, "eval_steps_per_second": 12.2, "step": 20000 }, { "combined_loss": 5.831945896148682, "distill_loss": 1.2505719661712646, "epoch": 0.7326544069162576, "step": 20000, "student_mlm_loss": 10.41331958770752 }, { "epoch": 0.7363176789508389, "grad_norm": 7.289074897766113, "learning_rate": 3.795980047669361e-05, "loss": 5.9452, "step": 20100 }, { "combined_loss": 14.608942985534668, "distill_loss": 1.4141182899475098, "epoch": 0.7363176789508389, "step": 20100, "student_mlm_loss": 27.803768157958984 }, { "epoch": 0.7399809509854202, "grad_norm": 15.717759132385254, "learning_rate": 3.7898370887288995e-05, "loss": 5.3196, "step": 20200 }, { "combined_loss": 2.34932279586792, "distill_loss": 1.2641239166259766, "epoch": 0.7399809509854202, "step": 20200, "student_mlm_loss": 3.434521436691284 }, { "epoch": 0.7436442230200014, "grad_norm": 75.113037109375, "learning_rate": 3.7836941297884366e-05, "loss": 3.4868, "step": 20300 }, { "combined_loss": 2.0885400772094727, "distill_loss": 1.3560060262680054, "epoch": 0.7436442230200014, "step": 20300, "student_mlm_loss": 2.8210740089416504 }, { "epoch": 0.7473074950545827, "grad_norm": 12.071985244750977, "learning_rate": 3.7775511708479744e-05, "loss": 3.1594, "step": 20400 }, { "combined_loss": 2.104968309402466, "distill_loss": 1.456742286682129, "epoch": 0.7473074950545827, "step": 20400, "student_mlm_loss": 2.7531943321228027 }, { "epoch": 0.750970767089164, "grad_norm": 49.17687225341797, "learning_rate": 3.7714082119075115e-05, "loss": 5.0772, "step": 20500 }, { "combined_loss": 1.9532296657562256, "distill_loss": 1.2734321355819702, "epoch": 0.750970767089164, "step": 20500, "student_mlm_loss": 2.6330270767211914 }, { "epoch": 0.7546340391237454, "grad_norm": 4.601011753082275, "learning_rate": 3.765265252967049e-05, "loss": 8.0874, "step": 20600 }, { "combined_loss": 1.8828588724136353, "distill_loss": 1.35260009765625, "epoch": 0.7546340391237454, "step": 20600, "student_mlm_loss": 2.4131176471710205 }, { "epoch": 0.7582973111583267, "grad_norm": 3.9183883666992188, "learning_rate": 3.759122294026587e-05, "loss": 3.1836, "step": 20700 }, { "combined_loss": 3.261841058731079, "distill_loss": 1.35749351978302, "epoch": 0.7582973111583267, "step": 20700, "student_mlm_loss": 5.166188716888428 }, { "epoch": 0.7619605831929079, "grad_norm": 59.35635757446289, "learning_rate": 3.752979335086124e-05, "loss": 3.446, "step": 20800 }, { "combined_loss": 2.0783181190490723, "distill_loss": 1.3386023044586182, "epoch": 0.7619605831929079, "step": 20800, "student_mlm_loss": 2.8180341720581055 }, { "epoch": 0.7656238552274892, "grad_norm": 14.875, "learning_rate": 3.746836376145662e-05, "loss": 8.5798, "step": 20900 }, { "combined_loss": 1.926416039466858, "distill_loss": 1.3077542781829834, "epoch": 0.7656238552274892, "step": 20900, "student_mlm_loss": 2.5450778007507324 }, { "epoch": 0.7692871272620705, "grad_norm": 23.419870376586914, "learning_rate": 3.740693417205199e-05, "loss": 5.2177, "step": 21000 }, { "combined_loss": 1.7290170192718506, "distill_loss": 1.2258715629577637, "epoch": 0.7692871272620705, "step": 21000, "student_mlm_loss": 2.2321624755859375 }, { "epoch": 0.7729503992966518, "grad_norm": 29.292964935302734, "learning_rate": 3.7345504582647375e-05, "loss": 13.8021, "step": 21100 }, { "combined_loss": 1.9402461051940918, "distill_loss": 1.2749103307724, "epoch": 0.7729503992966518, "step": 21100, "student_mlm_loss": 2.6055819988250732 }, { "epoch": 0.776613671331233, "grad_norm": 9.03995418548584, "learning_rate": 3.7284074993242747e-05, "loss": 6.547, "step": 21200 }, { "combined_loss": 2.2710204124450684, "distill_loss": 1.312924861907959, "epoch": 0.776613671331233, "step": 21200, "student_mlm_loss": 3.229116201400757 }, { "epoch": 0.7802769433658143, "grad_norm": 11.86938190460205, "learning_rate": 3.7222645403838124e-05, "loss": 12.9682, "step": 21300 }, { "combined_loss": 3.114459991455078, "distill_loss": 1.318755865097046, "epoch": 0.7802769433658143, "step": 21300, "student_mlm_loss": 4.910163879394531 }, { "epoch": 0.7839402154003956, "grad_norm": 14.11950969696045, "learning_rate": 3.7161215814433495e-05, "loss": 3.1257, "step": 21400 }, { "combined_loss": 3.882293224334717, "distill_loss": 1.1930829286575317, "epoch": 0.7839402154003956, "step": 21400, "student_mlm_loss": 6.571503639221191 }, { "epoch": 0.7876034874349769, "grad_norm": 22.7275447845459, "learning_rate": 3.709978622502887e-05, "loss": 3.1395, "step": 21500 }, { "combined_loss": 2.00057315826416, "distill_loss": 1.3134089708328247, "epoch": 0.7876034874349769, "step": 21500, "student_mlm_loss": 2.687737226486206 }, { "epoch": 0.7912667594695582, "grad_norm": 56.84143829345703, "learning_rate": 3.703835663562425e-05, "loss": 13.1799, "step": 21600 }, { "combined_loss": 2.094574213027954, "distill_loss": 1.3792191743850708, "epoch": 0.7912667594695582, "step": 21600, "student_mlm_loss": 2.809929370880127 }, { "epoch": 0.7949300315041395, "grad_norm": 30.655105590820312, "learning_rate": 3.697692704621962e-05, "loss": 4.1563, "step": 21700 }, { "combined_loss": 2.167109489440918, "distill_loss": 1.3041900396347046, "epoch": 0.7949300315041395, "step": 21700, "student_mlm_loss": 3.030029058456421 }, { "epoch": 0.7985933035387208, "grad_norm": 7.400668144226074, "learning_rate": 3.6915497456815e-05, "loss": 9.7848, "step": 21800 }, { "combined_loss": 2.2639806270599365, "distill_loss": 1.3241550922393799, "epoch": 0.7985933035387208, "step": 21800, "student_mlm_loss": 3.203806161880493 }, { "epoch": 0.8022565755733021, "grad_norm": 28.212512969970703, "learning_rate": 3.685406786741038e-05, "loss": 2.7595, "step": 21900 }, { "combined_loss": 1.9249264001846313, "distill_loss": 1.337939739227295, "epoch": 0.8022565755733021, "step": 21900, "student_mlm_loss": 2.5119130611419678 }, { "epoch": 0.8059198476078834, "grad_norm": 5.998919486999512, "learning_rate": 3.6792638278005756e-05, "loss": 5.9041, "step": 22000 }, { "epoch": 0.8059198476078834, "eval_loss": 3.310230016708374, "eval_runtime": 1.9252, "eval_samples_per_second": 3633.98, "eval_steps_per_second": 14.544, "step": 22000 }, { "combined_loss": 2.208944320678711, "distill_loss": 1.2883169651031494, "epoch": 0.8059198476078834, "step": 22000, "student_mlm_loss": 3.1295716762542725 }, { "epoch": 0.8095831196424647, "grad_norm": 42.16996383666992, "learning_rate": 3.673120868860113e-05, "loss": 10.4166, "step": 22100 }, { "combined_loss": 2.089421510696411, "distill_loss": 1.3541114330291748, "epoch": 0.8095831196424647, "step": 22100, "student_mlm_loss": 2.8247315883636475 }, { "epoch": 0.8132463916770459, "grad_norm": 10.702394485473633, "learning_rate": 3.6669779099196505e-05, "loss": 3.5812, "step": 22200 }, { "combined_loss": 1.8974239826202393, "distill_loss": 1.3954590559005737, "epoch": 0.8132463916770459, "step": 22200, "student_mlm_loss": 2.3993890285491943 }, { "epoch": 0.8169096637116272, "grad_norm": 149.82179260253906, "learning_rate": 3.6608349509791876e-05, "loss": 3.229, "step": 22300 }, { "combined_loss": 2.0663747787475586, "distill_loss": 1.3880882263183594, "epoch": 0.8169096637116272, "step": 22300, "student_mlm_loss": 2.7446610927581787 }, { "epoch": 0.8205729357462085, "grad_norm": 5.735169410705566, "learning_rate": 3.6546919920387253e-05, "loss": 13.0135, "step": 22400 }, { "combined_loss": 2.3801686763763428, "distill_loss": 1.2296876907348633, "epoch": 0.8205729357462085, "step": 22400, "student_mlm_loss": 3.5306496620178223 }, { "epoch": 0.8242362077807898, "grad_norm": 3.9154951572418213, "learning_rate": 3.648549033098263e-05, "loss": 3.0256, "step": 22500 }, { "combined_loss": 2.619138240814209, "distill_loss": 1.369718313217163, "epoch": 0.8242362077807898, "step": 22500, "student_mlm_loss": 3.868557929992676 }, { "epoch": 0.827899479815371, "grad_norm": 6.706686019897461, "learning_rate": 3.6424060741578e-05, "loss": 6.8373, "step": 22600 }, { "combined_loss": 3.571559429168701, "distill_loss": 1.360285758972168, "epoch": 0.827899479815371, "step": 22600, "student_mlm_loss": 5.782833099365234 }, { "epoch": 0.8315627518499524, "grad_norm": 63.70609664916992, "learning_rate": 3.636263115217338e-05, "loss": 3.1874, "step": 22700 }, { "combined_loss": 6.645792007446289, "distill_loss": 1.3381716012954712, "epoch": 0.8315627518499524, "step": 22700, "student_mlm_loss": 11.953412055969238 }, { "epoch": 0.8352260238845337, "grad_norm": 112.02607727050781, "learning_rate": 3.630120156276876e-05, "loss": 4.1698, "step": 22800 }, { "combined_loss": 2.399282455444336, "distill_loss": 1.2190183401107788, "epoch": 0.8352260238845337, "step": 22800, "student_mlm_loss": 3.5795464515686035 }, { "epoch": 0.838889295919115, "grad_norm": 319.05230712890625, "learning_rate": 3.6239771973364136e-05, "loss": 3.351, "step": 22900 }, { "combined_loss": 5.626018047332764, "distill_loss": 1.3532286882400513, "epoch": 0.838889295919115, "step": 22900, "student_mlm_loss": 9.898807525634766 }, { "epoch": 0.8425525679536963, "grad_norm": 4.46912956237793, "learning_rate": 3.617834238395951e-05, "loss": 3.1926, "step": 23000 }, { "combined_loss": 1.8462562561035156, "distill_loss": 1.339337944984436, "epoch": 0.8425525679536963, "step": 23000, "student_mlm_loss": 2.3531746864318848 }, { "epoch": 0.8462158399882775, "grad_norm": 15.756026268005371, "learning_rate": 3.611691279455488e-05, "loss": 11.7086, "step": 23100 }, { "combined_loss": 3.4101529121398926, "distill_loss": 1.3407546281814575, "epoch": 0.8462158399882775, "step": 23100, "student_mlm_loss": 5.479551315307617 }, { "epoch": 0.8498791120228588, "grad_norm": 12.350069046020508, "learning_rate": 3.6055483205150256e-05, "loss": 3.1203, "step": 23200 }, { "combined_loss": 2.5675039291381836, "distill_loss": 1.2296205759048462, "epoch": 0.8498791120228588, "step": 23200, "student_mlm_loss": 3.9053874015808105 }, { "epoch": 0.8535423840574401, "grad_norm": 11.17212963104248, "learning_rate": 3.5994053615745634e-05, "loss": 6.2935, "step": 23300 }, { "combined_loss": 2.901674270629883, "distill_loss": 1.318871021270752, "epoch": 0.8535423840574401, "step": 23300, "student_mlm_loss": 4.484477519989014 }, { "epoch": 0.8572056560920214, "grad_norm": 11.69430160522461, "learning_rate": 3.593262402634101e-05, "loss": 6.1123, "step": 23400 }, { "combined_loss": 1.962475061416626, "distill_loss": 1.3837331533432007, "epoch": 0.8572056560920214, "step": 23400, "student_mlm_loss": 2.541217088699341 }, { "epoch": 0.8608689281266027, "grad_norm": 6.221428394317627, "learning_rate": 3.587119443693638e-05, "loss": 5.0621, "step": 23500 }, { "combined_loss": 2.3063066005706787, "distill_loss": 1.364685297012329, "epoch": 0.8608689281266027, "step": 23500, "student_mlm_loss": 3.2479279041290283 }, { "epoch": 0.8645322001611839, "grad_norm": 3.200302839279175, "learning_rate": 3.580976484753176e-05, "loss": 3.1679, "step": 23600 }, { "combined_loss": 14.653901100158691, "distill_loss": 1.3521461486816406, "epoch": 0.8645322001611839, "step": 23600, "student_mlm_loss": 27.955656051635742 }, { "epoch": 0.8681954721957652, "grad_norm": 18.003841400146484, "learning_rate": 3.574833525812714e-05, "loss": 4.2524, "step": 23700 }, { "combined_loss": 2.05013108253479, "distill_loss": 1.473749041557312, "epoch": 0.8681954721957652, "step": 23700, "student_mlm_loss": 2.6265130043029785 }, { "epoch": 0.8718587442303466, "grad_norm": 16.64165687561035, "learning_rate": 3.5686905668722516e-05, "loss": 3.4139, "step": 23800 }, { "combined_loss": 3.8039913177490234, "distill_loss": 1.3022387027740479, "epoch": 0.8718587442303466, "step": 23800, "student_mlm_loss": 6.305744171142578 }, { "epoch": 0.8755220162649279, "grad_norm": 6.90595817565918, "learning_rate": 3.562547607931789e-05, "loss": 5.4512, "step": 23900 }, { "combined_loss": 2.0175633430480957, "distill_loss": 1.2362921237945557, "epoch": 0.8755220162649279, "step": 23900, "student_mlm_loss": 2.7988343238830566 }, { "epoch": 0.8791852882995091, "grad_norm": 26.792980194091797, "learning_rate": 3.556404648991326e-05, "loss": 6.622, "step": 24000 }, { "epoch": 0.8791852882995091, "eval_loss": 3.643918991088867, "eval_runtime": 1.9198, "eval_samples_per_second": 3644.043, "eval_steps_per_second": 14.585, "step": 24000 }, { "combined_loss": 2.1716020107269287, "distill_loss": 1.3234556913375854, "epoch": 0.8791852882995091, "step": 24000, "student_mlm_loss": 3.0197484493255615 }, { "epoch": 0.8828485603340904, "grad_norm": 4.8087568283081055, "learning_rate": 3.550261690050864e-05, "loss": 4.0542, "step": 24100 }, { "combined_loss": 13.035262107849121, "distill_loss": 1.353433609008789, "epoch": 0.8828485603340904, "step": 24100, "student_mlm_loss": 24.717090606689453 }, { "epoch": 0.8865118323686717, "grad_norm": 10.60560417175293, "learning_rate": 3.5441187311104014e-05, "loss": 3.1068, "step": 24200 }, { "combined_loss": 1.8867456912994385, "distill_loss": 1.2289210557937622, "epoch": 0.8865118323686717, "step": 24200, "student_mlm_loss": 2.544570207595825 }, { "epoch": 0.890175104403253, "grad_norm": 11.34473705291748, "learning_rate": 3.537975772169939e-05, "loss": 2.9801, "step": 24300 }, { "combined_loss": 1.7472858428955078, "distill_loss": 1.229453206062317, "epoch": 0.890175104403253, "step": 24300, "student_mlm_loss": 2.265118360519409 }, { "epoch": 0.8938383764378343, "grad_norm": 17.742507934570312, "learning_rate": 3.531832813229476e-05, "loss": 4.6617, "step": 24400 }, { "combined_loss": 1.9173786640167236, "distill_loss": 1.3212807178497314, "epoch": 0.8938383764378343, "step": 24400, "student_mlm_loss": 2.513476610183716 }, { "epoch": 0.8975016484724155, "grad_norm": 14.223791122436523, "learning_rate": 3.525689854289014e-05, "loss": 3.0537, "step": 24500 }, { "combined_loss": 1.7878549098968506, "distill_loss": 1.2908958196640015, "epoch": 0.8975016484724155, "step": 24500, "student_mlm_loss": 2.28481388092041 }, { "epoch": 0.9011649205069968, "grad_norm": 4.241771697998047, "learning_rate": 3.519546895348552e-05, "loss": 7.9255, "step": 24600 }, { "combined_loss": 1.8853719234466553, "distill_loss": 1.3350555896759033, "epoch": 0.9011649205069968, "step": 24600, "student_mlm_loss": 2.4356882572174072 }, { "epoch": 0.9048281925415781, "grad_norm": 5.793640613555908, "learning_rate": 3.513403936408089e-05, "loss": 2.9971, "step": 24700 }, { "combined_loss": 9.072087287902832, "distill_loss": 1.2805593013763428, "epoch": 0.9048281925415781, "step": 24700, "student_mlm_loss": 16.863615036010742 }, { "epoch": 0.9084914645761595, "grad_norm": 4.500351905822754, "learning_rate": 3.507260977467627e-05, "loss": 2.9841, "step": 24800 }, { "combined_loss": 4.229645252227783, "distill_loss": 1.231893539428711, "epoch": 0.9084914645761595, "step": 24800, "student_mlm_loss": 7.2273969650268555 }, { "epoch": 0.9121547366107408, "grad_norm": 24.93678855895996, "learning_rate": 3.501118018527164e-05, "loss": 5.2865, "step": 24900 }, { "combined_loss": 4.519498825073242, "distill_loss": 1.35053288936615, "epoch": 0.9121547366107408, "step": 24900, "student_mlm_loss": 7.688465118408203 }, { "epoch": 0.915818008645322, "grad_norm": 9.416017532348633, "learning_rate": 3.494975059586702e-05, "loss": 2.9688, "step": 25000 }, { "combined_loss": 4.33969783782959, "distill_loss": 1.2811079025268555, "epoch": 0.915818008645322, "step": 25000, "student_mlm_loss": 7.398288249969482 }, { "epoch": 0.9194812806799033, "grad_norm": 41.79585266113281, "learning_rate": 3.4888321006462394e-05, "loss": 12.352, "step": 25100 }, { "combined_loss": 2.398942232131958, "distill_loss": 1.3129199743270874, "epoch": 0.9194812806799033, "step": 25100, "student_mlm_loss": 3.484964609146118 }, { "epoch": 0.9231445527144846, "grad_norm": 27.67843246459961, "learning_rate": 3.482689141705777e-05, "loss": 4.6291, "step": 25200 }, { "combined_loss": 1.8275630474090576, "distill_loss": 1.1290583610534668, "epoch": 0.9231445527144846, "step": 25200, "student_mlm_loss": 2.5260677337646484 }, { "epoch": 0.9268078247490659, "grad_norm": 57.03019332885742, "learning_rate": 3.476546182765314e-05, "loss": 3.8226, "step": 25300 }, { "combined_loss": 1.8621808290481567, "distill_loss": 1.3249785900115967, "epoch": 0.9268078247490659, "step": 25300, "student_mlm_loss": 2.399383068084717 }, { "epoch": 0.9304710967836471, "grad_norm": 5.4275007247924805, "learning_rate": 3.470403223824852e-05, "loss": 3.7803, "step": 25400 }, { "combined_loss": 5.317490100860596, "distill_loss": 1.3810964822769165, "epoch": 0.9304710967836471, "step": 25400, "student_mlm_loss": 9.253883361816406 }, { "epoch": 0.9341343688182284, "grad_norm": 6.36318302154541, "learning_rate": 3.46426026488439e-05, "loss": 17.9114, "step": 25500 }, { "combined_loss": 4.816742897033691, "distill_loss": 1.274537444114685, "epoch": 0.9341343688182284, "step": 25500, "student_mlm_loss": 8.358948707580566 }, { "epoch": 0.9377976408528097, "grad_norm": 4.670822620391846, "learning_rate": 3.458117305943927e-05, "loss": 3.4352, "step": 25600 }, { "combined_loss": 1.7166364192962646, "distill_loss": 1.2876447439193726, "epoch": 0.9377976408528097, "step": 25600, "student_mlm_loss": 2.145627975463867 }, { "epoch": 0.941460912887391, "grad_norm": 16.301795959472656, "learning_rate": 3.451974347003465e-05, "loss": 2.591, "step": 25700 }, { "combined_loss": 1.8349076509475708, "distill_loss": 1.3192713260650635, "epoch": 0.941460912887391, "step": 25700, "student_mlm_loss": 2.350543975830078 }, { "epoch": 0.9451241849219723, "grad_norm": 4.464934349060059, "learning_rate": 3.4458313880630025e-05, "loss": 5.3202, "step": 25800 }, { "combined_loss": 2.022656202316284, "distill_loss": 1.4582451581954956, "epoch": 0.9451241849219723, "step": 25800, "student_mlm_loss": 2.587067127227783 }, { "epoch": 0.9487874569565536, "grad_norm": 13.280508041381836, "learning_rate": 3.43968842912254e-05, "loss": 3.2685, "step": 25900 }, { "combined_loss": 1.7409727573394775, "distill_loss": 1.2449432611465454, "epoch": 0.9487874569565536, "step": 25900, "student_mlm_loss": 2.23700213432312 }, { "epoch": 0.9524507289911349, "grad_norm": 34.54155349731445, "learning_rate": 3.4335454701820774e-05, "loss": 4.4614, "step": 26000 }, { "epoch": 0.9524507289911349, "eval_loss": 3.371135950088501, "eval_runtime": 1.9026, "eval_samples_per_second": 3677.064, "eval_steps_per_second": 14.717, "step": 26000 }, { "combined_loss": 2.1200222969055176, "distill_loss": 1.4147942066192627, "epoch": 0.9524507289911349, "step": 26000, "student_mlm_loss": 2.8252503871917725 }, { "epoch": 0.9561140010257162, "grad_norm": 12.063314437866211, "learning_rate": 3.427402511241615e-05, "loss": 3.8605, "step": 26100 }, { "combined_loss": 2.440842866897583, "distill_loss": 1.4115891456604004, "epoch": 0.9561140010257162, "step": 26100, "student_mlm_loss": 3.4700965881347656 }, { "epoch": 0.9597772730602975, "grad_norm": 3.154322862625122, "learning_rate": 3.421259552301152e-05, "loss": 3.4216, "step": 26200 }, { "combined_loss": 2.0511860847473145, "distill_loss": 1.2086646556854248, "epoch": 0.9597772730602975, "step": 26200, "student_mlm_loss": 2.893707752227783 }, { "epoch": 0.9634405450948788, "grad_norm": 4.469895839691162, "learning_rate": 3.41511659336069e-05, "loss": 8.4313, "step": 26300 }, { "combined_loss": 1.9184556007385254, "distill_loss": 1.311684489250183, "epoch": 0.9634405450948788, "step": 26300, "student_mlm_loss": 2.525226593017578 }, { "epoch": 0.96710381712946, "grad_norm": 37.47445297241211, "learning_rate": 3.408973634420228e-05, "loss": 3.33, "step": 26400 }, { "combined_loss": 1.8568530082702637, "distill_loss": 1.3435510396957397, "epoch": 0.96710381712946, "step": 26400, "student_mlm_loss": 2.370154857635498 }, { "epoch": 0.9707670891640413, "grad_norm": 5.385250091552734, "learning_rate": 3.402830675479765e-05, "loss": 3.0353, "step": 26500 }, { "combined_loss": 2.078137159347534, "distill_loss": 1.4688613414764404, "epoch": 0.9707670891640413, "step": 26500, "student_mlm_loss": 2.687412977218628 }, { "epoch": 0.9744303611986226, "grad_norm": 20.363506317138672, "learning_rate": 3.396687716539303e-05, "loss": 5.5902, "step": 26600 }, { "combined_loss": 2.420652151107788, "distill_loss": 1.3566147089004517, "epoch": 0.9744303611986226, "step": 26600, "student_mlm_loss": 3.484689474105835 }, { "epoch": 0.9780936332332039, "grad_norm": 5.678069591522217, "learning_rate": 3.3905447575988405e-05, "loss": 3.1063, "step": 26700 }, { "combined_loss": 2.2643003463745117, "distill_loss": 1.3446204662322998, "epoch": 0.9780936332332039, "step": 26700, "student_mlm_loss": 3.1839799880981445 }, { "epoch": 0.9817569052677851, "grad_norm": 8.722668647766113, "learning_rate": 3.384401798658378e-05, "loss": 9.3685, "step": 26800 }, { "combined_loss": 8.34331226348877, "distill_loss": 1.3864542245864868, "epoch": 0.9817569052677851, "step": 26800, "student_mlm_loss": 15.3001708984375 }, { "epoch": 0.9854201773023665, "grad_norm": 5.101404190063477, "learning_rate": 3.3782588397179154e-05, "loss": 3.1112, "step": 26900 }, { "combined_loss": 30.241453170776367, "distill_loss": 1.3818217515945435, "epoch": 0.9854201773023665, "step": 26900, "student_mlm_loss": 59.1010856628418 }, { "epoch": 0.9890834493369478, "grad_norm": 3.8359858989715576, "learning_rate": 3.3721158807774525e-05, "loss": 3.348, "step": 27000 }, { "combined_loss": 1.8264105319976807, "distill_loss": 1.2956147193908691, "epoch": 0.9890834493369478, "step": 27000, "student_mlm_loss": 2.357206344604492 }, { "epoch": 0.9927467213715291, "grad_norm": 33.43736267089844, "learning_rate": 3.36597292183699e-05, "loss": 3.5437, "step": 27100 }, { "combined_loss": 2.331777572631836, "distill_loss": 1.3274433612823486, "epoch": 0.9927467213715291, "step": 27100, "student_mlm_loss": 3.3361120223999023 }, { "epoch": 0.9964099934061104, "grad_norm": 2.9736690521240234, "learning_rate": 3.359829962896528e-05, "loss": 2.828, "step": 27200 }, { "combined_loss": 2.0438201427459717, "distill_loss": 1.334372639656067, "epoch": 0.9964099934061104, "step": 27200, "student_mlm_loss": 2.753267526626587 }, { "epoch": 1.0000732654406916, "grad_norm": 3.6774871349334717, "learning_rate": 3.353687003956066e-05, "loss": 3.168, "step": 27300 }, { "combined_loss": 3.4676733016967773, "distill_loss": 1.2681790590286255, "epoch": 1.0000732654406916, "step": 27300, "student_mlm_loss": 5.667167663574219 }, { "epoch": 1.003736537475273, "grad_norm": 20.265796661376953, "learning_rate": 3.347544045015603e-05, "loss": 4.9071, "step": 27400 }, { "combined_loss": 1.740236520767212, "distill_loss": 1.1595730781555176, "epoch": 1.003736537475273, "step": 27400, "student_mlm_loss": 2.3208999633789062 }, { "epoch": 1.0073998095098542, "grad_norm": 14.427675247192383, "learning_rate": 3.341401086075141e-05, "loss": 3.1375, "step": 27500 }, { "combined_loss": 2.0229873657226562, "distill_loss": 1.3961925506591797, "epoch": 1.0073998095098542, "step": 27500, "student_mlm_loss": 2.6497819423675537 }, { "epoch": 1.0110630815444355, "grad_norm": 3.032438039779663, "learning_rate": 3.3352581271346786e-05, "loss": 2.7581, "step": 27600 }, { "combined_loss": 1.9314367771148682, "distill_loss": 1.2618595361709595, "epoch": 1.0110630815444355, "step": 27600, "student_mlm_loss": 2.6010141372680664 }, { "epoch": 1.0147263535790167, "grad_norm": 6.167496681213379, "learning_rate": 3.3291151681942163e-05, "loss": 6.7788, "step": 27700 }, { "combined_loss": 2.247697353363037, "distill_loss": 1.4385483264923096, "epoch": 1.0147263535790167, "step": 27700, "student_mlm_loss": 3.0568461418151855 }, { "epoch": 1.018389625613598, "grad_norm": 4.82693338394165, "learning_rate": 3.3229722092537534e-05, "loss": 5.9229, "step": 27800 }, { "combined_loss": 3.4328160285949707, "distill_loss": 1.319059133529663, "epoch": 1.018389625613598, "step": 27800, "student_mlm_loss": 5.546572685241699 }, { "epoch": 1.0220528976481793, "grad_norm": 13.18911361694336, "learning_rate": 3.3168292503132906e-05, "loss": 3.5041, "step": 27900 }, { "combined_loss": 3.720487594604492, "distill_loss": 1.233067274093628, "epoch": 1.0220528976481793, "step": 27900, "student_mlm_loss": 6.207907676696777 }, { "epoch": 1.0257161696827606, "grad_norm": 10.725250244140625, "learning_rate": 3.310686291372829e-05, "loss": 2.9279, "step": 28000 }, { "epoch": 1.0257161696827606, "eval_loss": 3.3177244663238525, "eval_runtime": 2.0821, "eval_samples_per_second": 3360.034, "eval_steps_per_second": 13.448, "step": 28000 }, { "combined_loss": 2.0106987953186035, "distill_loss": 1.3163011074066162, "epoch": 1.0257161696827606, "step": 28000, "student_mlm_loss": 2.70509672164917 }, { "epoch": 1.0293794417173419, "grad_norm": 5.406506538391113, "learning_rate": 3.304543332432366e-05, "loss": 3.2149, "step": 28100 }, { "combined_loss": 2.042628288269043, "distill_loss": 1.3173636198043823, "epoch": 1.0293794417173419, "step": 28100, "student_mlm_loss": 2.767892837524414 }, { "epoch": 1.0330427137519231, "grad_norm": 3.2733256816864014, "learning_rate": 3.298400373491904e-05, "loss": 6.3856, "step": 28200 }, { "combined_loss": 1.9145760536193848, "distill_loss": 1.438834309577942, "epoch": 1.0330427137519231, "step": 28200, "student_mlm_loss": 2.390317916870117 }, { "epoch": 1.0367059857865044, "grad_norm": 10.546121597290039, "learning_rate": 3.292257414551441e-05, "loss": 3.5422, "step": 28300 }, { "combined_loss": 2.6431736946105957, "distill_loss": 1.367489218711853, "epoch": 1.0367059857865044, "step": 28300, "student_mlm_loss": 3.918858289718628 }, { "epoch": 1.0403692578210857, "grad_norm": 25.674352645874023, "learning_rate": 3.286114455610979e-05, "loss": 6.2258, "step": 28400 }, { "combined_loss": 1.8416577577590942, "distill_loss": 1.2867157459259033, "epoch": 1.0403692578210857, "step": 28400, "student_mlm_loss": 2.396599769592285 }, { "epoch": 1.044032529855667, "grad_norm": 3.6745688915252686, "learning_rate": 3.2799714966705166e-05, "loss": 5.0647, "step": 28500 }, { "combined_loss": 1.9693520069122314, "distill_loss": 1.3039644956588745, "epoch": 1.044032529855667, "step": 28500, "student_mlm_loss": 2.634739637374878 }, { "epoch": 1.0476958018902485, "grad_norm": 40.79129409790039, "learning_rate": 3.273828537730054e-05, "loss": 2.6424, "step": 28600 }, { "combined_loss": 2.4251365661621094, "distill_loss": 1.3121291399002075, "epoch": 1.0476958018902485, "step": 28600, "student_mlm_loss": 3.5381438732147217 }, { "epoch": 1.0513590739248297, "grad_norm": 7.185906410217285, "learning_rate": 3.2676855787895915e-05, "loss": 2.9095, "step": 28700 }, { "combined_loss": 5.781175136566162, "distill_loss": 1.3236074447631836, "epoch": 1.0513590739248297, "step": 28700, "student_mlm_loss": 10.23874282836914 }, { "epoch": 1.055022345959411, "grad_norm": 7.2639079093933105, "learning_rate": 3.2615426198491286e-05, "loss": 3.0536, "step": 28800 }, { "combined_loss": 1.8534462451934814, "distill_loss": 1.433970332145691, "epoch": 1.055022345959411, "step": 28800, "student_mlm_loss": 2.2729220390319824 }, { "epoch": 1.0586856179939923, "grad_norm": 82.9974365234375, "learning_rate": 3.255399660908667e-05, "loss": 3.4605, "step": 28900 }, { "combined_loss": 2.385720729827881, "distill_loss": 1.319982647895813, "epoch": 1.0586856179939923, "step": 28900, "student_mlm_loss": 3.4514589309692383 }, { "epoch": 1.0623488900285736, "grad_norm": 8.101861000061035, "learning_rate": 3.249256701968204e-05, "loss": 2.9531, "step": 29000 }, { "combined_loss": 1.9569958448410034, "distill_loss": 1.350255012512207, "epoch": 1.0623488900285736, "step": 29000, "student_mlm_loss": 2.5637366771698 }, { "epoch": 1.0660121620631549, "grad_norm": 42.843135833740234, "learning_rate": 3.243113743027742e-05, "loss": 3.5336, "step": 29100 }, { "combined_loss": 2.0199599266052246, "distill_loss": 1.1558183431625366, "epoch": 1.0660121620631549, "step": 29100, "student_mlm_loss": 2.884101390838623 }, { "epoch": 1.0696754340977361, "grad_norm": 10.401261329650879, "learning_rate": 3.236970784087279e-05, "loss": 2.6909, "step": 29200 }, { "combined_loss": 1.898897409439087, "distill_loss": 1.2361267805099487, "epoch": 1.0696754340977361, "step": 29200, "student_mlm_loss": 2.5616679191589355 }, { "epoch": 1.0733387061323174, "grad_norm": 13.08026123046875, "learning_rate": 3.230827825146817e-05, "loss": 10.7499, "step": 29300 }, { "combined_loss": 2.385263442993164, "distill_loss": 1.2960166931152344, "epoch": 1.0733387061323174, "step": 29300, "student_mlm_loss": 3.4745099544525146 }, { "epoch": 1.0770019781668987, "grad_norm": 6.8822431564331055, "learning_rate": 3.2246848662063546e-05, "loss": 3.0651, "step": 29400 }, { "combined_loss": 2.1257505416870117, "distill_loss": 1.3224972486495972, "epoch": 1.0770019781668987, "step": 29400, "student_mlm_loss": 2.929003953933716 }, { "epoch": 1.08066525020148, "grad_norm": 3.4312744140625, "learning_rate": 3.218541907265892e-05, "loss": 3.1323, "step": 29500 }, { "combined_loss": 2.0117716789245605, "distill_loss": 1.2447552680969238, "epoch": 1.08066525020148, "step": 29500, "student_mlm_loss": 2.7787880897521973 }, { "epoch": 1.0843285222360612, "grad_norm": 3.970820426940918, "learning_rate": 3.2123989483254295e-05, "loss": 3.7427, "step": 29600 }, { "combined_loss": 2.493256092071533, "distill_loss": 1.27970290184021, "epoch": 1.0843285222360612, "step": 29600, "student_mlm_loss": 3.7068092823028564 }, { "epoch": 1.0879917942706425, "grad_norm": 5.8632426261901855, "learning_rate": 3.206255989384967e-05, "loss": 3.0698, "step": 29700 }, { "combined_loss": 2.017867088317871, "distill_loss": 1.408115029335022, "epoch": 1.0879917942706425, "step": 29700, "student_mlm_loss": 2.6276190280914307 }, { "epoch": 1.0916550663052238, "grad_norm": 7.350955963134766, "learning_rate": 3.200113030444505e-05, "loss": 10.1517, "step": 29800 }, { "combined_loss": 3.020230770111084, "distill_loss": 1.1870992183685303, "epoch": 1.0916550663052238, "step": 29800, "student_mlm_loss": 4.853362083435059 }, { "epoch": 1.095318338339805, "grad_norm": 14.347647666931152, "learning_rate": 3.193970071504042e-05, "loss": 2.8345, "step": 29900 }, { "combined_loss": 1.8037035465240479, "distill_loss": 1.2421637773513794, "epoch": 1.095318338339805, "step": 29900, "student_mlm_loss": 2.365243434906006 }, { "epoch": 1.0989816103743864, "grad_norm": 8.716060638427734, "learning_rate": 3.18782711256358e-05, "loss": 4.9073, "step": 30000 }, { "epoch": 1.0989816103743864, "eval_loss": 3.289705753326416, "eval_runtime": 2.6398, "eval_samples_per_second": 2650.179, "eval_steps_per_second": 10.607, "step": 30000 }, { "combined_loss": 3.3838839530944824, "distill_loss": 1.2657897472381592, "epoch": 1.0989816103743864, "step": 30000, "student_mlm_loss": 5.501977920532227 }, { "epoch": 1.1026448824089676, "grad_norm": 9.78013801574707, "learning_rate": 3.181684153623117e-05, "loss": 6.1366, "step": 30100 }, { "combined_loss": 1.8116616010665894, "distill_loss": 1.3585631847381592, "epoch": 1.1026448824089676, "step": 30100, "student_mlm_loss": 2.2647600173950195 }, { "epoch": 1.106308154443549, "grad_norm": 20.41010856628418, "learning_rate": 3.175541194682655e-05, "loss": 4.7028, "step": 30200 }, { "combined_loss": 1.9074151515960693, "distill_loss": 1.119224190711975, "epoch": 1.106308154443549, "step": 30200, "student_mlm_loss": 2.695605993270874 }, { "epoch": 1.1099714264781302, "grad_norm": 7.005733966827393, "learning_rate": 3.1693982357421926e-05, "loss": 4.9073, "step": 30300 }, { "combined_loss": 1.7690558433532715, "distill_loss": 1.2762707471847534, "epoch": 1.1099714264781302, "step": 30300, "student_mlm_loss": 2.2618408203125 }, { "epoch": 1.1136346985127115, "grad_norm": 4.290195465087891, "learning_rate": 3.16325527680173e-05, "loss": 4.1257, "step": 30400 }, { "combined_loss": 15.505983352661133, "distill_loss": 1.252361536026001, "epoch": 1.1136346985127115, "step": 30400, "student_mlm_loss": 29.759605407714844 }, { "epoch": 1.1172979705472927, "grad_norm": 27.59025764465332, "learning_rate": 3.1571123178612675e-05, "loss": 3.6319, "step": 30500 }, { "combined_loss": 3.190175771713257, "distill_loss": 1.237632155418396, "epoch": 1.1172979705472927, "step": 30500, "student_mlm_loss": 5.142719268798828 }, { "epoch": 1.120961242581874, "grad_norm": 35.681365966796875, "learning_rate": 3.150969358920805e-05, "loss": 5.2866, "step": 30600 }, { "combined_loss": 2.1486501693725586, "distill_loss": 1.3570821285247803, "epoch": 1.120961242581874, "step": 30600, "student_mlm_loss": 2.940218448638916 }, { "epoch": 1.1246245146164555, "grad_norm": 28.920949935913086, "learning_rate": 3.144826399980343e-05, "loss": 11.35, "step": 30700 }, { "combined_loss": 3.544619560241699, "distill_loss": 1.3219174146652222, "epoch": 1.1246245146164555, "step": 30700, "student_mlm_loss": 5.767321586608887 }, { "epoch": 1.1282877866510368, "grad_norm": 36.29865264892578, "learning_rate": 3.13868344103988e-05, "loss": 8.8748, "step": 30800 }, { "combined_loss": 3.136960744857788, "distill_loss": 1.4069170951843262, "epoch": 1.1282877866510368, "step": 30800, "student_mlm_loss": 4.86700439453125 }, { "epoch": 1.131951058685618, "grad_norm": 8.498424530029297, "learning_rate": 3.132540482099417e-05, "loss": 2.6175, "step": 30900 }, { "combined_loss": 2.584123373031616, "distill_loss": 1.3318666219711304, "epoch": 1.131951058685618, "step": 30900, "student_mlm_loss": 3.8363800048828125 }, { "epoch": 1.1356143307201993, "grad_norm": 8.784627914428711, "learning_rate": 3.126397523158955e-05, "loss": 3.7912, "step": 31000 }, { "combined_loss": 4.065792083740234, "distill_loss": 1.279055118560791, "epoch": 1.1356143307201993, "step": 31000, "student_mlm_loss": 6.8525285720825195 }, { "epoch": 1.1392776027547806, "grad_norm": 15.763399124145508, "learning_rate": 3.120254564218493e-05, "loss": 7.3671, "step": 31100 }, { "combined_loss": 1.9532334804534912, "distill_loss": 1.2137418985366821, "epoch": 1.1392776027547806, "step": 31100, "student_mlm_loss": 2.6927249431610107 }, { "epoch": 1.142940874789362, "grad_norm": 6.777341842651367, "learning_rate": 3.1141116052780306e-05, "loss": 2.8877, "step": 31200 }, { "combined_loss": 3.5847015380859375, "distill_loss": 1.3712694644927979, "epoch": 1.142940874789362, "step": 31200, "student_mlm_loss": 5.798133850097656 }, { "epoch": 1.1466041468239432, "grad_norm": 6.115112781524658, "learning_rate": 3.107968646337568e-05, "loss": 3.3763, "step": 31300 }, { "combined_loss": 1.899533748626709, "distill_loss": 1.2805981636047363, "epoch": 1.1466041468239432, "step": 31300, "student_mlm_loss": 2.5184693336486816 }, { "epoch": 1.1502674188585245, "grad_norm": 3.3896713256835938, "learning_rate": 3.1018256873971055e-05, "loss": 3.2932, "step": 31400 }, { "combined_loss": 1.9794254302978516, "distill_loss": 1.3896270990371704, "epoch": 1.1502674188585245, "step": 31400, "student_mlm_loss": 2.5692238807678223 }, { "epoch": 1.1539306908931057, "grad_norm": 12.824034690856934, "learning_rate": 3.095682728456643e-05, "loss": 3.5341, "step": 31500 }, { "combined_loss": 2.5983529090881348, "distill_loss": 1.2135576009750366, "epoch": 1.1539306908931057, "step": 31500, "student_mlm_loss": 3.9831480979919434 }, { "epoch": 1.157593962927687, "grad_norm": 73.47982025146484, "learning_rate": 3.089539769516181e-05, "loss": 2.9879, "step": 31600 }, { "combined_loss": 1.8584779500961304, "distill_loss": 1.3214514255523682, "epoch": 1.157593962927687, "step": 31600, "student_mlm_loss": 2.3955044746398926 }, { "epoch": 1.1612572349622683, "grad_norm": 5.6778340339660645, "learning_rate": 3.083396810575718e-05, "loss": 2.9781, "step": 31700 }, { "combined_loss": 4.854001045227051, "distill_loss": 1.2088978290557861, "epoch": 1.1612572349622683, "step": 31700, "student_mlm_loss": 8.499104499816895 }, { "epoch": 1.1649205069968496, "grad_norm": 17.93754768371582, "learning_rate": 3.077253851635255e-05, "loss": 3.5773, "step": 31800 }, { "combined_loss": 1.9064607620239258, "distill_loss": 1.363638997077942, "epoch": 1.1649205069968496, "step": 31800, "student_mlm_loss": 2.449282646179199 }, { "epoch": 1.1685837790314308, "grad_norm": 8.912027359008789, "learning_rate": 3.071110892694794e-05, "loss": 3.0949, "step": 31900 }, { "combined_loss": 1.9666361808776855, "distill_loss": 1.3997029066085815, "epoch": 1.1685837790314308, "step": 31900, "student_mlm_loss": 2.5335693359375 }, { "epoch": 1.1722470510660121, "grad_norm": 21.05866050720215, "learning_rate": 3.064967933754331e-05, "loss": 2.965, "step": 32000 }, { "epoch": 1.1722470510660121, "eval_loss": 3.516061544418335, "eval_runtime": 2.6391, "eval_samples_per_second": 2650.903, "eval_steps_per_second": 10.61, "step": 32000 }, { "combined_loss": 2.466904640197754, "distill_loss": 1.2619636058807373, "epoch": 1.1722470510660121, "step": 32000, "student_mlm_loss": 3.6718459129333496 }, { "epoch": 1.1759103231005934, "grad_norm": 14.288066864013672, "learning_rate": 3.0588249748138686e-05, "loss": 6.5656, "step": 32100 }, { "combined_loss": 5.987391471862793, "distill_loss": 1.3964972496032715, "epoch": 1.1759103231005934, "step": 32100, "student_mlm_loss": 10.578286170959473 }, { "epoch": 1.1795735951351747, "grad_norm": 10.953961372375488, "learning_rate": 3.052682015873406e-05, "loss": 7.1246, "step": 32200 }, { "combined_loss": 1.758845567703247, "distill_loss": 1.2731348276138306, "epoch": 1.1795735951351747, "step": 32200, "student_mlm_loss": 2.244556188583374 }, { "epoch": 1.183236867169756, "grad_norm": 17.076087951660156, "learning_rate": 3.046539056932944e-05, "loss": 7.3734, "step": 32300 }, { "combined_loss": 1.7941749095916748, "distill_loss": 1.282630205154419, "epoch": 1.183236867169756, "step": 32300, "student_mlm_loss": 2.3057196140289307 }, { "epoch": 1.1869001392043372, "grad_norm": 11.33812427520752, "learning_rate": 3.040396097992481e-05, "loss": 5.4979, "step": 32400 }, { "combined_loss": 2.379426956176758, "distill_loss": 1.2975032329559326, "epoch": 1.1869001392043372, "step": 32400, "student_mlm_loss": 3.461350917816162 }, { "epoch": 1.1905634112389185, "grad_norm": 3.6378591060638428, "learning_rate": 3.0342531390520184e-05, "loss": 5.077, "step": 32500 }, { "combined_loss": 1.835166573524475, "distill_loss": 1.294168472290039, "epoch": 1.1905634112389185, "step": 32500, "student_mlm_loss": 2.376164674758911 }, { "epoch": 1.1942266832735, "grad_norm": 23.017444610595703, "learning_rate": 3.0281101801115562e-05, "loss": 3.1428, "step": 32600 }, { "combined_loss": 1.8867619037628174, "distill_loss": 1.2372292280197144, "epoch": 1.1942266832735, "step": 32600, "student_mlm_loss": 2.536294460296631 }, { "epoch": 1.197889955308081, "grad_norm": 7.055652141571045, "learning_rate": 3.0219672211710937e-05, "loss": 8.7118, "step": 32700 }, { "combined_loss": 6.59044075012207, "distill_loss": 1.3554973602294922, "epoch": 1.197889955308081, "step": 32700, "student_mlm_loss": 11.825384140014648 }, { "epoch": 1.2015532273426626, "grad_norm": 6.935373783111572, "learning_rate": 3.0158242622306314e-05, "loss": 7.5763, "step": 32800 }, { "combined_loss": 2.4971964359283447, "distill_loss": 1.2960432767868042, "epoch": 1.2015532273426626, "step": 32800, "student_mlm_loss": 3.698349714279175 }, { "epoch": 1.2052164993772438, "grad_norm": 19.48725700378418, "learning_rate": 3.009681303290169e-05, "loss": 5.1993, "step": 32900 }, { "combined_loss": 2.639206886291504, "distill_loss": 1.2536990642547607, "epoch": 1.2052164993772438, "step": 32900, "student_mlm_loss": 4.024714469909668 }, { "epoch": 1.2088797714118251, "grad_norm": 215.4875946044922, "learning_rate": 3.0035383443497067e-05, "loss": 3.9297, "step": 33000 }, { "combined_loss": 2.1888670921325684, "distill_loss": 1.4587746858596802, "epoch": 1.2088797714118251, "step": 33000, "student_mlm_loss": 2.918959379196167 }, { "epoch": 1.2125430434464064, "grad_norm": 5.346382141113281, "learning_rate": 2.997395385409244e-05, "loss": 3.3704, "step": 33100 }, { "combined_loss": 2.5722949504852295, "distill_loss": 1.2250982522964478, "epoch": 1.2125430434464064, "step": 33100, "student_mlm_loss": 3.9194915294647217 }, { "epoch": 1.2162063154809877, "grad_norm": 21.193038940429688, "learning_rate": 2.991252426468782e-05, "loss": 3.22, "step": 33200 }, { "combined_loss": 1.8822517395019531, "distill_loss": 1.264020323753357, "epoch": 1.2162063154809877, "step": 33200, "student_mlm_loss": 2.5004830360412598 }, { "epoch": 1.219869587515569, "grad_norm": 8.840603828430176, "learning_rate": 2.9851094675283193e-05, "loss": 13.091, "step": 33300 }, { "combined_loss": 2.0461645126342773, "distill_loss": 1.3376085758209229, "epoch": 1.219869587515569, "step": 33300, "student_mlm_loss": 2.7547202110290527 }, { "epoch": 1.2235328595501502, "grad_norm": 16.414852142333984, "learning_rate": 2.9789665085878564e-05, "loss": 3.6096, "step": 33400 }, { "combined_loss": 1.8437246084213257, "distill_loss": 1.2731173038482666, "epoch": 1.2235328595501502, "step": 33400, "student_mlm_loss": 2.4143319129943848 }, { "epoch": 1.2271961315847315, "grad_norm": 5.047356605529785, "learning_rate": 2.9728235496473946e-05, "loss": 10.6014, "step": 33500 }, { "combined_loss": 2.0613672733306885, "distill_loss": 1.1784592866897583, "epoch": 1.2271961315847315, "step": 33500, "student_mlm_loss": 2.944275140762329 }, { "epoch": 1.2308594036193128, "grad_norm": 8.502574920654297, "learning_rate": 2.9666805907069317e-05, "loss": 12.6532, "step": 33600 }, { "combined_loss": 2.301725149154663, "distill_loss": 1.2482868432998657, "epoch": 1.2308594036193128, "step": 33600, "student_mlm_loss": 3.355163335800171 }, { "epoch": 1.234522675653894, "grad_norm": 25.97445297241211, "learning_rate": 2.9605376317664695e-05, "loss": 3.1296, "step": 33700 }, { "combined_loss": 1.8135402202606201, "distill_loss": 1.309229850769043, "epoch": 1.234522675653894, "step": 33700, "student_mlm_loss": 2.3178505897521973 }, { "epoch": 1.2381859476884753, "grad_norm": 7.912507057189941, "learning_rate": 2.954394672826007e-05, "loss": 2.9749, "step": 33800 }, { "combined_loss": 1.9506487846374512, "distill_loss": 1.3808802366256714, "epoch": 1.2381859476884753, "step": 33800, "student_mlm_loss": 2.5204174518585205 }, { "epoch": 1.2418492197230566, "grad_norm": 28.239988327026367, "learning_rate": 2.9482517138855447e-05, "loss": 5.7527, "step": 33900 }, { "combined_loss": 1.881349802017212, "distill_loss": 1.3489292860031128, "epoch": 1.2418492197230566, "step": 33900, "student_mlm_loss": 2.4137701988220215 }, { "epoch": 1.245512491757638, "grad_norm": 25.953353881835938, "learning_rate": 2.942108754945082e-05, "loss": 4.0339, "step": 34000 }, { "epoch": 1.245512491757638, "eval_loss": 3.297154188156128, "eval_runtime": 2.3826, "eval_samples_per_second": 2936.248, "eval_steps_per_second": 11.752, "step": 34000 }, { "combined_loss": 2.5429787635803223, "distill_loss": 1.2718520164489746, "epoch": 1.245512491757638, "step": 34000, "student_mlm_loss": 3.814105272293091 }, { "epoch": 1.2491757637922192, "grad_norm": 48.45500183105469, "learning_rate": 2.9359657960046196e-05, "loss": 6.1408, "step": 34100 }, { "combined_loss": 4.794422626495361, "distill_loss": 1.3052036762237549, "epoch": 1.2491757637922192, "step": 34100, "student_mlm_loss": 8.283641815185547 }, { "epoch": 1.2528390358268005, "grad_norm": 6.028234004974365, "learning_rate": 2.9298228370641574e-05, "loss": 2.9116, "step": 34200 }, { "combined_loss": 2.125443458557129, "distill_loss": 1.25053071975708, "epoch": 1.2528390358268005, "step": 34200, "student_mlm_loss": 3.0003561973571777 }, { "epoch": 1.2565023078613817, "grad_norm": 15.824817657470703, "learning_rate": 2.9236798781236945e-05, "loss": 3.5834, "step": 34300 }, { "combined_loss": 2.156796932220459, "distill_loss": 1.1805670261383057, "epoch": 1.2565023078613817, "step": 34300, "student_mlm_loss": 3.1330268383026123 }, { "epoch": 1.260165579895963, "grad_norm": 8.438326835632324, "learning_rate": 2.9175369191832326e-05, "loss": 5.0724, "step": 34400 }, { "combined_loss": 3.144615888595581, "distill_loss": 1.2467416524887085, "epoch": 1.260165579895963, "step": 34400, "student_mlm_loss": 5.042490005493164 }, { "epoch": 1.2638288519305443, "grad_norm": 3.7252449989318848, "learning_rate": 2.9113939602427697e-05, "loss": 2.9306, "step": 34500 }, { "combined_loss": 4.309004783630371, "distill_loss": 1.2629985809326172, "epoch": 1.2638288519305443, "step": 34500, "student_mlm_loss": 7.355010986328125 }, { "epoch": 1.2674921239651256, "grad_norm": 14.86426067352295, "learning_rate": 2.9052510013023078e-05, "loss": 3.059, "step": 34600 }, { "combined_loss": 2.128227472305298, "distill_loss": 1.3674236536026, "epoch": 1.2674921239651256, "step": 34600, "student_mlm_loss": 2.889031171798706 }, { "epoch": 1.271155395999707, "grad_norm": 14.947731018066406, "learning_rate": 2.899108042361845e-05, "loss": 3.0461, "step": 34700 }, { "combined_loss": 1.9557018280029297, "distill_loss": 1.3122907876968384, "epoch": 1.271155395999707, "step": 34700, "student_mlm_loss": 2.5991127490997314 }, { "epoch": 1.2748186680342881, "grad_norm": 4.714714527130127, "learning_rate": 2.8929650834213824e-05, "loss": 3.0221, "step": 34800 }, { "combined_loss": 1.7830932140350342, "distill_loss": 1.278725028038025, "epoch": 1.2748186680342881, "step": 34800, "student_mlm_loss": 2.287461519241333 }, { "epoch": 1.2784819400688696, "grad_norm": 13.885130882263184, "learning_rate": 2.88682212448092e-05, "loss": 8.529, "step": 34900 }, { "combined_loss": 4.974426746368408, "distill_loss": 1.4173694849014282, "epoch": 1.2784819400688696, "step": 34900, "student_mlm_loss": 8.53148365020752 }, { "epoch": 1.2821452121034507, "grad_norm": 6.786545753479004, "learning_rate": 2.8806791655404576e-05, "loss": 3.563, "step": 35000 }, { "combined_loss": 1.7134695053100586, "distill_loss": 1.2251827716827393, "epoch": 1.2821452121034507, "step": 35000, "student_mlm_loss": 2.201756238937378 }, { "epoch": 1.2858084841380322, "grad_norm": 18.235891342163086, "learning_rate": 2.8745362065999954e-05, "loss": 6.9188, "step": 35100 }, { "combined_loss": 6.00921106338501, "distill_loss": 1.3103188276290894, "epoch": 1.2858084841380322, "step": 35100, "student_mlm_loss": 10.70810317993164 }, { "epoch": 1.2894717561726134, "grad_norm": 6.3708696365356445, "learning_rate": 2.8683932476595328e-05, "loss": 6.7695, "step": 35200 }, { "combined_loss": 2.2400052547454834, "distill_loss": 1.3289698362350464, "epoch": 1.2894717561726134, "step": 35200, "student_mlm_loss": 3.151040554046631 }, { "epoch": 1.2931350282071947, "grad_norm": 7.5602946281433105, "learning_rate": 2.8622502887190706e-05, "loss": 9.8005, "step": 35300 }, { "combined_loss": 1.848390817642212, "distill_loss": 1.2897430658340454, "epoch": 1.2931350282071947, "step": 35300, "student_mlm_loss": 2.407038688659668 }, { "epoch": 1.296798300241776, "grad_norm": 24.799640655517578, "learning_rate": 2.8561073297786077e-05, "loss": 3.2996, "step": 35400 }, { "combined_loss": 4.894403457641602, "distill_loss": 1.282358169555664, "epoch": 1.296798300241776, "step": 35400, "student_mlm_loss": 8.506448745727539 }, { "epoch": 1.3004615722763573, "grad_norm": 34.4364013671875, "learning_rate": 2.849964370838146e-05, "loss": 3.399, "step": 35500 }, { "combined_loss": 1.7965787649154663, "distill_loss": 1.3232142925262451, "epoch": 1.3004615722763573, "step": 35500, "student_mlm_loss": 2.2699432373046875 }, { "epoch": 1.3041248443109386, "grad_norm": 7.9551825523376465, "learning_rate": 2.843821411897683e-05, "loss": 3.1887, "step": 35600 }, { "combined_loss": 1.855729579925537, "distill_loss": 1.2217527627944946, "epoch": 1.3041248443109386, "step": 35600, "student_mlm_loss": 2.48970627784729 }, { "epoch": 1.3077881163455198, "grad_norm": 5.838754177093506, "learning_rate": 2.8376784529572204e-05, "loss": 3.1524, "step": 35700 }, { "combined_loss": 2.3417129516601562, "distill_loss": 1.2872867584228516, "epoch": 1.3077881163455198, "step": 35700, "student_mlm_loss": 3.39613938331604 }, { "epoch": 1.3114513883801011, "grad_norm": 4.118559837341309, "learning_rate": 2.831535494016758e-05, "loss": 7.9754, "step": 35800 }, { "combined_loss": 3.906961679458618, "distill_loss": 1.2905327081680298, "epoch": 1.3114513883801011, "step": 35800, "student_mlm_loss": 6.523390769958496 }, { "epoch": 1.3151146604146824, "grad_norm": 5.229255199432373, "learning_rate": 2.8253925350762956e-05, "loss": 3.6586, "step": 35900 }, { "combined_loss": 2.6259002685546875, "distill_loss": 1.217278003692627, "epoch": 1.3151146604146824, "step": 35900, "student_mlm_loss": 4.034522533416748 }, { "epoch": 1.3187779324492637, "grad_norm": 9.182631492614746, "learning_rate": 2.8192495761358334e-05, "loss": 8.5789, "step": 36000 }, { "epoch": 1.3187779324492637, "eval_loss": 3.3097567558288574, "eval_runtime": 1.9861, "eval_samples_per_second": 3522.525, "eval_steps_per_second": 14.098, "step": 36000 }, { "combined_loss": 15.921034812927246, "distill_loss": 1.2575896978378296, "epoch": 1.3187779324492637, "step": 36000, "student_mlm_loss": 30.58448028564453 }, { "epoch": 1.322441204483845, "grad_norm": 5.999209880828857, "learning_rate": 2.813106617195371e-05, "loss": 3.6109, "step": 36100 }, { "combined_loss": 204.92184448242188, "distill_loss": 1.2291535139083862, "epoch": 1.322441204483845, "step": 36100, "student_mlm_loss": 408.6145324707031 }, { "epoch": 1.3261044765184262, "grad_norm": 8.351846694946289, "learning_rate": 2.8069636582549086e-05, "loss": 5.9753, "step": 36200 }, { "combined_loss": 3.7332310676574707, "distill_loss": 1.377110481262207, "epoch": 1.3261044765184262, "step": 36200, "student_mlm_loss": 6.089351654052734 }, { "epoch": 1.3297677485530075, "grad_norm": 4.738751411437988, "learning_rate": 2.800820699314446e-05, "loss": 2.8706, "step": 36300 }, { "combined_loss": 1.949210286140442, "distill_loss": 1.1820151805877686, "epoch": 1.3297677485530075, "step": 36300, "student_mlm_loss": 2.7164053916931152 }, { "epoch": 1.3334310205875888, "grad_norm": 3.7835421562194824, "learning_rate": 2.7946777403739832e-05, "loss": 3.5794, "step": 36400 }, { "combined_loss": 1.7922800779342651, "distill_loss": 1.2455928325653076, "epoch": 1.3334310205875888, "step": 36400, "student_mlm_loss": 2.3389673233032227 }, { "epoch": 1.33709429262217, "grad_norm": 22.528881072998047, "learning_rate": 2.788534781433521e-05, "loss": 3.8623, "step": 36500 }, { "combined_loss": 1.788147211074829, "distill_loss": 1.2254056930541992, "epoch": 1.33709429262217, "step": 36500, "student_mlm_loss": 2.350888729095459 }, { "epoch": 1.3407575646567513, "grad_norm": 5.876169681549072, "learning_rate": 2.7823918224930584e-05, "loss": 8.4137, "step": 36600 }, { "combined_loss": 2.0377962589263916, "distill_loss": 1.2204126119613647, "epoch": 1.3407575646567513, "step": 36600, "student_mlm_loss": 2.855179786682129 }, { "epoch": 1.3444208366913326, "grad_norm": 20.921276092529297, "learning_rate": 2.7762488635525962e-05, "loss": 3.5857, "step": 36700 }, { "combined_loss": 1.9521321058273315, "distill_loss": 1.249513864517212, "epoch": 1.3444208366913326, "step": 36700, "student_mlm_loss": 2.654750347137451 }, { "epoch": 1.348084108725914, "grad_norm": 13.851704597473145, "learning_rate": 2.7701059046121336e-05, "loss": 3.8678, "step": 36800 }, { "combined_loss": 2.2560389041900635, "distill_loss": 1.2315130233764648, "epoch": 1.348084108725914, "step": 36800, "student_mlm_loss": 3.280564785003662 }, { "epoch": 1.3517473807604952, "grad_norm": 16.56214714050293, "learning_rate": 2.7639629456716714e-05, "loss": 3.3998, "step": 36900 }, { "combined_loss": 3.098896026611328, "distill_loss": 1.3377043008804321, "epoch": 1.3517473807604952, "step": 36900, "student_mlm_loss": 4.860087871551514 }, { "epoch": 1.3554106527950767, "grad_norm": 35.91291809082031, "learning_rate": 2.757819986731209e-05, "loss": 3.761, "step": 37000 }, { "combined_loss": 1.9794631004333496, "distill_loss": 1.3087836503982544, "epoch": 1.3554106527950767, "step": 37000, "student_mlm_loss": 2.6501426696777344 }, { "epoch": 1.3590739248296577, "grad_norm": 11.776296615600586, "learning_rate": 2.7516770277907466e-05, "loss": 3.9886, "step": 37100 }, { "combined_loss": 2.3107573986053467, "distill_loss": 1.268768310546875, "epoch": 1.3590739248296577, "step": 37100, "student_mlm_loss": 3.3527464866638184 }, { "epoch": 1.3627371968642392, "grad_norm": 13.237029075622559, "learning_rate": 2.745534068850284e-05, "loss": 5.3161, "step": 37200 }, { "combined_loss": 4.210747718811035, "distill_loss": 1.4009877443313599, "epoch": 1.3627371968642392, "step": 37200, "student_mlm_loss": 7.0205078125 }, { "epoch": 1.3664004688988205, "grad_norm": 18.256624221801758, "learning_rate": 2.7393911099098212e-05, "loss": 3.3122, "step": 37300 }, { "combined_loss": 2.467655658721924, "distill_loss": 1.3313319683074951, "epoch": 1.3664004688988205, "step": 37300, "student_mlm_loss": 3.6039793491363525 }, { "epoch": 1.3700637409334018, "grad_norm": 3.6821129322052, "learning_rate": 2.7332481509693593e-05, "loss": 2.5638, "step": 37400 }, { "combined_loss": 4.0961503982543945, "distill_loss": 1.2590566873550415, "epoch": 1.3700637409334018, "step": 37400, "student_mlm_loss": 6.933243751525879 }, { "epoch": 1.373727012967983, "grad_norm": 9.491351127624512, "learning_rate": 2.7271051920288964e-05, "loss": 5.2572, "step": 37500 }, { "combined_loss": 1.8323596715927124, "distill_loss": 1.2323403358459473, "epoch": 1.373727012967983, "step": 37500, "student_mlm_loss": 2.4323790073394775 }, { "epoch": 1.3773902850025643, "grad_norm": 10.13337516784668, "learning_rate": 2.7209622330884342e-05, "loss": 2.9805, "step": 37600 }, { "combined_loss": 2.7236733436584473, "distill_loss": 1.2598845958709717, "epoch": 1.3773902850025643, "step": 37600, "student_mlm_loss": 4.187462329864502 }, { "epoch": 1.3810535570371456, "grad_norm": 22.098358154296875, "learning_rate": 2.7148192741479716e-05, "loss": 3.1095, "step": 37700 }, { "combined_loss": 1.7910634279251099, "distill_loss": 1.271672010421753, "epoch": 1.3810535570371456, "step": 37700, "student_mlm_loss": 2.310454845428467 }, { "epoch": 1.3847168290717269, "grad_norm": 233.01779174804688, "learning_rate": 2.7086763152075094e-05, "loss": 3.0334, "step": 37800 }, { "combined_loss": 2.449730396270752, "distill_loss": 1.343329906463623, "epoch": 1.3847168290717269, "step": 37800, "student_mlm_loss": 3.556130886077881 }, { "epoch": 1.3883801011063082, "grad_norm": 7.459797382354736, "learning_rate": 2.702533356267047e-05, "loss": 5.0088, "step": 37900 }, { "combined_loss": 2.047302722930908, "distill_loss": 1.2358465194702148, "epoch": 1.3883801011063082, "step": 37900, "student_mlm_loss": 2.8587586879730225 }, { "epoch": 1.3920433731408894, "grad_norm": 3.9627275466918945, "learning_rate": 2.6963903973265843e-05, "loss": 2.7476, "step": 38000 }, { "epoch": 1.3920433731408894, "eval_loss": 4.346156120300293, "eval_runtime": 1.974, "eval_samples_per_second": 3544.088, "eval_steps_per_second": 14.184, "step": 38000 }, { "combined_loss": 2.4468555450439453, "distill_loss": 1.166190505027771, "epoch": 1.3920433731408894, "step": 38000, "student_mlm_loss": 3.72752046585083 }, { "epoch": 1.3957066451754707, "grad_norm": 11.812987327575684, "learning_rate": 2.690247438386122e-05, "loss": 3.8226, "step": 38100 }, { "combined_loss": 2.274935245513916, "distill_loss": 1.3503799438476562, "epoch": 1.3957066451754707, "step": 38100, "student_mlm_loss": 3.199490785598755 }, { "epoch": 1.399369917210052, "grad_norm": 6.545460224151611, "learning_rate": 2.6841044794456592e-05, "loss": 4.1598, "step": 38200 }, { "combined_loss": 2.1577343940734863, "distill_loss": 1.2623993158340454, "epoch": 1.399369917210052, "step": 38200, "student_mlm_loss": 3.0530693531036377 }, { "epoch": 1.4030331892446333, "grad_norm": 7.286951541900635, "learning_rate": 2.6779615205051973e-05, "loss": 3.8211, "step": 38300 }, { "combined_loss": 2.479806900024414, "distill_loss": 1.2152717113494873, "epoch": 1.4030331892446333, "step": 38300, "student_mlm_loss": 3.74434232711792 }, { "epoch": 1.4066964612792145, "grad_norm": 18.360294342041016, "learning_rate": 2.6718185615647344e-05, "loss": 3.3871, "step": 38400 }, { "combined_loss": 1.7289254665374756, "distill_loss": 1.3171356916427612, "epoch": 1.4066964612792145, "step": 38400, "student_mlm_loss": 2.1407151222229004 }, { "epoch": 1.4103597333137958, "grad_norm": 8.086026191711426, "learning_rate": 2.6656756026242726e-05, "loss": 2.6337, "step": 38500 }, { "combined_loss": 1.9621633291244507, "distill_loss": 1.3215687274932861, "epoch": 1.4103597333137958, "step": 38500, "student_mlm_loss": 2.6027579307556152 }, { "epoch": 1.414023005348377, "grad_norm": 13.378824234008789, "learning_rate": 2.6595326436838097e-05, "loss": 3.4032, "step": 38600 }, { "combined_loss": 37.448326110839844, "distill_loss": 1.2198776006698608, "epoch": 1.414023005348377, "step": 38600, "student_mlm_loss": 73.67677307128906 }, { "epoch": 1.4176862773829584, "grad_norm": 5.834230422973633, "learning_rate": 2.653389684743347e-05, "loss": 6.724, "step": 38700 }, { "combined_loss": 1.8702625036239624, "distill_loss": 1.2802906036376953, "epoch": 1.4176862773829584, "step": 38700, "student_mlm_loss": 2.4602344036102295 }, { "epoch": 1.4213495494175397, "grad_norm": 3.5685741901397705, "learning_rate": 2.647246725802885e-05, "loss": 3.2721, "step": 38800 }, { "combined_loss": 1.7411483526229858, "distill_loss": 1.285083532333374, "epoch": 1.4213495494175397, "step": 38800, "student_mlm_loss": 2.1972131729125977 }, { "epoch": 1.4250128214521212, "grad_norm": 8.644251823425293, "learning_rate": 2.6411037668624223e-05, "loss": 13.6859, "step": 38900 }, { "combined_loss": 3.234241008758545, "distill_loss": 1.2654619216918945, "epoch": 1.4250128214521212, "step": 38900, "student_mlm_loss": 5.203020095825195 }, { "epoch": 1.4286760934867022, "grad_norm": 15.043992042541504, "learning_rate": 2.63496080792196e-05, "loss": 4.3161, "step": 39000 }, { "combined_loss": 2.013312339782715, "distill_loss": 1.2555652856826782, "epoch": 1.4286760934867022, "step": 39000, "student_mlm_loss": 2.771059274673462 }, { "epoch": 1.4323393655212837, "grad_norm": 35.315345764160156, "learning_rate": 2.6288178489814976e-05, "loss": 6.3089, "step": 39100 }, { "combined_loss": 1.7854509353637695, "distill_loss": 1.2994376420974731, "epoch": 1.4323393655212837, "step": 39100, "student_mlm_loss": 2.2714641094207764 }, { "epoch": 1.4360026375558648, "grad_norm": 8.155647277832031, "learning_rate": 2.6226748900410353e-05, "loss": 3.3881, "step": 39200 }, { "combined_loss": 1.8790473937988281, "distill_loss": 1.2656193971633911, "epoch": 1.4360026375558648, "step": 39200, "student_mlm_loss": 2.4924752712249756 }, { "epoch": 1.4396659095904463, "grad_norm": 4.777060508728027, "learning_rate": 2.6165319311005725e-05, "loss": 3.0181, "step": 39300 }, { "combined_loss": 2.2714784145355225, "distill_loss": 1.2724400758743286, "epoch": 1.4396659095904463, "step": 39300, "student_mlm_loss": 3.270516872406006 }, { "epoch": 1.4433291816250275, "grad_norm": 3.7660317420959473, "learning_rate": 2.6103889721601106e-05, "loss": 3.3045, "step": 39400 }, { "combined_loss": 1.9759800434112549, "distill_loss": 1.1767717599868774, "epoch": 1.4433291816250275, "step": 39400, "student_mlm_loss": 2.775188446044922 }, { "epoch": 1.4469924536596088, "grad_norm": 55.78919982910156, "learning_rate": 2.6042460132196477e-05, "loss": 3.5094, "step": 39500 }, { "combined_loss": 2.5586395263671875, "distill_loss": 1.3177176713943481, "epoch": 1.4469924536596088, "step": 39500, "student_mlm_loss": 3.7995612621307373 }, { "epoch": 1.45065572569419, "grad_norm": 11.648473739624023, "learning_rate": 2.598103054279185e-05, "loss": 6.3066, "step": 39600 }, { "combined_loss": 1.8263496160507202, "distill_loss": 1.2649195194244385, "epoch": 1.45065572569419, "step": 39600, "student_mlm_loss": 2.387779712677002 }, { "epoch": 1.4543189977287714, "grad_norm": 4.982020378112793, "learning_rate": 2.591960095338723e-05, "loss": 3.1475, "step": 39700 }, { "combined_loss": 4.95673131942749, "distill_loss": 1.2415388822555542, "epoch": 1.4543189977287714, "step": 39700, "student_mlm_loss": 8.671923637390137 }, { "epoch": 1.4579822697633527, "grad_norm": 4.551340103149414, "learning_rate": 2.5858171363982604e-05, "loss": 6.0043, "step": 39800 }, { "combined_loss": 2.124246597290039, "distill_loss": 1.197386384010315, "epoch": 1.4579822697633527, "step": 39800, "student_mlm_loss": 3.0511069297790527 }, { "epoch": 1.461645541797934, "grad_norm": 41.217533111572266, "learning_rate": 2.579674177457798e-05, "loss": 2.7216, "step": 39900 }, { "combined_loss": 1.8579926490783691, "distill_loss": 1.1948734521865845, "epoch": 1.461645541797934, "step": 39900, "student_mlm_loss": 2.5211119651794434 }, { "epoch": 1.4653088138325152, "grad_norm": 3.3428897857666016, "learning_rate": 2.5735312185173356e-05, "loss": 3.5888, "step": 40000 }, { "epoch": 1.4653088138325152, "eval_loss": 3.433469295501709, "eval_runtime": 2.0987, "eval_samples_per_second": 3333.452, "eval_steps_per_second": 13.341, "step": 40000 }, { "combined_loss": 3.9790029525756836, "distill_loss": 1.2571158409118652, "epoch": 1.4653088138325152, "step": 40000, "student_mlm_loss": 6.700890064239502 }, { "epoch": 1.4689720858670965, "grad_norm": 24.387128829956055, "learning_rate": 2.5673882595768734e-05, "loss": 3.3546, "step": 40100 }, { "combined_loss": 2.113370418548584, "distill_loss": 1.2904696464538574, "epoch": 1.4689720858670965, "step": 40100, "student_mlm_loss": 2.9362711906433105 }, { "epoch": 1.4726353579016778, "grad_norm": 11.271422386169434, "learning_rate": 2.5612453006364108e-05, "loss": 9.1182, "step": 40200 }, { "combined_loss": 1.7249795198440552, "distill_loss": 1.2220125198364258, "epoch": 1.4726353579016778, "step": 40200, "student_mlm_loss": 2.2279465198516846 }, { "epoch": 1.476298629936259, "grad_norm": 88.92086029052734, "learning_rate": 2.555102341695948e-05, "loss": 5.5622, "step": 40300 }, { "combined_loss": 3.5107364654541016, "distill_loss": 1.2663298845291138, "epoch": 1.476298629936259, "step": 40300, "student_mlm_loss": 5.755143165588379 }, { "epoch": 1.4799619019708403, "grad_norm": 4.677048683166504, "learning_rate": 2.5489593827554857e-05, "loss": 5.3278, "step": 40400 }, { "combined_loss": 3.5298116207122803, "distill_loss": 1.1846145391464233, "epoch": 1.4799619019708403, "step": 40400, "student_mlm_loss": 5.875008583068848 }, { "epoch": 1.4836251740054216, "grad_norm": 21.207704544067383, "learning_rate": 2.542816423815023e-05, "loss": 2.9588, "step": 40500 }, { "combined_loss": 2.6109657287597656, "distill_loss": 1.2608091831207275, "epoch": 1.4836251740054216, "step": 40500, "student_mlm_loss": 3.9611220359802246 }, { "epoch": 1.4872884460400029, "grad_norm": 7.7415876388549805, "learning_rate": 2.536673464874561e-05, "loss": 2.706, "step": 40600 }, { "combined_loss": 2.455023765563965, "distill_loss": 1.3175585269927979, "epoch": 1.4872884460400029, "step": 40600, "student_mlm_loss": 3.5924887657165527 }, { "epoch": 1.4909517180745842, "grad_norm": 19.366378784179688, "learning_rate": 2.5305305059340984e-05, "loss": 2.7981, "step": 40700 }, { "combined_loss": 3.624007225036621, "distill_loss": 1.1402699947357178, "epoch": 1.4909517180745842, "step": 40700, "student_mlm_loss": 6.1077446937561035 }, { "epoch": 1.4946149901091654, "grad_norm": 7.310671806335449, "learning_rate": 2.524387546993636e-05, "loss": 29.272, "step": 40800 }, { "combined_loss": 2.2329726219177246, "distill_loss": 1.303555965423584, "epoch": 1.4946149901091654, "step": 40800, "student_mlm_loss": 3.1623895168304443 }, { "epoch": 1.4982782621437467, "grad_norm": 48.7297477722168, "learning_rate": 2.5182445880531736e-05, "loss": 3.1319, "step": 40900 }, { "combined_loss": 1.8255285024642944, "distill_loss": 1.1643202304840088, "epoch": 1.4982782621437467, "step": 40900, "student_mlm_loss": 2.48673677444458 }, { "epoch": 1.5019415341783282, "grad_norm": 32.60409927368164, "learning_rate": 2.5121016291127114e-05, "loss": 8.524, "step": 41000 }, { "combined_loss": 2.896923542022705, "distill_loss": 1.3571655750274658, "epoch": 1.5019415341783282, "step": 41000, "student_mlm_loss": 4.436681747436523 }, { "epoch": 1.5056048062129093, "grad_norm": 4.127974510192871, "learning_rate": 2.5059586701722488e-05, "loss": 6.3087, "step": 41100 }, { "combined_loss": 2.145819664001465, "distill_loss": 1.2983198165893555, "epoch": 1.5056048062129093, "step": 41100, "student_mlm_loss": 2.993319511413574 }, { "epoch": 1.5092680782474908, "grad_norm": 3.873206853866577, "learning_rate": 2.4998157112317863e-05, "loss": 5.279, "step": 41200 }, { "combined_loss": 4.8266730308532715, "distill_loss": 1.1676665544509888, "epoch": 1.5092680782474908, "step": 41200, "student_mlm_loss": 8.485679626464844 }, { "epoch": 1.5129313502820718, "grad_norm": 6.902312755584717, "learning_rate": 2.493672752291324e-05, "loss": 5.3583, "step": 41300 }, { "combined_loss": 1.7068848609924316, "distill_loss": 1.1335561275482178, "epoch": 1.5129313502820718, "step": 41300, "student_mlm_loss": 2.2802135944366455 }, { "epoch": 1.5165946223166533, "grad_norm": 17.415306091308594, "learning_rate": 2.487529793350861e-05, "loss": 2.8319, "step": 41400 }, { "combined_loss": 1.5696630477905273, "distill_loss": 1.152633786201477, "epoch": 1.5165946223166533, "step": 41400, "student_mlm_loss": 1.9866924285888672 }, { "epoch": 1.5202578943512344, "grad_norm": 11.67779541015625, "learning_rate": 2.481386834410399e-05, "loss": 3.0117, "step": 41500 }, { "combined_loss": 1.9209272861480713, "distill_loss": 1.2611881494522095, "epoch": 1.5202578943512344, "step": 41500, "student_mlm_loss": 2.5806663036346436 }, { "epoch": 1.5239211663858159, "grad_norm": 9.814743041992188, "learning_rate": 2.4752438754699364e-05, "loss": 2.8479, "step": 41600 }, { "combined_loss": 4.1822404861450195, "distill_loss": 1.254117488861084, "epoch": 1.5239211663858159, "step": 41600, "student_mlm_loss": 7.110363960266113 }, { "epoch": 1.5275844384203972, "grad_norm": 11.7344970703125, "learning_rate": 2.4691009165294742e-05, "loss": 3.2502, "step": 41700 }, { "combined_loss": 1.7558622360229492, "distill_loss": 1.1821727752685547, "epoch": 1.5275844384203972, "step": 41700, "student_mlm_loss": 2.3295516967773438 }, { "epoch": 1.5312477104549784, "grad_norm": 8.426025390625, "learning_rate": 2.4629579575890116e-05, "loss": 3.3169, "step": 41800 }, { "combined_loss": 1.843000054359436, "distill_loss": 1.1456735134124756, "epoch": 1.5312477104549784, "step": 41800, "student_mlm_loss": 2.5403265953063965 }, { "epoch": 1.5349109824895597, "grad_norm": 3.654872417449951, "learning_rate": 2.456814998648549e-05, "loss": 2.6259, "step": 41900 }, { "combined_loss": 1.7651002407073975, "distill_loss": 1.1741529703140259, "epoch": 1.5349109824895597, "step": 41900, "student_mlm_loss": 2.3560476303100586 }, { "epoch": 1.538574254524141, "grad_norm": 18.605615615844727, "learning_rate": 2.450672039708087e-05, "loss": 2.4854, "step": 42000 }, { "epoch": 1.538574254524141, "eval_loss": 3.4032058715820312, "eval_runtime": 1.8747, "eval_samples_per_second": 3731.788, "eval_steps_per_second": 14.936, "step": 42000 }, { "combined_loss": 2.60400390625, "distill_loss": 1.2034615278244019, "epoch": 1.538574254524141, "step": 42000, "student_mlm_loss": 4.004546165466309 }, { "epoch": 1.5422375265587223, "grad_norm": 6.775146484375, "learning_rate": 2.4445290807676243e-05, "loss": 2.8405, "step": 42100 }, { "combined_loss": 1.7485601902008057, "distill_loss": 1.1682909727096558, "epoch": 1.5422375265587223, "step": 42100, "student_mlm_loss": 2.328829288482666 }, { "epoch": 1.5459007985933035, "grad_norm": 24.79000473022461, "learning_rate": 2.4383861218271617e-05, "loss": 2.9811, "step": 42200 }, { "combined_loss": 2.2294323444366455, "distill_loss": 1.262848138809204, "epoch": 1.5459007985933035, "step": 42200, "student_mlm_loss": 3.196016550064087 }, { "epoch": 1.5495640706278848, "grad_norm": 11.027627944946289, "learning_rate": 2.4322431628866992e-05, "loss": 3.7109, "step": 42300 }, { "combined_loss": 1.8129802942276, "distill_loss": 1.205324411392212, "epoch": 1.5495640706278848, "step": 42300, "student_mlm_loss": 2.4206361770629883 }, { "epoch": 1.553227342662466, "grad_norm": 6.328401565551758, "learning_rate": 2.426100203946237e-05, "loss": 31.168, "step": 42400 }, { "combined_loss": 2.391860246658325, "distill_loss": 1.1356655359268188, "epoch": 1.553227342662466, "step": 42400, "student_mlm_loss": 3.648054838180542 }, { "epoch": 1.5568906146970474, "grad_norm": 26.61184310913086, "learning_rate": 2.4199572450057744e-05, "loss": 6.4259, "step": 42500 }, { "combined_loss": 3.222200870513916, "distill_loss": 1.3243845701217651, "epoch": 1.5568906146970474, "step": 42500, "student_mlm_loss": 5.120017051696777 }, { "epoch": 1.5605538867316286, "grad_norm": 78.89910888671875, "learning_rate": 2.4138142860653122e-05, "loss": 3.3441, "step": 42600 }, { "combined_loss": 1.7442145347595215, "distill_loss": 1.282542109489441, "epoch": 1.5605538867316286, "step": 42600, "student_mlm_loss": 2.2058870792388916 }, { "epoch": 1.56421715876621, "grad_norm": 88.92566680908203, "learning_rate": 2.4076713271248496e-05, "loss": 2.8234, "step": 42700 }, { "combined_loss": 2.366835117340088, "distill_loss": 1.1711124181747437, "epoch": 1.56421715876621, "step": 42700, "student_mlm_loss": 3.5625579357147217 }, { "epoch": 1.5678804308007912, "grad_norm": 6.83758544921875, "learning_rate": 2.4015283681843874e-05, "loss": 5.4491, "step": 42800 }, { "combined_loss": 4.174956798553467, "distill_loss": 1.0669249296188354, "epoch": 1.5678804308007912, "step": 42800, "student_mlm_loss": 7.282988548278809 }, { "epoch": 1.5715437028353727, "grad_norm": 5.723924160003662, "learning_rate": 2.395385409243925e-05, "loss": 3.1108, "step": 42900 }, { "combined_loss": 2.3197238445281982, "distill_loss": 1.2763570547103882, "epoch": 1.5715437028353727, "step": 42900, "student_mlm_loss": 3.3630905151367188 }, { "epoch": 1.5752069748699538, "grad_norm": 14.807353973388672, "learning_rate": 2.3892424503034623e-05, "loss": 6.4113, "step": 43000 }, { "combined_loss": 1.7868092060089111, "distill_loss": 1.1304634809494019, "epoch": 1.5752069748699538, "step": 43000, "student_mlm_loss": 2.44315505027771 }, { "epoch": 1.5788702469045353, "grad_norm": 8.68276596069336, "learning_rate": 2.3830994913629998e-05, "loss": 5.1213, "step": 43100 }, { "combined_loss": 19.46100425720215, "distill_loss": 1.259545087814331, "epoch": 1.5788702469045353, "step": 43100, "student_mlm_loss": 37.6624641418457 }, { "epoch": 1.5825335189391163, "grad_norm": 4.91242790222168, "learning_rate": 2.3769565324225372e-05, "loss": 3.2674, "step": 43200 }, { "combined_loss": 1.797656536102295, "distill_loss": 1.3039189577102661, "epoch": 1.5825335189391163, "step": 43200, "student_mlm_loss": 2.2913942337036133 }, { "epoch": 1.5861967909736978, "grad_norm": 52.68294906616211, "learning_rate": 2.370813573482075e-05, "loss": 3.7711, "step": 43300 }, { "combined_loss": 1.8017528057098389, "distill_loss": 1.1734706163406372, "epoch": 1.5861967909736978, "step": 43300, "student_mlm_loss": 2.43003511428833 }, { "epoch": 1.5898600630082789, "grad_norm": 11.869544982910156, "learning_rate": 2.3646706145416124e-05, "loss": 9.8177, "step": 43400 }, { "combined_loss": 2.760119915008545, "distill_loss": 1.2446471452713013, "epoch": 1.5898600630082789, "step": 43400, "student_mlm_loss": 4.275592803955078 }, { "epoch": 1.5935233350428604, "grad_norm": 3.7819387912750244, "learning_rate": 2.3585276556011502e-05, "loss": 4.6552, "step": 43500 }, { "combined_loss": 4.660012245178223, "distill_loss": 1.1187530755996704, "epoch": 1.5935233350428604, "step": 43500, "student_mlm_loss": 8.201271057128906 }, { "epoch": 1.5971866070774414, "grad_norm": 21.269559860229492, "learning_rate": 2.3523846966606877e-05, "loss": 8.5404, "step": 43600 }, { "combined_loss": 2.3045759201049805, "distill_loss": 1.3545589447021484, "epoch": 1.5971866070774414, "step": 43600, "student_mlm_loss": 3.2545931339263916 }, { "epoch": 1.600849879112023, "grad_norm": 8.289508819580078, "learning_rate": 2.3462417377202254e-05, "loss": 2.7135, "step": 43700 }, { "combined_loss": 3.0867691040039062, "distill_loss": 1.1124651432037354, "epoch": 1.600849879112023, "step": 43700, "student_mlm_loss": 5.061073303222656 }, { "epoch": 1.6045131511466042, "grad_norm": 22.303661346435547, "learning_rate": 2.3400987787797625e-05, "loss": 3.6364, "step": 43800 }, { "combined_loss": 1.7930564880371094, "distill_loss": 1.2114512920379639, "epoch": 1.6045131511466042, "step": 43800, "student_mlm_loss": 2.374661684036255 }, { "epoch": 1.6081764231811855, "grad_norm": 4.351790904998779, "learning_rate": 2.3339558198393003e-05, "loss": 5.6887, "step": 43900 }, { "combined_loss": 1.7365663051605225, "distill_loss": 1.2089755535125732, "epoch": 1.6081764231811855, "step": 43900, "student_mlm_loss": 2.2641570568084717 }, { "epoch": 1.6118396952157668, "grad_norm": 13.450850486755371, "learning_rate": 2.3278128608988378e-05, "loss": 3.6702, "step": 44000 }, { "epoch": 1.6118396952157668, "eval_loss": 3.194415330886841, "eval_runtime": 1.9274, "eval_samples_per_second": 3629.828, "eval_steps_per_second": 14.528, "step": 44000 }, { "combined_loss": 1.760496735572815, "distill_loss": 1.1514201164245605, "epoch": 1.6118396952157668, "step": 44000, "student_mlm_loss": 2.3695733547210693 }, { "epoch": 1.615502967250348, "grad_norm": 7.381774425506592, "learning_rate": 2.3216699019583756e-05, "loss": 2.9269, "step": 44100 }, { "combined_loss": 4.663776397705078, "distill_loss": 1.307958722114563, "epoch": 1.615502967250348, "step": 44100, "student_mlm_loss": 8.019594192504883 }, { "epoch": 1.6191662392849293, "grad_norm": 10.999051094055176, "learning_rate": 2.315526943017913e-05, "loss": 3.0334, "step": 44200 }, { "combined_loss": 1.9191560745239258, "distill_loss": 1.3481658697128296, "epoch": 1.6191662392849293, "step": 44200, "student_mlm_loss": 2.4901461601257324 }, { "epoch": 1.6228295113195106, "grad_norm": 6.187446594238281, "learning_rate": 2.3093839840774504e-05, "loss": 30.6923, "step": 44300 }, { "combined_loss": 12.122703552246094, "distill_loss": 1.1659897565841675, "epoch": 1.6228295113195106, "step": 44300, "student_mlm_loss": 23.079418182373047 }, { "epoch": 1.6264927833540919, "grad_norm": 6.142828941345215, "learning_rate": 2.3032410251369882e-05, "loss": 7.4162, "step": 44400 }, { "combined_loss": 1.9456160068511963, "distill_loss": 1.257858157157898, "epoch": 1.6264927833540919, "step": 44400, "student_mlm_loss": 2.633373737335205 }, { "epoch": 1.6301560553886731, "grad_norm": 15.393942832946777, "learning_rate": 2.2970980661965257e-05, "loss": 4.8003, "step": 44500 }, { "combined_loss": 2.7578635215759277, "distill_loss": 1.1640808582305908, "epoch": 1.6301560553886731, "step": 44500, "student_mlm_loss": 4.351646423339844 }, { "epoch": 1.6338193274232544, "grad_norm": 18.73512077331543, "learning_rate": 2.290955107256063e-05, "loss": 5.3592, "step": 44600 }, { "combined_loss": 3.758654832839966, "distill_loss": 1.260606288909912, "epoch": 1.6338193274232544, "step": 44600, "student_mlm_loss": 6.2567033767700195 }, { "epoch": 1.6374825994578357, "grad_norm": 6.1570048332214355, "learning_rate": 2.2848121483156006e-05, "loss": 10.8594, "step": 44700 }, { "combined_loss": 3.205047845840454, "distill_loss": 1.1495074033737183, "epoch": 1.6374825994578357, "step": 44700, "student_mlm_loss": 5.2605881690979 }, { "epoch": 1.641145871492417, "grad_norm": 8.748614311218262, "learning_rate": 2.2786691893751383e-05, "loss": 2.611, "step": 44800 }, { "combined_loss": 2.7548794746398926, "distill_loss": 1.153849482536316, "epoch": 1.641145871492417, "step": 44800, "student_mlm_loss": 4.35590934753418 }, { "epoch": 1.6448091435269983, "grad_norm": 9.594339370727539, "learning_rate": 2.2725262304346758e-05, "loss": 3.621, "step": 44900 }, { "combined_loss": 2.63676381111145, "distill_loss": 1.144437313079834, "epoch": 1.6448091435269983, "step": 44900, "student_mlm_loss": 4.129090309143066 }, { "epoch": 1.6484724155615798, "grad_norm": 8.756010055541992, "learning_rate": 2.2663832714942136e-05, "loss": 5.0762, "step": 45000 }, { "combined_loss": 2.0047507286071777, "distill_loss": 1.203262209892273, "epoch": 1.6484724155615798, "step": 45000, "student_mlm_loss": 2.806239366531372 }, { "epoch": 1.6521356875961608, "grad_norm": 16.163911819458008, "learning_rate": 2.260240312553751e-05, "loss": 3.1675, "step": 45100 }, { "combined_loss": 1.822305679321289, "distill_loss": 1.187317967414856, "epoch": 1.6521356875961608, "step": 45100, "student_mlm_loss": 2.4572935104370117 }, { "epoch": 1.6557989596307423, "grad_norm": 4.047428607940674, "learning_rate": 2.2540973536132888e-05, "loss": 2.6406, "step": 45200 }, { "combined_loss": 2.431349039077759, "distill_loss": 1.2643455266952515, "epoch": 1.6557989596307423, "step": 45200, "student_mlm_loss": 3.5983526706695557 }, { "epoch": 1.6594622316653234, "grad_norm": 28.598485946655273, "learning_rate": 2.247954394672826e-05, "loss": 3.7667, "step": 45300 }, { "combined_loss": 2.274944543838501, "distill_loss": 1.266087293624878, "epoch": 1.6594622316653234, "step": 45300, "student_mlm_loss": 3.283801794052124 }, { "epoch": 1.6631255036999049, "grad_norm": 11.642946243286133, "learning_rate": 2.2418114357323637e-05, "loss": 3.0131, "step": 45400 }, { "combined_loss": 2.064805507659912, "distill_loss": 1.2423893213272095, "epoch": 1.6631255036999049, "step": 45400, "student_mlm_loss": 2.8872218132019043 }, { "epoch": 1.666788775734486, "grad_norm": 7.227854251861572, "learning_rate": 2.235668476791901e-05, "loss": 7.556, "step": 45500 }, { "combined_loss": 1.8626993894577026, "distill_loss": 1.153686761856079, "epoch": 1.666788775734486, "step": 45500, "student_mlm_loss": 2.571712017059326 }, { "epoch": 1.6704520477690674, "grad_norm": 11.972105026245117, "learning_rate": 2.229525517851439e-05, "loss": 3.9606, "step": 45600 }, { "combined_loss": 1.7529842853546143, "distill_loss": 1.2637630701065063, "epoch": 1.6704520477690674, "step": 45600, "student_mlm_loss": 2.2422056198120117 }, { "epoch": 1.6741153198036485, "grad_norm": 4.263253211975098, "learning_rate": 2.2233825589109764e-05, "loss": 3.0922, "step": 45700 }, { "combined_loss": 2.6089985370635986, "distill_loss": 1.2136098146438599, "epoch": 1.6741153198036485, "step": 45700, "student_mlm_loss": 4.004387378692627 }, { "epoch": 1.67777859183823, "grad_norm": 24.4074764251709, "learning_rate": 2.2172395999705138e-05, "loss": 3.2329, "step": 45800 }, { "combined_loss": 1.6919562816619873, "distill_loss": 1.139168381690979, "epoch": 1.67777859183823, "step": 45800, "student_mlm_loss": 2.244744300842285 }, { "epoch": 1.6814418638728112, "grad_norm": 5.1518778800964355, "learning_rate": 2.2110966410300516e-05, "loss": 9.4019, "step": 45900 }, { "combined_loss": 2.1822292804718018, "distill_loss": 1.3423482179641724, "epoch": 1.6814418638728112, "step": 45900, "student_mlm_loss": 3.0221104621887207 }, { "epoch": 1.6851051359073925, "grad_norm": 18.045368194580078, "learning_rate": 2.204953682089589e-05, "loss": 3.3662, "step": 46000 }, { "epoch": 1.6851051359073925, "eval_loss": 3.070533275604248, "eval_runtime": 1.9768, "eval_samples_per_second": 3539.063, "eval_steps_per_second": 14.164, "step": 46000 }, { "combined_loss": 1.8376495838165283, "distill_loss": 1.261283278465271, "epoch": 1.6851051359073925, "step": 46000, "student_mlm_loss": 2.414015769958496 }, { "epoch": 1.6887684079419738, "grad_norm": 5.69982385635376, "learning_rate": 2.1988107231491265e-05, "loss": 3.3451, "step": 46100 }, { "combined_loss": 1.7916219234466553, "distill_loss": 1.2525031566619873, "epoch": 1.6887684079419738, "step": 46100, "student_mlm_loss": 2.3307406902313232 }, { "epoch": 1.692431679976555, "grad_norm": 27.134151458740234, "learning_rate": 2.192667764208664e-05, "loss": 9.1006, "step": 46200 }, { "combined_loss": 59.0687141418457, "distill_loss": 1.1848413944244385, "epoch": 1.692431679976555, "step": 46200, "student_mlm_loss": 116.95258331298828 }, { "epoch": 1.6960949520111364, "grad_norm": 6.624229431152344, "learning_rate": 2.1865248052682017e-05, "loss": 3.0016, "step": 46300 }, { "combined_loss": 2.7997608184814453, "distill_loss": 1.1524275541305542, "epoch": 1.6960949520111364, "step": 46300, "student_mlm_loss": 4.447093963623047 }, { "epoch": 1.6997582240457176, "grad_norm": 5.472049236297607, "learning_rate": 2.180381846327739e-05, "loss": 20.0915, "step": 46400 }, { "combined_loss": 1.7153997421264648, "distill_loss": 1.237658143043518, "epoch": 1.6997582240457176, "step": 46400, "student_mlm_loss": 2.193141460418701 }, { "epoch": 1.703421496080299, "grad_norm": 14.290247917175293, "learning_rate": 2.174238887387277e-05, "loss": 4.5936, "step": 46500 }, { "combined_loss": 1.709627628326416, "distill_loss": 1.2791212797164917, "epoch": 1.703421496080299, "step": 46500, "student_mlm_loss": 2.140133857727051 }, { "epoch": 1.7070847681148802, "grad_norm": 17.962997436523438, "learning_rate": 2.1680959284468144e-05, "loss": 3.3627, "step": 46600 }, { "combined_loss": 7.8201751708984375, "distill_loss": 1.3012824058532715, "epoch": 1.7070847681148802, "step": 46600, "student_mlm_loss": 14.339067459106445 }, { "epoch": 1.7107480401494615, "grad_norm": 6.800339698791504, "learning_rate": 2.161952969506352e-05, "loss": 6.7955, "step": 46700 }, { "combined_loss": 1.809753656387329, "distill_loss": 1.2891262769699097, "epoch": 1.7107480401494615, "step": 46700, "student_mlm_loss": 2.330381155014038 }, { "epoch": 1.7144113121840427, "grad_norm": 12.281099319458008, "learning_rate": 2.1558100105658896e-05, "loss": 10.3436, "step": 46800 }, { "combined_loss": 3.3808600902557373, "distill_loss": 1.2777303457260132, "epoch": 1.7144113121840427, "step": 46800, "student_mlm_loss": 5.483989715576172 }, { "epoch": 1.718074584218624, "grad_norm": 3.3210408687591553, "learning_rate": 2.149667051625427e-05, "loss": 2.8055, "step": 46900 }, { "combined_loss": 2.1092348098754883, "distill_loss": 1.2058593034744263, "epoch": 1.718074584218624, "step": 46900, "student_mlm_loss": 3.0126101970672607 }, { "epoch": 1.7217378562532053, "grad_norm": 11.694738388061523, "learning_rate": 2.1435240926849645e-05, "loss": 4.6311, "step": 47000 }, { "combined_loss": 2.2222890853881836, "distill_loss": 1.218597173690796, "epoch": 1.7217378562532053, "step": 47000, "student_mlm_loss": 3.2259812355041504 }, { "epoch": 1.7254011282877868, "grad_norm": 23.036334991455078, "learning_rate": 2.137381133744502e-05, "loss": 2.5923, "step": 47100 }, { "combined_loss": 1.882810354232788, "distill_loss": 1.2441027164459229, "epoch": 1.7254011282877868, "step": 47100, "student_mlm_loss": 2.5215179920196533 }, { "epoch": 1.7290644003223679, "grad_norm": 65.06354522705078, "learning_rate": 2.1312381748040397e-05, "loss": 3.3375, "step": 47200 }, { "combined_loss": 1.84983229637146, "distill_loss": 1.224557876586914, "epoch": 1.7290644003223679, "step": 47200, "student_mlm_loss": 2.475106716156006 }, { "epoch": 1.7327276723569494, "grad_norm": 9.202945709228516, "learning_rate": 2.1250952158635772e-05, "loss": 3.0094, "step": 47300 }, { "combined_loss": 1.6417255401611328, "distill_loss": 1.2296794652938843, "epoch": 1.7327276723569494, "step": 47300, "student_mlm_loss": 2.053771734237671 }, { "epoch": 1.7363909443915304, "grad_norm": 7.1568193435668945, "learning_rate": 2.118952256923115e-05, "loss": 3.3413, "step": 47400 }, { "combined_loss": 2.165384531021118, "distill_loss": 1.2572156190872192, "epoch": 1.7363909443915304, "step": 47400, "student_mlm_loss": 3.0735535621643066 }, { "epoch": 1.740054216426112, "grad_norm": 39.054439544677734, "learning_rate": 2.1128092979826524e-05, "loss": 4.8522, "step": 47500 }, { "combined_loss": 2.6122236251831055, "distill_loss": 1.1487023830413818, "epoch": 1.740054216426112, "step": 47500, "student_mlm_loss": 4.07574462890625 }, { "epoch": 1.743717488460693, "grad_norm": 3.18758487701416, "learning_rate": 2.1066663390421902e-05, "loss": 4.3993, "step": 47600 }, { "combined_loss": 6.344114303588867, "distill_loss": 1.1341725587844849, "epoch": 1.743717488460693, "step": 47600, "student_mlm_loss": 11.554056167602539 }, { "epoch": 1.7473807604952745, "grad_norm": 9.418896675109863, "learning_rate": 2.1005233801017273e-05, "loss": 8.7279, "step": 47700 }, { "combined_loss": 2.8721518516540527, "distill_loss": 1.2175838947296143, "epoch": 1.7473807604952745, "step": 47700, "student_mlm_loss": 4.526719570159912 }, { "epoch": 1.7510440325298555, "grad_norm": 4.730939865112305, "learning_rate": 2.094380421161265e-05, "loss": 2.74, "step": 47800 }, { "combined_loss": 1.8483730554580688, "distill_loss": 1.2789607048034668, "epoch": 1.7510440325298555, "step": 47800, "student_mlm_loss": 2.417785406112671 }, { "epoch": 1.754707304564437, "grad_norm": 4.566458225250244, "learning_rate": 2.0882374622208025e-05, "loss": 2.63, "step": 47900 }, { "combined_loss": 1.8073049783706665, "distill_loss": 1.3073413372039795, "epoch": 1.754707304564437, "step": 47900, "student_mlm_loss": 2.3072686195373535 }, { "epoch": 1.7583705765990183, "grad_norm": 14.967068672180176, "learning_rate": 2.0820945032803403e-05, "loss": 2.5821, "step": 48000 }, { "epoch": 1.7583705765990183, "eval_loss": 3.2400870323181152, "eval_runtime": 1.8322, "eval_samples_per_second": 3818.29, "eval_steps_per_second": 15.282, "step": 48000 } ], "logging_steps": 100, "max_steps": 81894, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7150683130961408e+16, "train_batch_size": 256, "trial_name": null, "trial_params": null }