diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6945 @@ +{ + "best_metric": 3.070533275604248, + "best_model_checkpoint": "./distilled3/checkpoint-46000", + "epoch": 1.7583705765990183, + "eval_steps": 2000, + "global_step": 48000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "combined_loss": 13.355602264404297, + "distill_loss": 1.4010732173919678, + "epoch": 0, + "step": 0, + "student_mlm_loss": 25.310131072998047 + }, + { + "epoch": 0.003663272034581288, + "grad_norm": 11.128765106201172, + "learning_rate": 1e-05, + "loss": 17.4544, + "step": 100 + }, + { + "combined_loss": 9.379831314086914, + "distill_loss": 1.5227235555648804, + "epoch": 0.003663272034581288, + "step": 100, + "student_mlm_loss": 17.2369384765625 + }, + { + "epoch": 0.007326544069162576, + "grad_norm": 14.151921272277832, + "learning_rate": 2e-05, + "loss": 16.0099, + "step": 200 + }, + { + "combined_loss": 28.136512756347656, + "distill_loss": 1.571045160293579, + "epoch": 0.007326544069162576, + "step": 200, + "student_mlm_loss": 54.70198059082031 + }, + { + "epoch": 0.010989816103743864, + "grad_norm": 11.68195915222168, + "learning_rate": 3e-05, + "loss": 18.8223, + "step": 300 + }, + { + "combined_loss": 15.699158668518066, + "distill_loss": 1.5519400835037231, + "epoch": 0.010989816103743864, + "step": 300, + "student_mlm_loss": 29.846376419067383 + }, + { + "epoch": 0.014653088138325152, + "grad_norm": 8.982569694519043, + "learning_rate": 4e-05, + "loss": 16.9008, + "step": 400 + }, + { + "combined_loss": 3.035900592803955, + "distill_loss": 1.4880340099334717, + "epoch": 0.014653088138325152, + "step": 400, + "student_mlm_loss": 4.583766937255859 + }, + { + "epoch": 0.01831636017290644, + "grad_norm": 7.045658111572266, + "learning_rate": 5e-05, + "loss": 8.812, + "step": 500 + }, + { + "combined_loss": 7.002770900726318, + "distill_loss": 1.351847529411316, + "epoch": 0.01831636017290644, + "step": 500, + "student_mlm_loss": 12.653694152832031 + }, + { + "epoch": 0.021979632207487727, + "grad_norm": 4.265043258666992, + "learning_rate": 4.9938570410595373e-05, + "loss": 16.8853, + "step": 600 + }, + { + "combined_loss": 3.2060928344726562, + "distill_loss": 1.2962806224822998, + "epoch": 0.021979632207487727, + "step": 600, + "student_mlm_loss": 5.115904808044434 + }, + { + "epoch": 0.025642904242069015, + "grad_norm": 7.744924545288086, + "learning_rate": 4.987714082119075e-05, + "loss": 7.1609, + "step": 700 + }, + { + "combined_loss": 2.2816712856292725, + "distill_loss": 1.5105196237564087, + "epoch": 0.025642904242069015, + "step": 700, + "student_mlm_loss": 3.052823066711426 + }, + { + "epoch": 0.029306176276650303, + "grad_norm": 12.44052791595459, + "learning_rate": 4.981571123178613e-05, + "loss": 13.0471, + "step": 800 + }, + { + "combined_loss": 3.225351095199585, + "distill_loss": 1.5753816366195679, + "epoch": 0.029306176276650303, + "step": 800, + "student_mlm_loss": 4.8753204345703125 + }, + { + "epoch": 0.032969448311231594, + "grad_norm": 6.2059645652771, + "learning_rate": 4.975428164238151e-05, + "loss": 6.2833, + "step": 900 + }, + { + "combined_loss": 8.580605506896973, + "distill_loss": 1.530474066734314, + "epoch": 0.032969448311231594, + "step": 900, + "student_mlm_loss": 15.630736351013184 + }, + { + "epoch": 0.03663272034581288, + "grad_norm": 14.731459617614746, + "learning_rate": 4.969285205297688e-05, + "loss": 5.8549, + "step": 1000 + }, + { + "combined_loss": 3.7085845470428467, + "distill_loss": 1.4659323692321777, + "epoch": 0.03663272034581288, + "step": 1000, + "student_mlm_loss": 5.951236724853516 + }, + { + "epoch": 0.04029599238039417, + "grad_norm": 9.745060920715332, + "learning_rate": 4.9631422463572256e-05, + "loss": 5.174, + "step": 1100 + }, + { + "combined_loss": 4.752764701843262, + "distill_loss": 1.4000483751296997, + "epoch": 0.04029599238039417, + "step": 1100, + "student_mlm_loss": 8.105481147766113 + }, + { + "epoch": 0.043959264414975455, + "grad_norm": 13.801424026489258, + "learning_rate": 4.9569992874167634e-05, + "loss": 19.8368, + "step": 1200 + }, + { + "combined_loss": 3.1324005126953125, + "distill_loss": 1.404078483581543, + "epoch": 0.043959264414975455, + "step": 1200, + "student_mlm_loss": 4.860722541809082 + }, + { + "epoch": 0.047622536449556746, + "grad_norm": 52.244632720947266, + "learning_rate": 4.9508563284763005e-05, + "loss": 5.547, + "step": 1300 + }, + { + "combined_loss": 3.1176328659057617, + "distill_loss": 1.3057805299758911, + "epoch": 0.047622536449556746, + "step": 1300, + "student_mlm_loss": 4.929485321044922 + }, + { + "epoch": 0.05128580848413803, + "grad_norm": 47.002349853515625, + "learning_rate": 4.944713369535838e-05, + "loss": 4.7784, + "step": 1400 + }, + { + "combined_loss": 3.871903657913208, + "distill_loss": 1.5537463426589966, + "epoch": 0.05128580848413803, + "step": 1400, + "student_mlm_loss": 6.190061092376709 + }, + { + "epoch": 0.05494908051871932, + "grad_norm": 11.417911529541016, + "learning_rate": 4.9385704105953754e-05, + "loss": 5.9593, + "step": 1500 + }, + { + "combined_loss": 6.293668270111084, + "distill_loss": 1.3082151412963867, + "epoch": 0.05494908051871932, + "step": 1500, + "student_mlm_loss": 11.279121398925781 + }, + { + "epoch": 0.058612352553300606, + "grad_norm": 24.519105911254883, + "learning_rate": 4.932427451654914e-05, + "loss": 7.2762, + "step": 1600 + }, + { + "combined_loss": 3.350501775741577, + "distill_loss": 1.4593900442123413, + "epoch": 0.058612352553300606, + "step": 1600, + "student_mlm_loss": 5.241613388061523 + }, + { + "epoch": 0.0622756245878819, + "grad_norm": 42.58499526977539, + "learning_rate": 4.926284492714451e-05, + "loss": 7.1364, + "step": 1700 + }, + { + "combined_loss": 10.976073265075684, + "distill_loss": 1.594639539718628, + "epoch": 0.0622756245878819, + "step": 1700, + "student_mlm_loss": 20.357507705688477 + }, + { + "epoch": 0.06593889662246319, + "grad_norm": 105.27689361572266, + "learning_rate": 4.920141533773989e-05, + "loss": 5.7662, + "step": 1800 + }, + { + "combined_loss": 4.272126197814941, + "distill_loss": 1.4649100303649902, + "epoch": 0.06593889662246319, + "step": 1800, + "student_mlm_loss": 7.079341888427734 + }, + { + "epoch": 0.06960216865704447, + "grad_norm": 9.272991180419922, + "learning_rate": 4.913998574833526e-05, + "loss": 4.9898, + "step": 1900 + }, + { + "combined_loss": 2.2884514331817627, + "distill_loss": 1.5105092525482178, + "epoch": 0.06960216865704447, + "step": 1900, + "student_mlm_loss": 3.0663936138153076 + }, + { + "epoch": 0.07326544069162576, + "grad_norm": 15.299578666687012, + "learning_rate": 4.9078556158930636e-05, + "loss": 6.8909, + "step": 2000 + }, + { + "epoch": 0.07326544069162576, + "eval_loss": 6.166979789733887, + "eval_runtime": 2.1158, + "eval_samples_per_second": 3306.616, + "eval_steps_per_second": 13.234, + "step": 2000 + }, + { + "combined_loss": 5.612101078033447, + "distill_loss": 1.332657814025879, + "epoch": 0.07326544069162576, + "step": 2000, + "student_mlm_loss": 9.891544342041016 + }, + { + "epoch": 0.07692871272620705, + "grad_norm": 12.242279052734375, + "learning_rate": 4.9017126569526014e-05, + "loss": 8.6608, + "step": 2100 + }, + { + "combined_loss": 2.035828113555908, + "distill_loss": 1.3731106519699097, + "epoch": 0.07692871272620705, + "step": 2100, + "student_mlm_loss": 2.6985456943511963 + }, + { + "epoch": 0.08059198476078834, + "grad_norm": 27.212379455566406, + "learning_rate": 4.8955696980121385e-05, + "loss": 9.4649, + "step": 2200 + }, + { + "combined_loss": 2.5593996047973633, + "distill_loss": 1.5456775426864624, + "epoch": 0.08059198476078834, + "step": 2200, + "student_mlm_loss": 3.5731217861175537 + }, + { + "epoch": 0.08425525679536962, + "grad_norm": 9.444129943847656, + "learning_rate": 4.889426739071676e-05, + "loss": 12.6304, + "step": 2300 + }, + { + "combined_loss": 3.0112435817718506, + "distill_loss": 1.268593192100525, + "epoch": 0.08425525679536962, + "step": 2300, + "student_mlm_loss": 4.753893852233887 + }, + { + "epoch": 0.08791852882995091, + "grad_norm": 6.72172212600708, + "learning_rate": 4.8832837801312134e-05, + "loss": 4.2453, + "step": 2400 + }, + { + "combined_loss": 2.3823843002319336, + "distill_loss": 1.3674836158752441, + "epoch": 0.08791852882995091, + "step": 2400, + "student_mlm_loss": 3.397284984588623 + }, + { + "epoch": 0.0915818008645322, + "grad_norm": 88.5478744506836, + "learning_rate": 4.877140821190752e-05, + "loss": 4.6849, + "step": 2500 + }, + { + "combined_loss": 3.8919034004211426, + "distill_loss": 1.523806095123291, + "epoch": 0.0915818008645322, + "step": 2500, + "student_mlm_loss": 6.260000705718994 + }, + { + "epoch": 0.09524507289911349, + "grad_norm": 11.671692848205566, + "learning_rate": 4.870997862250289e-05, + "loss": 4.8686, + "step": 2600 + }, + { + "combined_loss": 2.8186635971069336, + "distill_loss": 1.313085913658142, + "epoch": 0.09524507289911349, + "step": 2600, + "student_mlm_loss": 4.3242411613464355 + }, + { + "epoch": 0.09890834493369477, + "grad_norm": 7.681136131286621, + "learning_rate": 4.864854903309827e-05, + "loss": 14.7468, + "step": 2700 + }, + { + "combined_loss": 2.6350021362304688, + "distill_loss": 1.5300695896148682, + "epoch": 0.09890834493369477, + "step": 2700, + "student_mlm_loss": 3.7399346828460693 + }, + { + "epoch": 0.10257161696827606, + "grad_norm": 10.245522499084473, + "learning_rate": 4.858711944369364e-05, + "loss": 4.7465, + "step": 2800 + }, + { + "combined_loss": 1.9805179834365845, + "distill_loss": 1.3671844005584717, + "epoch": 0.10257161696827606, + "step": 2800, + "student_mlm_loss": 2.5938515663146973 + }, + { + "epoch": 0.10623488900285735, + "grad_norm": 51.705352783203125, + "learning_rate": 4.8525689854289016e-05, + "loss": 3.8985, + "step": 2900 + }, + { + "combined_loss": 1.9335501194000244, + "distill_loss": 1.3294615745544434, + "epoch": 0.10623488900285735, + "step": 2900, + "student_mlm_loss": 2.5376386642456055 + }, + { + "epoch": 0.10989816103743864, + "grad_norm": 7.661074161529541, + "learning_rate": 4.8464260264884394e-05, + "loss": 3.9846, + "step": 3000 + }, + { + "combined_loss": 2.815329074859619, + "distill_loss": 1.5120948553085327, + "epoch": 0.10989816103743864, + "step": 3000, + "student_mlm_loss": 4.118563175201416 + }, + { + "epoch": 0.11356143307201993, + "grad_norm": 3.9512596130371094, + "learning_rate": 4.8402830675479765e-05, + "loss": 5.6509, + "step": 3100 + }, + { + "combined_loss": 5.329846382141113, + "distill_loss": 1.5839005708694458, + "epoch": 0.11356143307201993, + "step": 3100, + "student_mlm_loss": 9.07579231262207 + }, + { + "epoch": 0.11722470510660121, + "grad_norm": 21.47922134399414, + "learning_rate": 4.834140108607514e-05, + "loss": 4.5437, + "step": 3200 + }, + { + "combined_loss": 3.32517147064209, + "distill_loss": 1.4834882020950317, + "epoch": 0.11722470510660121, + "step": 3200, + "student_mlm_loss": 5.1668548583984375 + }, + { + "epoch": 0.1208879771411825, + "grad_norm": 11.865033149719238, + "learning_rate": 4.827997149667052e-05, + "loss": 5.0218, + "step": 3300 + }, + { + "combined_loss": 2.84318208694458, + "distill_loss": 1.302217960357666, + "epoch": 0.1208879771411825, + "step": 3300, + "student_mlm_loss": 4.384146213531494 + }, + { + "epoch": 0.1245512491757638, + "grad_norm": 13.824487686157227, + "learning_rate": 4.82185419072659e-05, + "loss": 33.2949, + "step": 3400 + }, + { + "combined_loss": 2.065192937850952, + "distill_loss": 1.3474924564361572, + "epoch": 0.1245512491757638, + "step": 3400, + "student_mlm_loss": 2.782893419265747 + }, + { + "epoch": 0.12821452121034507, + "grad_norm": 34.21382522583008, + "learning_rate": 4.815711231786127e-05, + "loss": 12.5775, + "step": 3500 + }, + { + "combined_loss": 2.2148988246917725, + "distill_loss": 1.616875171661377, + "epoch": 0.12821452121034507, + "step": 3500, + "student_mlm_loss": 2.812922477722168 + }, + { + "epoch": 0.13187779324492638, + "grad_norm": 8.859841346740723, + "learning_rate": 4.809568272845665e-05, + "loss": 4.6975, + "step": 3600 + }, + { + "combined_loss": 4.478976726531982, + "distill_loss": 1.3554083108901978, + "epoch": 0.13187779324492638, + "step": 3600, + "student_mlm_loss": 7.602544784545898 + }, + { + "epoch": 0.13554106527950766, + "grad_norm": 12.680179595947266, + "learning_rate": 4.803425313905202e-05, + "loss": 4.5414, + "step": 3700 + }, + { + "combined_loss": 6.908867835998535, + "distill_loss": 1.3570021390914917, + "epoch": 0.13554106527950766, + "step": 3700, + "student_mlm_loss": 12.460733413696289 + }, + { + "epoch": 0.13920433731408893, + "grad_norm": 18.478200912475586, + "learning_rate": 4.7972823549647396e-05, + "loss": 35.1443, + "step": 3800 + }, + { + "combined_loss": 13.97608757019043, + "distill_loss": 1.418832778930664, + "epoch": 0.13920433731408893, + "step": 3800, + "student_mlm_loss": 26.533342361450195 + }, + { + "epoch": 0.14286760934867024, + "grad_norm": 10.53610897064209, + "learning_rate": 4.7911393960242774e-05, + "loss": 13.766, + "step": 3900 + }, + { + "combined_loss": 2.1997413635253906, + "distill_loss": 1.4529953002929688, + "epoch": 0.14286760934867024, + "step": 3900, + "student_mlm_loss": 2.9464874267578125 + }, + { + "epoch": 0.14653088138325152, + "grad_norm": 42.095558166503906, + "learning_rate": 4.7849964370838145e-05, + "loss": 3.297, + "step": 4000 + }, + { + "epoch": 0.14653088138325152, + "eval_loss": 4.568027496337891, + "eval_runtime": 2.0693, + "eval_samples_per_second": 3380.818, + "eval_steps_per_second": 13.531, + "step": 4000 + }, + { + "combined_loss": 2.278163433074951, + "distill_loss": 1.5395259857177734, + "epoch": 0.14653088138325152, + "step": 4000, + "student_mlm_loss": 3.016800880432129 + }, + { + "epoch": 0.15019415341783282, + "grad_norm": 15.655592918395996, + "learning_rate": 4.778853478143352e-05, + "loss": 4.5795, + "step": 4100 + }, + { + "combined_loss": 2.117962598800659, + "distill_loss": 1.5073814392089844, + "epoch": 0.15019415341783282, + "step": 4100, + "student_mlm_loss": 2.728543758392334 + }, + { + "epoch": 0.1538574254524141, + "grad_norm": 9.47999382019043, + "learning_rate": 4.77271051920289e-05, + "loss": 4.6384, + "step": 4200 + }, + { + "combined_loss": 2.2614216804504395, + "distill_loss": 1.3999947309494019, + "epoch": 0.1538574254524141, + "step": 4200, + "student_mlm_loss": 3.1228485107421875 + }, + { + "epoch": 0.15752069748699538, + "grad_norm": 12.137129783630371, + "learning_rate": 4.766567560262428e-05, + "loss": 3.6101, + "step": 4300 + }, + { + "combined_loss": 1.9776763916015625, + "distill_loss": 1.4785245656967163, + "epoch": 0.15752069748699538, + "step": 4300, + "student_mlm_loss": 2.476828098297119 + }, + { + "epoch": 0.16118396952157668, + "grad_norm": 74.8094253540039, + "learning_rate": 4.760424601321965e-05, + "loss": 4.9111, + "step": 4400 + }, + { + "combined_loss": 3.0158274173736572, + "distill_loss": 1.2940564155578613, + "epoch": 0.16118396952157668, + "step": 4400, + "student_mlm_loss": 4.737598419189453 + }, + { + "epoch": 0.16484724155615796, + "grad_norm": 5.339694499969482, + "learning_rate": 4.754281642381502e-05, + "loss": 3.4013, + "step": 4500 + }, + { + "combined_loss": 2.176065683364868, + "distill_loss": 1.5688632726669312, + "epoch": 0.16484724155615796, + "step": 4500, + "student_mlm_loss": 2.7832682132720947 + }, + { + "epoch": 0.16851051359073924, + "grad_norm": 12.745500564575195, + "learning_rate": 4.74813868344104e-05, + "loss": 3.1244, + "step": 4600 + }, + { + "combined_loss": 2.4230682849884033, + "distill_loss": 1.46636962890625, + "epoch": 0.16851051359073924, + "step": 4600, + "student_mlm_loss": 3.3797669410705566 + }, + { + "epoch": 0.17217378562532054, + "grad_norm": 14.515507698059082, + "learning_rate": 4.7419957245005777e-05, + "loss": 4.9862, + "step": 4700 + }, + { + "combined_loss": 6.772428512573242, + "distill_loss": 1.6445391178131104, + "epoch": 0.17217378562532054, + "step": 4700, + "student_mlm_loss": 11.900318145751953 + }, + { + "epoch": 0.17583705765990182, + "grad_norm": 10.036664962768555, + "learning_rate": 4.7358527655601154e-05, + "loss": 3.72, + "step": 4800 + }, + { + "combined_loss": 27.606048583984375, + "distill_loss": 1.4302338361740112, + "epoch": 0.17583705765990182, + "step": 4800, + "student_mlm_loss": 53.781864166259766 + }, + { + "epoch": 0.17950032969448312, + "grad_norm": 14.220582008361816, + "learning_rate": 4.7297098066196525e-05, + "loss": 9.0684, + "step": 4900 + }, + { + "combined_loss": 7.97739839553833, + "distill_loss": 1.4764257669448853, + "epoch": 0.17950032969448312, + "step": 4900, + "student_mlm_loss": 14.478370666503906 + }, + { + "epoch": 0.1831636017290644, + "grad_norm": 8.734748840332031, + "learning_rate": 4.72356684767919e-05, + "loss": 13.2974, + "step": 5000 + }, + { + "combined_loss": 3.3007736206054688, + "distill_loss": 1.5111989974975586, + "epoch": 0.1831636017290644, + "step": 5000, + "student_mlm_loss": 5.090348243713379 + }, + { + "epoch": 0.18682687376364568, + "grad_norm": 23.457653045654297, + "learning_rate": 4.717423888738728e-05, + "loss": 4.4811, + "step": 5100 + }, + { + "combined_loss": 2.695789337158203, + "distill_loss": 1.4495799541473389, + "epoch": 0.18682687376364568, + "step": 5100, + "student_mlm_loss": 3.9419989585876465 + }, + { + "epoch": 0.19049014579822698, + "grad_norm": 11.504470825195312, + "learning_rate": 4.711280929798265e-05, + "loss": 3.2576, + "step": 5200 + }, + { + "combined_loss": 3.5765743255615234, + "distill_loss": 1.3500127792358398, + "epoch": 0.19049014579822698, + "step": 5200, + "student_mlm_loss": 5.803135871887207 + }, + { + "epoch": 0.19415341783280826, + "grad_norm": 34.68207550048828, + "learning_rate": 4.705137970857803e-05, + "loss": 5.8403, + "step": 5300 + }, + { + "combined_loss": 4.304483413696289, + "distill_loss": 1.4075747728347778, + "epoch": 0.19415341783280826, + "step": 5300, + "student_mlm_loss": 7.20139217376709 + }, + { + "epoch": 0.19781668986738954, + "grad_norm": 22.416582107543945, + "learning_rate": 4.69899501191734e-05, + "loss": 4.045, + "step": 5400 + }, + { + "combined_loss": 1.9111289978027344, + "distill_loss": 1.321276307106018, + "epoch": 0.19781668986738954, + "step": 5400, + "student_mlm_loss": 2.500981569290161 + }, + { + "epoch": 0.20147996190197084, + "grad_norm": 27.66775894165039, + "learning_rate": 4.6928520529768786e-05, + "loss": 3.8896, + "step": 5500 + }, + { + "combined_loss": 2.142390251159668, + "distill_loss": 1.4025957584381104, + "epoch": 0.20147996190197084, + "step": 5500, + "student_mlm_loss": 2.8821845054626465 + }, + { + "epoch": 0.20514323393655212, + "grad_norm": 35.84339141845703, + "learning_rate": 4.686709094036416e-05, + "loss": 4.94, + "step": 5600 + }, + { + "combined_loss": 2.1642816066741943, + "distill_loss": 1.392912745475769, + "epoch": 0.20514323393655212, + "step": 5600, + "student_mlm_loss": 2.935650587081909 + }, + { + "epoch": 0.20880650597113343, + "grad_norm": 18.43452262878418, + "learning_rate": 4.6805661350959535e-05, + "loss": 7.4575, + "step": 5700 + }, + { + "combined_loss": 2.354356288909912, + "distill_loss": 1.3411612510681152, + "epoch": 0.20880650597113343, + "step": 5700, + "student_mlm_loss": 3.36755108833313 + }, + { + "epoch": 0.2124697780057147, + "grad_norm": 5.364467144012451, + "learning_rate": 4.6744231761554906e-05, + "loss": 3.2172, + "step": 5800 + }, + { + "combined_loss": 2.129748821258545, + "distill_loss": 1.4555408954620361, + "epoch": 0.2124697780057147, + "step": 5800, + "student_mlm_loss": 2.8039567470550537 + }, + { + "epoch": 0.21613305004029598, + "grad_norm": 12.704414367675781, + "learning_rate": 4.6682802172150283e-05, + "loss": 9.9214, + "step": 5900 + }, + { + "combined_loss": 5.396609783172607, + "distill_loss": 1.3954136371612549, + "epoch": 0.21613305004029598, + "step": 5900, + "student_mlm_loss": 9.397806167602539 + }, + { + "epoch": 0.2197963220748773, + "grad_norm": 9.411243438720703, + "learning_rate": 4.662137258274566e-05, + "loss": 4.6268, + "step": 6000 + }, + { + "epoch": 0.2197963220748773, + "eval_loss": 4.474331855773926, + "eval_runtime": 2.0765, + "eval_samples_per_second": 3369.116, + "eval_steps_per_second": 13.484, + "step": 6000 + }, + { + "combined_loss": 2.3863794803619385, + "distill_loss": 1.4665789604187012, + "epoch": 0.2197963220748773, + "step": 6000, + "student_mlm_loss": 3.306180000305176 + }, + { + "epoch": 0.22345959410945856, + "grad_norm": 15.34604263305664, + "learning_rate": 4.655994299334103e-05, + "loss": 3.586, + "step": 6100 + }, + { + "combined_loss": 2.5740702152252197, + "distill_loss": 1.5186127424240112, + "epoch": 0.22345959410945856, + "step": 6100, + "student_mlm_loss": 3.6295275688171387 + }, + { + "epoch": 0.22712286614403987, + "grad_norm": 10.821826934814453, + "learning_rate": 4.649851340393641e-05, + "loss": 5.516, + "step": 6200 + }, + { + "combined_loss": 4.770940780639648, + "distill_loss": 1.5328683853149414, + "epoch": 0.22712286614403987, + "step": 6200, + "student_mlm_loss": 8.009013175964355 + }, + { + "epoch": 0.23078613817862115, + "grad_norm": 45.33203887939453, + "learning_rate": 4.643708381453178e-05, + "loss": 6.4937, + "step": 6300 + }, + { + "combined_loss": 2.257235050201416, + "distill_loss": 1.4594223499298096, + "epoch": 0.23078613817862115, + "step": 6300, + "student_mlm_loss": 3.0550475120544434 + }, + { + "epoch": 0.23444941021320242, + "grad_norm": 24.137001037597656, + "learning_rate": 4.6375654225127166e-05, + "loss": 2.8761, + "step": 6400 + }, + { + "combined_loss": 3.673408031463623, + "distill_loss": 1.5113860368728638, + "epoch": 0.23444941021320242, + "step": 6400, + "student_mlm_loss": 5.835430145263672 + }, + { + "epoch": 0.23811268224778373, + "grad_norm": 89.53437042236328, + "learning_rate": 4.631422463572254e-05, + "loss": 4.9469, + "step": 6500 + }, + { + "combined_loss": 2.289175271987915, + "distill_loss": 1.6255369186401367, + "epoch": 0.23811268224778373, + "step": 6500, + "student_mlm_loss": 2.9528136253356934 + }, + { + "epoch": 0.241775954282365, + "grad_norm": 29.47341537475586, + "learning_rate": 4.6252795046317915e-05, + "loss": 3.2857, + "step": 6600 + }, + { + "combined_loss": 2.986036777496338, + "distill_loss": 1.3628634214401245, + "epoch": 0.241775954282365, + "step": 6600, + "student_mlm_loss": 4.609210014343262 + }, + { + "epoch": 0.24543922631694629, + "grad_norm": 8.413643836975098, + "learning_rate": 4.6191365456913286e-05, + "loss": 4.1874, + "step": 6700 + }, + { + "combined_loss": 4.9381103515625, + "distill_loss": 1.5604116916656494, + "epoch": 0.24543922631694629, + "step": 6700, + "student_mlm_loss": 8.31580924987793 + }, + { + "epoch": 0.2491024983515276, + "grad_norm": 19.279678344726562, + "learning_rate": 4.6129935867508664e-05, + "loss": 5.5581, + "step": 6800 + }, + { + "combined_loss": 4.7175493240356445, + "distill_loss": 1.5657355785369873, + "epoch": 0.2491024983515276, + "step": 6800, + "student_mlm_loss": 7.869362831115723 + }, + { + "epoch": 0.25276577038610887, + "grad_norm": 14.9283447265625, + "learning_rate": 4.606850627810404e-05, + "loss": 4.6319, + "step": 6900 + }, + { + "combined_loss": 5.707411766052246, + "distill_loss": 1.566019058227539, + "epoch": 0.25276577038610887, + "step": 6900, + "student_mlm_loss": 9.848804473876953 + }, + { + "epoch": 0.25642904242069015, + "grad_norm": 5.006555557250977, + "learning_rate": 4.600707668869941e-05, + "loss": 6.1192, + "step": 7000 + }, + { + "combined_loss": 4.373297691345215, + "distill_loss": 1.4654217958450317, + "epoch": 0.25642904242069015, + "step": 7000, + "student_mlm_loss": 7.281173229217529 + }, + { + "epoch": 0.2600923144552714, + "grad_norm": 15.025683403015137, + "learning_rate": 4.594564709929479e-05, + "loss": 3.472, + "step": 7100 + }, + { + "combined_loss": 5.1388630867004395, + "distill_loss": 1.5254905223846436, + "epoch": 0.2600923144552714, + "step": 7100, + "student_mlm_loss": 8.752235412597656 + }, + { + "epoch": 0.26375558648985276, + "grad_norm": 44.157169342041016, + "learning_rate": 4.588421750989017e-05, + "loss": 8.8482, + "step": 7200 + }, + { + "combined_loss": 2.1565892696380615, + "distill_loss": 1.2985585927963257, + "epoch": 0.26375558648985276, + "step": 7200, + "student_mlm_loss": 3.014619827270508 + }, + { + "epoch": 0.26741885852443403, + "grad_norm": 5.755523204803467, + "learning_rate": 4.5822787920485546e-05, + "loss": 5.7829, + "step": 7300 + }, + { + "combined_loss": 2.5404441356658936, + "distill_loss": 1.5058717727661133, + "epoch": 0.26741885852443403, + "step": 7300, + "student_mlm_loss": 3.575016498565674 + }, + { + "epoch": 0.2710821305590153, + "grad_norm": 15.252013206481934, + "learning_rate": 4.576135833108092e-05, + "loss": 7.9361, + "step": 7400 + }, + { + "combined_loss": 2.5752511024475098, + "distill_loss": 1.5916697978973389, + "epoch": 0.2710821305590153, + "step": 7400, + "student_mlm_loss": 3.5588326454162598 + }, + { + "epoch": 0.2747454025935966, + "grad_norm": 26.218740463256836, + "learning_rate": 4.5699928741676295e-05, + "loss": 4.8534, + "step": 7500 + }, + { + "combined_loss": 2.1656486988067627, + "distill_loss": 1.4179739952087402, + "epoch": 0.2747454025935966, + "step": 7500, + "student_mlm_loss": 2.913323402404785 + }, + { + "epoch": 0.27840867462817787, + "grad_norm": 6.031148910522461, + "learning_rate": 4.5638499152271666e-05, + "loss": 6.4535, + "step": 7600 + }, + { + "combined_loss": 2.8603813648223877, + "distill_loss": 1.5837383270263672, + "epoch": 0.27840867462817787, + "step": 7600, + "student_mlm_loss": 4.137024402618408 + }, + { + "epoch": 0.2820719466627592, + "grad_norm": 107.95591735839844, + "learning_rate": 4.5577069562867044e-05, + "loss": 3.2702, + "step": 7700 + }, + { + "combined_loss": 1.8474111557006836, + "distill_loss": 1.437280297279358, + "epoch": 0.2820719466627592, + "step": 7700, + "student_mlm_loss": 2.257542133331299 + }, + { + "epoch": 0.2857352186973405, + "grad_norm": 5.394913673400879, + "learning_rate": 4.551563997346242e-05, + "loss": 2.8998, + "step": 7800 + }, + { + "combined_loss": 4.77987813949585, + "distill_loss": 1.5358555316925049, + "epoch": 0.2857352186973405, + "step": 7800, + "student_mlm_loss": 8.023900985717773 + }, + { + "epoch": 0.28939849073192175, + "grad_norm": 7.790286540985107, + "learning_rate": 4.545421038405779e-05, + "loss": 2.9018, + "step": 7900 + }, + { + "combined_loss": 3.34071946144104, + "distill_loss": 1.3893283605575562, + "epoch": 0.28939849073192175, + "step": 7900, + "student_mlm_loss": 5.292110443115234 + }, + { + "epoch": 0.29306176276650303, + "grad_norm": 10.3685941696167, + "learning_rate": 4.539278079465317e-05, + "loss": 3.5884, + "step": 8000 + }, + { + "epoch": 0.29306176276650303, + "eval_loss": 3.7581117153167725, + "eval_runtime": 2.0302, + "eval_samples_per_second": 3446.049, + "eval_steps_per_second": 13.792, + "step": 8000 + }, + { + "combined_loss": 2.8955559730529785, + "distill_loss": 1.3627426624298096, + "epoch": 0.29306176276650303, + "step": 8000, + "student_mlm_loss": 4.428369522094727 + }, + { + "epoch": 0.2967250348010843, + "grad_norm": 49.06619644165039, + "learning_rate": 4.533135120524855e-05, + "loss": 3.5788, + "step": 8100 + }, + { + "combined_loss": 4.52724552154541, + "distill_loss": 1.3924285173416138, + "epoch": 0.2967250348010843, + "step": 8100, + "student_mlm_loss": 7.662062644958496 + }, + { + "epoch": 0.30038830683566564, + "grad_norm": 27.40319061279297, + "learning_rate": 4.5269921615843926e-05, + "loss": 3.9229, + "step": 8200 + }, + { + "combined_loss": 3.3075461387634277, + "distill_loss": 1.5311795473098755, + "epoch": 0.30038830683566564, + "step": 8200, + "student_mlm_loss": 5.0839128494262695 + }, + { + "epoch": 0.3040515788702469, + "grad_norm": 31.07562255859375, + "learning_rate": 4.52084920264393e-05, + "loss": 3.9566, + "step": 8300 + }, + { + "combined_loss": 1.9784274101257324, + "distill_loss": 1.41036057472229, + "epoch": 0.3040515788702469, + "step": 8300, + "student_mlm_loss": 2.546494245529175 + }, + { + "epoch": 0.3077148509048282, + "grad_norm": 4.548298358917236, + "learning_rate": 4.514706243703467e-05, + "loss": 5.1591, + "step": 8400 + }, + { + "combined_loss": 1.9796760082244873, + "distill_loss": 1.408158302307129, + "epoch": 0.3077148509048282, + "step": 8400, + "student_mlm_loss": 2.5511937141418457 + }, + { + "epoch": 0.3113781229394095, + "grad_norm": 8.897561073303223, + "learning_rate": 4.5085632847630046e-05, + "loss": 5.7057, + "step": 8500 + }, + { + "combined_loss": 2.080671548843384, + "distill_loss": 1.4321857690811157, + "epoch": 0.3113781229394095, + "step": 8500, + "student_mlm_loss": 2.7291574478149414 + }, + { + "epoch": 0.31504139497399075, + "grad_norm": 10.005053520202637, + "learning_rate": 4.5024203258225424e-05, + "loss": 7.7928, + "step": 8600 + }, + { + "combined_loss": 2.6395342350006104, + "distill_loss": 1.5675503015518188, + "epoch": 0.31504139497399075, + "step": 8600, + "student_mlm_loss": 3.7115182876586914 + }, + { + "epoch": 0.31870466700857203, + "grad_norm": 5.425146579742432, + "learning_rate": 4.49627736688208e-05, + "loss": 3.7716, + "step": 8700 + }, + { + "combined_loss": 2.9848690032958984, + "distill_loss": 1.592170000076294, + "epoch": 0.31870466700857203, + "step": 8700, + "student_mlm_loss": 4.377568244934082 + }, + { + "epoch": 0.32236793904315336, + "grad_norm": 5.64302396774292, + "learning_rate": 4.490134407941617e-05, + "loss": 6.8888, + "step": 8800 + }, + { + "combined_loss": 4.167844772338867, + "distill_loss": 1.4308810234069824, + "epoch": 0.32236793904315336, + "step": 8800, + "student_mlm_loss": 6.904808044433594 + }, + { + "epoch": 0.32603121107773464, + "grad_norm": 99.88166809082031, + "learning_rate": 4.483991449001155e-05, + "loss": 3.988, + "step": 8900 + }, + { + "combined_loss": 2.484290599822998, + "distill_loss": 1.3509743213653564, + "epoch": 0.32603121107773464, + "step": 8900, + "student_mlm_loss": 3.6176071166992188 + }, + { + "epoch": 0.3296944831123159, + "grad_norm": 74.52608489990234, + "learning_rate": 4.477848490060693e-05, + "loss": 7.0959, + "step": 9000 + }, + { + "combined_loss": 3.0457074642181396, + "distill_loss": 1.3116565942764282, + "epoch": 0.3296944831123159, + "step": 9000, + "student_mlm_loss": 4.779758453369141 + }, + { + "epoch": 0.3333577551468972, + "grad_norm": 11.735849380493164, + "learning_rate": 4.47170553112023e-05, + "loss": 3.3274, + "step": 9100 + }, + { + "combined_loss": 4.452191352844238, + "distill_loss": 1.3943032026290894, + "epoch": 0.3333577551468972, + "step": 9100, + "student_mlm_loss": 7.510079860687256 + }, + { + "epoch": 0.33702102718147847, + "grad_norm": 9.601778030395508, + "learning_rate": 4.465562572179768e-05, + "loss": 3.8928, + "step": 9200 + }, + { + "combined_loss": 4.875356197357178, + "distill_loss": 1.4536867141723633, + "epoch": 0.33702102718147847, + "step": 9200, + "student_mlm_loss": 8.297025680541992 + }, + { + "epoch": 0.3406842992160598, + "grad_norm": 9.49219799041748, + "learning_rate": 4.459419613239305e-05, + "loss": 3.7362, + "step": 9300 + }, + { + "combined_loss": 2.9027719497680664, + "distill_loss": 1.3480241298675537, + "epoch": 0.3406842992160598, + "step": 9300, + "student_mlm_loss": 4.45751953125 + }, + { + "epoch": 0.3443475712506411, + "grad_norm": 7.6804728507995605, + "learning_rate": 4.453276654298843e-05, + "loss": 4.4018, + "step": 9400 + }, + { + "combined_loss": 2.7022647857666016, + "distill_loss": 1.3614214658737183, + "epoch": 0.3443475712506411, + "step": 9400, + "student_mlm_loss": 4.043107986450195 + }, + { + "epoch": 0.34801084328522236, + "grad_norm": 38.41388702392578, + "learning_rate": 4.4471336953583804e-05, + "loss": 3.0632, + "step": 9500 + }, + { + "combined_loss": 1.9494025707244873, + "distill_loss": 1.3876396417617798, + "epoch": 0.34801084328522236, + "step": 9500, + "student_mlm_loss": 2.5111656188964844 + }, + { + "epoch": 0.35167411531980364, + "grad_norm": 37.10932540893555, + "learning_rate": 4.440990736417918e-05, + "loss": 3.3258, + "step": 9600 + }, + { + "combined_loss": 2.6435036659240723, + "distill_loss": 1.3941702842712402, + "epoch": 0.35167411531980364, + "step": 9600, + "student_mlm_loss": 3.8928370475769043 + }, + { + "epoch": 0.3553373873543849, + "grad_norm": 17.652099609375, + "learning_rate": 4.434847777477455e-05, + "loss": 8.3854, + "step": 9700 + }, + { + "combined_loss": 2.336359977722168, + "distill_loss": 1.5497583150863647, + "epoch": 0.3553373873543849, + "step": 9700, + "student_mlm_loss": 3.1229615211486816 + }, + { + "epoch": 0.35900065938896625, + "grad_norm": 58.41902160644531, + "learning_rate": 4.428704818536993e-05, + "loss": 6.9624, + "step": 9800 + }, + { + "combined_loss": 2.6561923027038574, + "distill_loss": 1.5154696702957153, + "epoch": 0.35900065938896625, + "step": 9800, + "student_mlm_loss": 3.796915054321289 + }, + { + "epoch": 0.3626639314235475, + "grad_norm": 23.230680465698242, + "learning_rate": 4.422561859596531e-05, + "loss": 3.4226, + "step": 9900 + }, + { + "combined_loss": 1.9643871784210205, + "distill_loss": 1.3770619630813599, + "epoch": 0.3626639314235475, + "step": 9900, + "student_mlm_loss": 2.5517125129699707 + }, + { + "epoch": 0.3663272034581288, + "grad_norm": 11.580951690673828, + "learning_rate": 4.416418900656068e-05, + "loss": 4.7414, + "step": 10000 + }, + { + "epoch": 0.3663272034581288, + "eval_loss": 3.8432743549346924, + "eval_runtime": 2.2879, + "eval_samples_per_second": 3057.772, + "eval_steps_per_second": 12.238, + "step": 10000 + }, + { + "combined_loss": 2.395519971847534, + "distill_loss": 1.382614254951477, + "epoch": 0.3663272034581288, + "step": 10000, + "student_mlm_loss": 3.408425807952881 + }, + { + "epoch": 0.3699904754927101, + "grad_norm": 19.014955520629883, + "learning_rate": 4.410275941715606e-05, + "loss": 6.6365, + "step": 10100 + }, + { + "combined_loss": 2.1697921752929688, + "distill_loss": 1.5128508806228638, + "epoch": 0.3699904754927101, + "step": 10100, + "student_mlm_loss": 2.8267335891723633 + }, + { + "epoch": 0.37365374752729136, + "grad_norm": 6.532296180725098, + "learning_rate": 4.404132982775143e-05, + "loss": 3.199, + "step": 10200 + }, + { + "combined_loss": 1.8516874313354492, + "distill_loss": 1.413927674293518, + "epoch": 0.37365374752729136, + "step": 10200, + "student_mlm_loss": 2.289447069168091 + }, + { + "epoch": 0.3773170195618727, + "grad_norm": 25.607181549072266, + "learning_rate": 4.397990023834681e-05, + "loss": 3.822, + "step": 10300 + }, + { + "combined_loss": 3.3827946186065674, + "distill_loss": 1.4635933637619019, + "epoch": 0.3773170195618727, + "step": 10300, + "student_mlm_loss": 5.301995754241943 + }, + { + "epoch": 0.38098029159645397, + "grad_norm": 12.52314567565918, + "learning_rate": 4.3918470648942184e-05, + "loss": 6.9491, + "step": 10400 + }, + { + "combined_loss": 1.9748457670211792, + "distill_loss": 1.445707082748413, + "epoch": 0.38098029159645397, + "step": 10400, + "student_mlm_loss": 2.5039844512939453 + }, + { + "epoch": 0.38464356363103525, + "grad_norm": 12.69713306427002, + "learning_rate": 4.385704105953756e-05, + "loss": 9.4794, + "step": 10500 + }, + { + "combined_loss": 3.5582261085510254, + "distill_loss": 1.4324952363967896, + "epoch": 0.38464356363103525, + "step": 10500, + "student_mlm_loss": 5.683957099914551 + }, + { + "epoch": 0.3883068356656165, + "grad_norm": 9.131495475769043, + "learning_rate": 4.379561147013293e-05, + "loss": 7.1932, + "step": 10600 + }, + { + "combined_loss": 6.080216407775879, + "distill_loss": 1.477283000946045, + "epoch": 0.3883068356656165, + "step": 10600, + "student_mlm_loss": 10.683149337768555 + }, + { + "epoch": 0.3919701077001978, + "grad_norm": 24.739810943603516, + "learning_rate": 4.373418188072831e-05, + "loss": 5.6399, + "step": 10700 + }, + { + "combined_loss": 3.7993698120117188, + "distill_loss": 1.452317476272583, + "epoch": 0.3919701077001978, + "step": 10700, + "student_mlm_loss": 6.146422386169434 + }, + { + "epoch": 0.3956333797347791, + "grad_norm": 42.44218063354492, + "learning_rate": 4.367275229132369e-05, + "loss": 4.2291, + "step": 10800 + }, + { + "combined_loss": 2.037079095840454, + "distill_loss": 1.4349570274353027, + "epoch": 0.3956333797347791, + "step": 10800, + "student_mlm_loss": 2.6392011642456055 + }, + { + "epoch": 0.3992966517693604, + "grad_norm": 231.26116943359375, + "learning_rate": 4.361132270191906e-05, + "loss": 4.6188, + "step": 10900 + }, + { + "combined_loss": 182.1781768798828, + "distill_loss": 1.4427307844161987, + "epoch": 0.3992966517693604, + "step": 10900, + "student_mlm_loss": 362.91363525390625 + }, + { + "epoch": 0.4029599238039417, + "grad_norm": 16.01262092590332, + "learning_rate": 4.354989311251444e-05, + "loss": 4.8535, + "step": 11000 + }, + { + "combined_loss": 3.2922308444976807, + "distill_loss": 1.7308834791183472, + "epoch": 0.4029599238039417, + "step": 11000, + "student_mlm_loss": 4.853578090667725 + }, + { + "epoch": 0.40662319583852297, + "grad_norm": 23.69573974609375, + "learning_rate": 4.3488463523109816e-05, + "loss": 2.8692, + "step": 11100 + }, + { + "combined_loss": 2.1010890007019043, + "distill_loss": 1.3140019178390503, + "epoch": 0.40662319583852297, + "step": 11100, + "student_mlm_loss": 2.888176202774048 + }, + { + "epoch": 0.41028646787310424, + "grad_norm": 9.695125579833984, + "learning_rate": 4.3427033933705193e-05, + "loss": 7.6829, + "step": 11200 + }, + { + "combined_loss": 2.24194598197937, + "distill_loss": 1.560063362121582, + "epoch": 0.41028646787310424, + "step": 11200, + "student_mlm_loss": 2.923828601837158 + }, + { + "epoch": 0.4139497399076855, + "grad_norm": 37.06310272216797, + "learning_rate": 4.3365604344300565e-05, + "loss": 3.5562, + "step": 11300 + }, + { + "combined_loss": 9.297407150268555, + "distill_loss": 1.2328678369522095, + "epoch": 0.4139497399076855, + "step": 11300, + "student_mlm_loss": 17.36194610595703 + }, + { + "epoch": 0.41761301194226685, + "grad_norm": 6.411166667938232, + "learning_rate": 4.330417475489594e-05, + "loss": 4.0543, + "step": 11400 + }, + { + "combined_loss": 2.141500949859619, + "distill_loss": 1.467064380645752, + "epoch": 0.41761301194226685, + "step": 11400, + "student_mlm_loss": 2.8159377574920654 + }, + { + "epoch": 0.42127628397684813, + "grad_norm": 5.802677154541016, + "learning_rate": 4.3242745165491313e-05, + "loss": 14.3215, + "step": 11500 + }, + { + "combined_loss": 6.576130390167236, + "distill_loss": 1.46802818775177, + "epoch": 0.42127628397684813, + "step": 11500, + "student_mlm_loss": 11.684232711791992 + }, + { + "epoch": 0.4249395560114294, + "grad_norm": 15.660844802856445, + "learning_rate": 4.318131557608669e-05, + "loss": 30.5877, + "step": 11600 + }, + { + "combined_loss": 1.9305293560028076, + "distill_loss": 1.405720591545105, + "epoch": 0.4249395560114294, + "step": 11600, + "student_mlm_loss": 2.4553380012512207 + }, + { + "epoch": 0.4286028280460107, + "grad_norm": 3.041947603225708, + "learning_rate": 4.311988598668207e-05, + "loss": 3.7156, + "step": 11700 + }, + { + "combined_loss": 2.78572940826416, + "distill_loss": 1.45219886302948, + "epoch": 0.4286028280460107, + "step": 11700, + "student_mlm_loss": 4.119259834289551 + }, + { + "epoch": 0.43226610008059196, + "grad_norm": 20.6744384765625, + "learning_rate": 4.305845639727744e-05, + "loss": 3.3939, + "step": 11800 + }, + { + "combined_loss": 2.0835349559783936, + "distill_loss": 1.4508671760559082, + "epoch": 0.43226610008059196, + "step": 11800, + "student_mlm_loss": 2.716202735900879 + }, + { + "epoch": 0.4359293721151733, + "grad_norm": 5.804731369018555, + "learning_rate": 4.299702680787282e-05, + "loss": 6.1951, + "step": 11900 + }, + { + "combined_loss": 3.1048030853271484, + "distill_loss": 1.455564260482788, + "epoch": 0.4359293721151733, + "step": 11900, + "student_mlm_loss": 4.75404167175293 + }, + { + "epoch": 0.4395926441497546, + "grad_norm": 33.689720153808594, + "learning_rate": 4.2935597218468196e-05, + "loss": 3.6583, + "step": 12000 + }, + { + "epoch": 0.4395926441497546, + "eval_loss": 3.919630527496338, + "eval_runtime": 2.0425, + "eval_samples_per_second": 3425.261, + "eval_steps_per_second": 13.709, + "step": 12000 + }, + { + "combined_loss": 2.315965175628662, + "distill_loss": 1.3009124994277954, + "epoch": 0.4395926441497546, + "step": 12000, + "student_mlm_loss": 3.3310179710388184 + }, + { + "epoch": 0.44325591618433585, + "grad_norm": 24.73545265197754, + "learning_rate": 4.2874167629063574e-05, + "loss": 2.9828, + "step": 12100 + }, + { + "combined_loss": 5.060952186584473, + "distill_loss": 1.3712559938430786, + "epoch": 0.44325591618433585, + "step": 12100, + "student_mlm_loss": 8.750648498535156 + }, + { + "epoch": 0.44691918821891713, + "grad_norm": 19.548921585083008, + "learning_rate": 4.2812738039658945e-05, + "loss": 3.1716, + "step": 12200 + }, + { + "combined_loss": 2.3697307109832764, + "distill_loss": 1.480096459388733, + "epoch": 0.44691918821891713, + "step": 12200, + "student_mlm_loss": 3.2593650817871094 + }, + { + "epoch": 0.4505824602534984, + "grad_norm": 6.217925548553467, + "learning_rate": 4.2751308450254316e-05, + "loss": 5.1037, + "step": 12300 + }, + { + "combined_loss": 1.9682085514068604, + "distill_loss": 1.3534774780273438, + "epoch": 0.4505824602534984, + "step": 12300, + "student_mlm_loss": 2.582939624786377 + }, + { + "epoch": 0.45424573228807974, + "grad_norm": 53.592735290527344, + "learning_rate": 4.2689878860849694e-05, + "loss": 5.3409, + "step": 12400 + }, + { + "combined_loss": 2.413550853729248, + "distill_loss": 1.3951433897018433, + "epoch": 0.45424573228807974, + "step": 12400, + "student_mlm_loss": 3.4319584369659424 + }, + { + "epoch": 0.457909004322661, + "grad_norm": 13.716507911682129, + "learning_rate": 4.262844927144507e-05, + "loss": 3.2261, + "step": 12500 + }, + { + "combined_loss": 3.6318020820617676, + "distill_loss": 1.3529082536697388, + "epoch": 0.457909004322661, + "step": 12500, + "student_mlm_loss": 5.910696029663086 + }, + { + "epoch": 0.4615722763572423, + "grad_norm": 16.206933975219727, + "learning_rate": 4.256701968204045e-05, + "loss": 3.1534, + "step": 12600 + }, + { + "combined_loss": 15.371432304382324, + "distill_loss": 1.4290032386779785, + "epoch": 0.4615722763572423, + "step": 12600, + "student_mlm_loss": 29.313861846923828 + }, + { + "epoch": 0.4652355483918236, + "grad_norm": 8.626960754394531, + "learning_rate": 4.250559009263582e-05, + "loss": 3.0824, + "step": 12700 + }, + { + "combined_loss": 2.0715112686157227, + "distill_loss": 1.3553932905197144, + "epoch": 0.4652355483918236, + "step": 12700, + "student_mlm_loss": 2.7876293659210205 + }, + { + "epoch": 0.46889882042640485, + "grad_norm": 8.153878211975098, + "learning_rate": 4.24441605032312e-05, + "loss": 3.8805, + "step": 12800 + }, + { + "combined_loss": 2.0972392559051514, + "distill_loss": 1.2276250123977661, + "epoch": 0.46889882042640485, + "step": 12800, + "student_mlm_loss": 2.966853618621826 + }, + { + "epoch": 0.4725620924609861, + "grad_norm": 12.068700790405273, + "learning_rate": 4.2382730913826576e-05, + "loss": 2.8937, + "step": 12900 + }, + { + "combined_loss": 2.9497852325439453, + "distill_loss": 1.314728021621704, + "epoch": 0.4725620924609861, + "step": 12900, + "student_mlm_loss": 4.584842681884766 + }, + { + "epoch": 0.47622536449556746, + "grad_norm": 12.260379791259766, + "learning_rate": 4.232130132442195e-05, + "loss": 5.581, + "step": 13000 + }, + { + "combined_loss": 1.8658246994018555, + "distill_loss": 1.2703187465667725, + "epoch": 0.47622536449556746, + "step": 13000, + "student_mlm_loss": 2.4613306522369385 + }, + { + "epoch": 0.47988863653014874, + "grad_norm": 22.688852310180664, + "learning_rate": 4.2259871735017325e-05, + "loss": 7.0059, + "step": 13100 + }, + { + "combined_loss": 3.673346519470215, + "distill_loss": 1.397099256515503, + "epoch": 0.47988863653014874, + "step": 13100, + "student_mlm_loss": 5.949593544006348 + }, + { + "epoch": 0.48355190856473, + "grad_norm": 28.811817169189453, + "learning_rate": 4.2198442145612696e-05, + "loss": 9.6395, + "step": 13200 + }, + { + "combined_loss": 2.036362409591675, + "distill_loss": 1.3239866495132446, + "epoch": 0.48355190856473, + "step": 13200, + "student_mlm_loss": 2.7487380504608154 + }, + { + "epoch": 0.4872151805993113, + "grad_norm": 6.380947589874268, + "learning_rate": 4.213701255620808e-05, + "loss": 2.7095, + "step": 13300 + }, + { + "combined_loss": 2.2547478675842285, + "distill_loss": 1.4122509956359863, + "epoch": 0.4872151805993113, + "step": 13300, + "student_mlm_loss": 3.09724497795105 + }, + { + "epoch": 0.49087845263389257, + "grad_norm": 83.60982513427734, + "learning_rate": 4.207558296680345e-05, + "loss": 3.2917, + "step": 13400 + }, + { + "combined_loss": 2.009040355682373, + "distill_loss": 1.4236946105957031, + "epoch": 0.49087845263389257, + "step": 13400, + "student_mlm_loss": 2.594385862350464 + }, + { + "epoch": 0.4945417246684739, + "grad_norm": 10.06588077545166, + "learning_rate": 4.201415337739883e-05, + "loss": 12.3205, + "step": 13500 + }, + { + "combined_loss": 2.9317073822021484, + "distill_loss": 1.4229042530059814, + "epoch": 0.4945417246684739, + "step": 13500, + "student_mlm_loss": 4.440510272979736 + }, + { + "epoch": 0.4982049967030552, + "grad_norm": 4.126479625701904, + "learning_rate": 4.19527237879942e-05, + "loss": 3.8077, + "step": 13600 + }, + { + "combined_loss": 1.9033926725387573, + "distill_loss": 1.357490062713623, + "epoch": 0.4982049967030552, + "step": 13600, + "student_mlm_loss": 2.4492952823638916 + }, + { + "epoch": 0.5018682687376365, + "grad_norm": 18.483203887939453, + "learning_rate": 4.189129419858958e-05, + "loss": 11.6361, + "step": 13700 + }, + { + "combined_loss": 3.165005683898926, + "distill_loss": 1.3812006711959839, + "epoch": 0.5018682687376365, + "step": 13700, + "student_mlm_loss": 4.948810577392578 + }, + { + "epoch": 0.5055315407722177, + "grad_norm": 7.388655662536621, + "learning_rate": 4.1829864609184956e-05, + "loss": 3.875, + "step": 13800 + }, + { + "combined_loss": 1.8155145645141602, + "distill_loss": 1.3641600608825684, + "epoch": 0.5055315407722177, + "step": 13800, + "student_mlm_loss": 2.266869068145752 + }, + { + "epoch": 0.509194812806799, + "grad_norm": 9.352982521057129, + "learning_rate": 4.176843501978033e-05, + "loss": 9.268, + "step": 13900 + }, + { + "combined_loss": 2.3618173599243164, + "distill_loss": 1.3162891864776611, + "epoch": 0.509194812806799, + "step": 13900, + "student_mlm_loss": 3.4073452949523926 + }, + { + "epoch": 0.5128580848413803, + "grad_norm": 8.513871192932129, + "learning_rate": 4.1707005430375705e-05, + "loss": 3.3999, + "step": 14000 + }, + { + "epoch": 0.5128580848413803, + "eval_loss": 3.5987370014190674, + "eval_runtime": 2.2869, + "eval_samples_per_second": 3059.222, + "eval_steps_per_second": 12.244, + "step": 14000 + }, + { + "combined_loss": 2.6841559410095215, + "distill_loss": 1.401199460029602, + "epoch": 0.5128580848413803, + "step": 14000, + "student_mlm_loss": 3.9671125411987305 + }, + { + "epoch": 0.5165213568759616, + "grad_norm": 30.661813735961914, + "learning_rate": 4.1645575840971076e-05, + "loss": 18.3341, + "step": 14100 + }, + { + "combined_loss": 4.752758026123047, + "distill_loss": 1.247560977935791, + "epoch": 0.5165213568759616, + "step": 14100, + "student_mlm_loss": 8.257954597473145 + }, + { + "epoch": 0.5201846289105428, + "grad_norm": 40.303707122802734, + "learning_rate": 4.158414625156646e-05, + "loss": 3.1057, + "step": 14200 + }, + { + "combined_loss": 1.988144874572754, + "distill_loss": 1.2577546834945679, + "epoch": 0.5201846289105428, + "step": 14200, + "student_mlm_loss": 2.7185349464416504 + }, + { + "epoch": 0.5238479009451242, + "grad_norm": 19.77947235107422, + "learning_rate": 4.152271666216183e-05, + "loss": 7.3457, + "step": 14300 + }, + { + "combined_loss": 4.299380779266357, + "distill_loss": 1.2770593166351318, + "epoch": 0.5238479009451242, + "step": 14300, + "student_mlm_loss": 7.321702480316162 + }, + { + "epoch": 0.5275111729797055, + "grad_norm": 7.412100315093994, + "learning_rate": 4.146128707275721e-05, + "loss": 4.8104, + "step": 14400 + }, + { + "combined_loss": 10.650766372680664, + "distill_loss": 1.3233892917633057, + "epoch": 0.5275111729797055, + "step": 14400, + "student_mlm_loss": 19.9781436920166 + }, + { + "epoch": 0.5311744450142868, + "grad_norm": 5.799710750579834, + "learning_rate": 4.139985748335258e-05, + "loss": 3.4765, + "step": 14500 + }, + { + "combined_loss": 2.4540774822235107, + "distill_loss": 1.319036841392517, + "epoch": 0.5311744450142868, + "step": 14500, + "student_mlm_loss": 3.589118003845215 + }, + { + "epoch": 0.5348377170488681, + "grad_norm": 7.147758483886719, + "learning_rate": 4.133842789394796e-05, + "loss": 3.12, + "step": 14600 + }, + { + "combined_loss": 1.8580541610717773, + "distill_loss": 1.3114832639694214, + "epoch": 0.5348377170488681, + "step": 14600, + "student_mlm_loss": 2.4046249389648438 + }, + { + "epoch": 0.5385009890834493, + "grad_norm": 5.120487213134766, + "learning_rate": 4.1276998304543336e-05, + "loss": 6.7029, + "step": 14700 + }, + { + "combined_loss": 1.9685258865356445, + "distill_loss": 1.2455390691757202, + "epoch": 0.5385009890834493, + "step": 14700, + "student_mlm_loss": 2.6915125846862793 + }, + { + "epoch": 0.5421642611180306, + "grad_norm": 6.225675106048584, + "learning_rate": 4.121556871513871e-05, + "loss": 7.1336, + "step": 14800 + }, + { + "combined_loss": 1.8886613845825195, + "distill_loss": 1.2913726568222046, + "epoch": 0.5421642611180306, + "step": 14800, + "student_mlm_loss": 2.485949993133545 + }, + { + "epoch": 0.5458275331526119, + "grad_norm": 11.508244514465332, + "learning_rate": 4.1154139125734085e-05, + "loss": 11.8719, + "step": 14900 + }, + { + "combined_loss": 2.1455585956573486, + "distill_loss": 1.3711117506027222, + "epoch": 0.5458275331526119, + "step": 14900, + "student_mlm_loss": 2.9200053215026855 + }, + { + "epoch": 0.5494908051871932, + "grad_norm": 17.030780792236328, + "learning_rate": 4.109270953632946e-05, + "loss": 3.091, + "step": 15000 + }, + { + "combined_loss": 1.9433504343032837, + "distill_loss": 1.538583517074585, + "epoch": 0.5494908051871932, + "step": 15000, + "student_mlm_loss": 2.3481173515319824 + }, + { + "epoch": 0.5531540772217745, + "grad_norm": 4.692992687225342, + "learning_rate": 4.103127994692484e-05, + "loss": 3.2488, + "step": 15100 + }, + { + "combined_loss": 2.820077657699585, + "distill_loss": 1.2906769514083862, + "epoch": 0.5531540772217745, + "step": 15100, + "student_mlm_loss": 4.349478244781494 + }, + { + "epoch": 0.5568173492563557, + "grad_norm": 49.70892333984375, + "learning_rate": 4.096985035752021e-05, + "loss": 10.6593, + "step": 15200 + }, + { + "combined_loss": 1.857104778289795, + "distill_loss": 1.4106833934783936, + "epoch": 0.5568173492563557, + "step": 15200, + "student_mlm_loss": 2.3035261631011963 + }, + { + "epoch": 0.5604806212909371, + "grad_norm": 7.913967609405518, + "learning_rate": 4.090842076811558e-05, + "loss": 3.3056, + "step": 15300 + }, + { + "combined_loss": 3.2144076824188232, + "distill_loss": 1.3917032480239868, + "epoch": 0.5604806212909371, + "step": 15300, + "student_mlm_loss": 5.037112236022949 + }, + { + "epoch": 0.5641438933255184, + "grad_norm": 10.575057983398438, + "learning_rate": 4.084699117871096e-05, + "loss": 10.0757, + "step": 15400 + }, + { + "combined_loss": 5.352452754974365, + "distill_loss": 1.3542910814285278, + "epoch": 0.5641438933255184, + "step": 15400, + "student_mlm_loss": 9.350614547729492 + }, + { + "epoch": 0.5678071653600997, + "grad_norm": 119.92784118652344, + "learning_rate": 4.078556158930634e-05, + "loss": 3.4463, + "step": 15500 + }, + { + "combined_loss": 1.7753610610961914, + "distill_loss": 1.3875095844268799, + "epoch": 0.5678071653600997, + "step": 15500, + "student_mlm_loss": 2.163212537765503 + }, + { + "epoch": 0.571470437394681, + "grad_norm": 4.203140735626221, + "learning_rate": 4.0724131999901717e-05, + "loss": 4.8205, + "step": 15600 + }, + { + "combined_loss": 1.8941802978515625, + "distill_loss": 1.3584777116775513, + "epoch": 0.571470437394681, + "step": 15600, + "student_mlm_loss": 2.4298830032348633 + }, + { + "epoch": 0.5751337094292622, + "grad_norm": 16.848825454711914, + "learning_rate": 4.066270241049709e-05, + "loss": 7.7339, + "step": 15700 + }, + { + "combined_loss": 1.9499808549880981, + "distill_loss": 1.3122260570526123, + "epoch": 0.5751337094292622, + "step": 15700, + "student_mlm_loss": 2.587735652923584 + }, + { + "epoch": 0.5787969814638435, + "grad_norm": 2.9838955402374268, + "learning_rate": 4.0601272821092465e-05, + "loss": 3.4354, + "step": 15800 + }, + { + "combined_loss": 1.9672229290008545, + "distill_loss": 1.3119910955429077, + "epoch": 0.5787969814638435, + "step": 15800, + "student_mlm_loss": 2.622454881668091 + }, + { + "epoch": 0.5824602534984248, + "grad_norm": 6.6938676834106445, + "learning_rate": 4.053984323168784e-05, + "loss": 5.2244, + "step": 15900 + }, + { + "combined_loss": 2.8469321727752686, + "distill_loss": 1.361178994178772, + "epoch": 0.5824602534984248, + "step": 15900, + "student_mlm_loss": 4.332685470581055 + }, + { + "epoch": 0.5861235255330061, + "grad_norm": 31.440717697143555, + "learning_rate": 4.047841364228322e-05, + "loss": 8.7168, + "step": 16000 + }, + { + "epoch": 0.5861235255330061, + "eval_loss": 3.480536937713623, + "eval_runtime": 2.1572, + "eval_samples_per_second": 3243.154, + "eval_steps_per_second": 12.98, + "step": 16000 + }, + { + "combined_loss": 2.0847339630126953, + "distill_loss": 1.4640412330627441, + "epoch": 0.5861235255330061, + "step": 16000, + "student_mlm_loss": 2.7054266929626465 + }, + { + "epoch": 0.5897867975675873, + "grad_norm": 6.238570690155029, + "learning_rate": 4.041698405287859e-05, + "loss": 3.2375, + "step": 16100 + }, + { + "combined_loss": 2.2635374069213867, + "distill_loss": 1.5188945531845093, + "epoch": 0.5897867975675873, + "step": 16100, + "student_mlm_loss": 3.0081801414489746 + }, + { + "epoch": 0.5934500696021686, + "grad_norm": 11.832098960876465, + "learning_rate": 4.035555446347396e-05, + "loss": 3.3115, + "step": 16200 + }, + { + "combined_loss": 2.2285714149475098, + "distill_loss": 1.4724992513656616, + "epoch": 0.5934500696021686, + "step": 16200, + "student_mlm_loss": 2.9846436977386475 + }, + { + "epoch": 0.5971133416367499, + "grad_norm": 8.876389503479004, + "learning_rate": 4.029412487406934e-05, + "loss": 4.1388, + "step": 16300 + }, + { + "combined_loss": 2.0907256603240967, + "distill_loss": 1.2955131530761719, + "epoch": 0.5971133416367499, + "step": 16300, + "student_mlm_loss": 2.8859381675720215 + }, + { + "epoch": 0.6007766136713313, + "grad_norm": 4.118688106536865, + "learning_rate": 4.023269528466472e-05, + "loss": 5.4036, + "step": 16400 + }, + { + "combined_loss": 5.190587997436523, + "distill_loss": 1.502519965171814, + "epoch": 0.6007766136713313, + "step": 16400, + "student_mlm_loss": 8.878656387329102 + }, + { + "epoch": 0.6044398857059126, + "grad_norm": 17.806203842163086, + "learning_rate": 4.01712656952601e-05, + "loss": 3.4529, + "step": 16500 + }, + { + "combined_loss": 2.0771563053131104, + "distill_loss": 1.5032036304473877, + "epoch": 0.6044398857059126, + "step": 16500, + "student_mlm_loss": 2.651108980178833 + }, + { + "epoch": 0.6081031577404938, + "grad_norm": 11.406692504882812, + "learning_rate": 4.010983610585547e-05, + "loss": 2.9157, + "step": 16600 + }, + { + "combined_loss": 2.0262105464935303, + "distill_loss": 1.406888723373413, + "epoch": 0.6081031577404938, + "step": 16600, + "student_mlm_loss": 2.6455323696136475 + }, + { + "epoch": 0.6117664297750751, + "grad_norm": 9.248611450195312, + "learning_rate": 4.0048406516450846e-05, + "loss": 3.7273, + "step": 16700 + }, + { + "combined_loss": 9.912755966186523, + "distill_loss": 1.3654385805130005, + "epoch": 0.6117664297750751, + "step": 16700, + "student_mlm_loss": 18.460073471069336 + }, + { + "epoch": 0.6154297018096564, + "grad_norm": 7.337488651275635, + "learning_rate": 3.9986976927046223e-05, + "loss": 3.5316, + "step": 16800 + }, + { + "combined_loss": 2.2111759185791016, + "distill_loss": 1.410059928894043, + "epoch": 0.6154297018096564, + "step": 16800, + "student_mlm_loss": 3.012291669845581 + }, + { + "epoch": 0.6190929738442377, + "grad_norm": 3.7927513122558594, + "learning_rate": 3.9925547337641595e-05, + "loss": 2.942, + "step": 16900 + }, + { + "combined_loss": 1.9941096305847168, + "distill_loss": 1.3353883028030396, + "epoch": 0.6190929738442377, + "step": 16900, + "student_mlm_loss": 2.6528310775756836 + }, + { + "epoch": 0.622756245878819, + "grad_norm": 8.092863082885742, + "learning_rate": 3.986411774823697e-05, + "loss": 8.3194, + "step": 17000 + }, + { + "combined_loss": 1.8197941780090332, + "distill_loss": 1.2830308675765991, + "epoch": 0.622756245878819, + "step": 17000, + "student_mlm_loss": 2.356557607650757 + }, + { + "epoch": 0.6264195179134002, + "grad_norm": 21.95607566833496, + "learning_rate": 3.9802688158832343e-05, + "loss": 3.6842, + "step": 17100 + }, + { + "combined_loss": 1.967858076095581, + "distill_loss": 1.3744505643844604, + "epoch": 0.6264195179134002, + "step": 17100, + "student_mlm_loss": 2.561265707015991 + }, + { + "epoch": 0.6300827899479815, + "grad_norm": 17.734630584716797, + "learning_rate": 3.974125856942773e-05, + "loss": 3.4446, + "step": 17200 + }, + { + "combined_loss": 3.56831955909729, + "distill_loss": 1.4127169847488403, + "epoch": 0.6300827899479815, + "step": 17200, + "student_mlm_loss": 5.723922252655029 + }, + { + "epoch": 0.6337460619825628, + "grad_norm": 14.227143287658691, + "learning_rate": 3.96798289800231e-05, + "loss": 4.3058, + "step": 17300 + }, + { + "combined_loss": 6.485238552093506, + "distill_loss": 1.3285768032073975, + "epoch": 0.6337460619825628, + "step": 17300, + "student_mlm_loss": 11.641900062561035 + }, + { + "epoch": 0.6374093340171441, + "grad_norm": 27.379819869995117, + "learning_rate": 3.961839939061848e-05, + "loss": 3.3666, + "step": 17400 + }, + { + "combined_loss": 3.212083339691162, + "distill_loss": 1.3358004093170166, + "epoch": 0.6374093340171441, + "step": 17400, + "student_mlm_loss": 5.088366508483887 + }, + { + "epoch": 0.6410726060517254, + "grad_norm": 6.261890411376953, + "learning_rate": 3.955696980121385e-05, + "loss": 6.3216, + "step": 17500 + }, + { + "combined_loss": 1.8787257671356201, + "distill_loss": 1.3068917989730835, + "epoch": 0.6410726060517254, + "step": 17500, + "student_mlm_loss": 2.4505598545074463 + }, + { + "epoch": 0.6447358780863067, + "grad_norm": 4.643723011016846, + "learning_rate": 3.9495540211809226e-05, + "loss": 6.3659, + "step": 17600 + }, + { + "combined_loss": 1.9111711978912354, + "distill_loss": 1.315952181816101, + "epoch": 0.6447358780863067, + "step": 17600, + "student_mlm_loss": 2.506390333175659 + }, + { + "epoch": 0.648399150120888, + "grad_norm": 209.94358825683594, + "learning_rate": 3.9434110622404604e-05, + "loss": 3.1778, + "step": 17700 + }, + { + "combined_loss": 2.7990779876708984, + "distill_loss": 1.360758662223816, + "epoch": 0.648399150120888, + "step": 17700, + "student_mlm_loss": 4.237397193908691 + }, + { + "epoch": 0.6520624221554693, + "grad_norm": 25.861230850219727, + "learning_rate": 3.9372681032999975e-05, + "loss": 6.5636, + "step": 17800 + }, + { + "combined_loss": 3.8194119930267334, + "distill_loss": 1.45068359375, + "epoch": 0.6520624221554693, + "step": 17800, + "student_mlm_loss": 6.188140392303467 + }, + { + "epoch": 0.6557256941900506, + "grad_norm": 46.81015396118164, + "learning_rate": 3.931125144359535e-05, + "loss": 6.4281, + "step": 17900 + }, + { + "combined_loss": 1.8790740966796875, + "distill_loss": 1.2603598833084106, + "epoch": 0.6557256941900506, + "step": 17900, + "student_mlm_loss": 2.497788429260254 + }, + { + "epoch": 0.6593889662246318, + "grad_norm": 3.634798049926758, + "learning_rate": 3.924982185419073e-05, + "loss": 3.7705, + "step": 18000 + }, + { + "epoch": 0.6593889662246318, + "eval_loss": 3.4686477184295654, + "eval_runtime": 2.0476, + "eval_samples_per_second": 3416.619, + "eval_steps_per_second": 13.674, + "step": 18000 + }, + { + "combined_loss": 1.8001245260238647, + "distill_loss": 1.358407735824585, + "epoch": 0.6593889662246318, + "step": 18000, + "student_mlm_loss": 2.2418413162231445 + }, + { + "epoch": 0.6630522382592131, + "grad_norm": 14.09543514251709, + "learning_rate": 3.918839226478611e-05, + "loss": 7.2198, + "step": 18100 + }, + { + "combined_loss": 2.165346622467041, + "distill_loss": 1.3290469646453857, + "epoch": 0.6630522382592131, + "step": 18100, + "student_mlm_loss": 3.0016462802886963 + }, + { + "epoch": 0.6667155102937944, + "grad_norm": 4.29142951965332, + "learning_rate": 3.912696267538148e-05, + "loss": 4.3053, + "step": 18200 + }, + { + "combined_loss": 1.8569279909133911, + "distill_loss": 1.355130910873413, + "epoch": 0.6667155102937944, + "step": 18200, + "student_mlm_loss": 2.358725070953369 + }, + { + "epoch": 0.6703787823283757, + "grad_norm": 4.424899101257324, + "learning_rate": 3.906553308597686e-05, + "loss": 3.2385, + "step": 18300 + }, + { + "combined_loss": 2.083707094192505, + "distill_loss": 1.307104229927063, + "epoch": 0.6703787823283757, + "step": 18300, + "student_mlm_loss": 2.8603098392486572 + }, + { + "epoch": 0.6740420543629569, + "grad_norm": 8.061409950256348, + "learning_rate": 3.900410349657223e-05, + "loss": 2.9075, + "step": 18400 + }, + { + "combined_loss": 1.9213597774505615, + "distill_loss": 1.434320330619812, + "epoch": 0.6740420543629569, + "step": 18400, + "student_mlm_loss": 2.4083993434906006 + }, + { + "epoch": 0.6777053263975383, + "grad_norm": 55.50898361206055, + "learning_rate": 3.8942673907167606e-05, + "loss": 13.4077, + "step": 18500 + }, + { + "combined_loss": 2.01340389251709, + "distill_loss": 1.3991159200668335, + "epoch": 0.6777053263975383, + "step": 18500, + "student_mlm_loss": 2.6276917457580566 + }, + { + "epoch": 0.6813685984321196, + "grad_norm": 5.348477840423584, + "learning_rate": 3.8881244317762984e-05, + "loss": 6.8559, + "step": 18600 + }, + { + "combined_loss": 2.5955307483673096, + "distill_loss": 1.4375801086425781, + "epoch": 0.6813685984321196, + "step": 18600, + "student_mlm_loss": 3.753481388092041 + }, + { + "epoch": 0.6850318704667009, + "grad_norm": 26.911954879760742, + "learning_rate": 3.8819814728358355e-05, + "loss": 9.8471, + "step": 18700 + }, + { + "combined_loss": 2.3086562156677246, + "distill_loss": 1.4082762002944946, + "epoch": 0.6850318704667009, + "step": 18700, + "student_mlm_loss": 3.209036350250244 + }, + { + "epoch": 0.6886951425012822, + "grad_norm": 8.086039543151855, + "learning_rate": 3.875838513895373e-05, + "loss": 3.841, + "step": 18800 + }, + { + "combined_loss": 4.487699031829834, + "distill_loss": 1.4052667617797852, + "epoch": 0.6886951425012822, + "step": 18800, + "student_mlm_loss": 7.570131301879883 + }, + { + "epoch": 0.6923584145358634, + "grad_norm": 10.749812126159668, + "learning_rate": 3.869695554954911e-05, + "loss": 9.7279, + "step": 18900 + }, + { + "combined_loss": 3.3014779090881348, + "distill_loss": 1.246164083480835, + "epoch": 0.6923584145358634, + "step": 18900, + "student_mlm_loss": 5.3567914962768555 + }, + { + "epoch": 0.6960216865704447, + "grad_norm": 11.313789367675781, + "learning_rate": 3.863552596014449e-05, + "loss": 28.0849, + "step": 19000 + }, + { + "combined_loss": 4.825923919677734, + "distill_loss": 1.377113938331604, + "epoch": 0.6960216865704447, + "step": 19000, + "student_mlm_loss": 8.274733543395996 + }, + { + "epoch": 0.699684958605026, + "grad_norm": 3.8648459911346436, + "learning_rate": 3.857409637073986e-05, + "loss": 5.8981, + "step": 19100 + }, + { + "combined_loss": 3.4921586513519287, + "distill_loss": 1.4171725511550903, + "epoch": 0.699684958605026, + "step": 19100, + "student_mlm_loss": 5.567144870758057 + }, + { + "epoch": 0.7033482306396073, + "grad_norm": 18.98455238342285, + "learning_rate": 3.851266678133523e-05, + "loss": 2.5944, + "step": 19200 + }, + { + "combined_loss": 1.8949182033538818, + "distill_loss": 1.3743678331375122, + "epoch": 0.7033482306396073, + "step": 19200, + "student_mlm_loss": 2.415468692779541 + }, + { + "epoch": 0.7070115026741886, + "grad_norm": 27.53456687927246, + "learning_rate": 3.845123719193061e-05, + "loss": 2.8462, + "step": 19300 + }, + { + "combined_loss": 1.8077284097671509, + "distill_loss": 1.2764451503753662, + "epoch": 0.7070115026741886, + "step": 19300, + "student_mlm_loss": 2.3390116691589355 + }, + { + "epoch": 0.7106747747087698, + "grad_norm": 8.815896987915039, + "learning_rate": 3.8389807602525986e-05, + "loss": 3.403, + "step": 19400 + }, + { + "combined_loss": 2.2496674060821533, + "distill_loss": 1.408218264579773, + "epoch": 0.7106747747087698, + "step": 19400, + "student_mlm_loss": 3.091116428375244 + }, + { + "epoch": 0.7143380467433511, + "grad_norm": 20.02590560913086, + "learning_rate": 3.8328378013121364e-05, + "loss": 3.7767, + "step": 19500 + }, + { + "combined_loss": 2.6540353298187256, + "distill_loss": 1.451707124710083, + "epoch": 0.7143380467433511, + "step": 19500, + "student_mlm_loss": 3.856363534927368 + }, + { + "epoch": 0.7180013187779325, + "grad_norm": 48.139583587646484, + "learning_rate": 3.8266948423716735e-05, + "loss": 3.4148, + "step": 19600 + }, + { + "combined_loss": 3.5710411071777344, + "distill_loss": 1.2874888181686401, + "epoch": 0.7180013187779325, + "step": 19600, + "student_mlm_loss": 5.854593276977539 + }, + { + "epoch": 0.7216645908125138, + "grad_norm": 5.810763835906982, + "learning_rate": 3.820551883431211e-05, + "loss": 11.1815, + "step": 19700 + }, + { + "combined_loss": 2.022658586502075, + "distill_loss": 1.408826231956482, + "epoch": 0.7216645908125138, + "step": 19700, + "student_mlm_loss": 2.636491060256958 + }, + { + "epoch": 0.725327862847095, + "grad_norm": 5.03505277633667, + "learning_rate": 3.814408924490749e-05, + "loss": 3.5792, + "step": 19800 + }, + { + "combined_loss": 2.450950860977173, + "distill_loss": 1.3786026239395142, + "epoch": 0.725327862847095, + "step": 19800, + "student_mlm_loss": 3.523299217224121 + }, + { + "epoch": 0.7289911348816763, + "grad_norm": 44.703548431396484, + "learning_rate": 3.808265965550287e-05, + "loss": 14.0822, + "step": 19900 + }, + { + "combined_loss": 1.8448269367218018, + "distill_loss": 1.3061137199401855, + "epoch": 0.7289911348816763, + "step": 19900, + "student_mlm_loss": 2.383540153503418 + }, + { + "epoch": 0.7326544069162576, + "grad_norm": 73.46593475341797, + "learning_rate": 3.802123006609824e-05, + "loss": 3.5648, + "step": 20000 + }, + { + "epoch": 0.7326544069162576, + "eval_loss": 3.689605474472046, + "eval_runtime": 2.2951, + "eval_samples_per_second": 3048.261, + "eval_steps_per_second": 12.2, + "step": 20000 + }, + { + "combined_loss": 5.831945896148682, + "distill_loss": 1.2505719661712646, + "epoch": 0.7326544069162576, + "step": 20000, + "student_mlm_loss": 10.41331958770752 + }, + { + "epoch": 0.7363176789508389, + "grad_norm": 7.289074897766113, + "learning_rate": 3.795980047669361e-05, + "loss": 5.9452, + "step": 20100 + }, + { + "combined_loss": 14.608942985534668, + "distill_loss": 1.4141182899475098, + "epoch": 0.7363176789508389, + "step": 20100, + "student_mlm_loss": 27.803768157958984 + }, + { + "epoch": 0.7399809509854202, + "grad_norm": 15.717759132385254, + "learning_rate": 3.7898370887288995e-05, + "loss": 5.3196, + "step": 20200 + }, + { + "combined_loss": 2.34932279586792, + "distill_loss": 1.2641239166259766, + "epoch": 0.7399809509854202, + "step": 20200, + "student_mlm_loss": 3.434521436691284 + }, + { + "epoch": 0.7436442230200014, + "grad_norm": 75.113037109375, + "learning_rate": 3.7836941297884366e-05, + "loss": 3.4868, + "step": 20300 + }, + { + "combined_loss": 2.0885400772094727, + "distill_loss": 1.3560060262680054, + "epoch": 0.7436442230200014, + "step": 20300, + "student_mlm_loss": 2.8210740089416504 + }, + { + "epoch": 0.7473074950545827, + "grad_norm": 12.071985244750977, + "learning_rate": 3.7775511708479744e-05, + "loss": 3.1594, + "step": 20400 + }, + { + "combined_loss": 2.104968309402466, + "distill_loss": 1.456742286682129, + "epoch": 0.7473074950545827, + "step": 20400, + "student_mlm_loss": 2.7531943321228027 + }, + { + "epoch": 0.750970767089164, + "grad_norm": 49.17687225341797, + "learning_rate": 3.7714082119075115e-05, + "loss": 5.0772, + "step": 20500 + }, + { + "combined_loss": 1.9532296657562256, + "distill_loss": 1.2734321355819702, + "epoch": 0.750970767089164, + "step": 20500, + "student_mlm_loss": 2.6330270767211914 + }, + { + "epoch": 0.7546340391237454, + "grad_norm": 4.601011753082275, + "learning_rate": 3.765265252967049e-05, + "loss": 8.0874, + "step": 20600 + }, + { + "combined_loss": 1.8828588724136353, + "distill_loss": 1.35260009765625, + "epoch": 0.7546340391237454, + "step": 20600, + "student_mlm_loss": 2.4131176471710205 + }, + { + "epoch": 0.7582973111583267, + "grad_norm": 3.9183883666992188, + "learning_rate": 3.759122294026587e-05, + "loss": 3.1836, + "step": 20700 + }, + { + "combined_loss": 3.261841058731079, + "distill_loss": 1.35749351978302, + "epoch": 0.7582973111583267, + "step": 20700, + "student_mlm_loss": 5.166188716888428 + }, + { + "epoch": 0.7619605831929079, + "grad_norm": 59.35635757446289, + "learning_rate": 3.752979335086124e-05, + "loss": 3.446, + "step": 20800 + }, + { + "combined_loss": 2.0783181190490723, + "distill_loss": 1.3386023044586182, + "epoch": 0.7619605831929079, + "step": 20800, + "student_mlm_loss": 2.8180341720581055 + }, + { + "epoch": 0.7656238552274892, + "grad_norm": 14.875, + "learning_rate": 3.746836376145662e-05, + "loss": 8.5798, + "step": 20900 + }, + { + "combined_loss": 1.926416039466858, + "distill_loss": 1.3077542781829834, + "epoch": 0.7656238552274892, + "step": 20900, + "student_mlm_loss": 2.5450778007507324 + }, + { + "epoch": 0.7692871272620705, + "grad_norm": 23.419870376586914, + "learning_rate": 3.740693417205199e-05, + "loss": 5.2177, + "step": 21000 + }, + { + "combined_loss": 1.7290170192718506, + "distill_loss": 1.2258715629577637, + "epoch": 0.7692871272620705, + "step": 21000, + "student_mlm_loss": 2.2321624755859375 + }, + { + "epoch": 0.7729503992966518, + "grad_norm": 29.292964935302734, + "learning_rate": 3.7345504582647375e-05, + "loss": 13.8021, + "step": 21100 + }, + { + "combined_loss": 1.9402461051940918, + "distill_loss": 1.2749103307724, + "epoch": 0.7729503992966518, + "step": 21100, + "student_mlm_loss": 2.6055819988250732 + }, + { + "epoch": 0.776613671331233, + "grad_norm": 9.03995418548584, + "learning_rate": 3.7284074993242747e-05, + "loss": 6.547, + "step": 21200 + }, + { + "combined_loss": 2.2710204124450684, + "distill_loss": 1.312924861907959, + "epoch": 0.776613671331233, + "step": 21200, + "student_mlm_loss": 3.229116201400757 + }, + { + "epoch": 0.7802769433658143, + "grad_norm": 11.86938190460205, + "learning_rate": 3.7222645403838124e-05, + "loss": 12.9682, + "step": 21300 + }, + { + "combined_loss": 3.114459991455078, + "distill_loss": 1.318755865097046, + "epoch": 0.7802769433658143, + "step": 21300, + "student_mlm_loss": 4.910163879394531 + }, + { + "epoch": 0.7839402154003956, + "grad_norm": 14.11950969696045, + "learning_rate": 3.7161215814433495e-05, + "loss": 3.1257, + "step": 21400 + }, + { + "combined_loss": 3.882293224334717, + "distill_loss": 1.1930829286575317, + "epoch": 0.7839402154003956, + "step": 21400, + "student_mlm_loss": 6.571503639221191 + }, + { + "epoch": 0.7876034874349769, + "grad_norm": 22.7275447845459, + "learning_rate": 3.709978622502887e-05, + "loss": 3.1395, + "step": 21500 + }, + { + "combined_loss": 2.00057315826416, + "distill_loss": 1.3134089708328247, + "epoch": 0.7876034874349769, + "step": 21500, + "student_mlm_loss": 2.687737226486206 + }, + { + "epoch": 0.7912667594695582, + "grad_norm": 56.84143829345703, + "learning_rate": 3.703835663562425e-05, + "loss": 13.1799, + "step": 21600 + }, + { + "combined_loss": 2.094574213027954, + "distill_loss": 1.3792191743850708, + "epoch": 0.7912667594695582, + "step": 21600, + "student_mlm_loss": 2.809929370880127 + }, + { + "epoch": 0.7949300315041395, + "grad_norm": 30.655105590820312, + "learning_rate": 3.697692704621962e-05, + "loss": 4.1563, + "step": 21700 + }, + { + "combined_loss": 2.167109489440918, + "distill_loss": 1.3041900396347046, + "epoch": 0.7949300315041395, + "step": 21700, + "student_mlm_loss": 3.030029058456421 + }, + { + "epoch": 0.7985933035387208, + "grad_norm": 7.400668144226074, + "learning_rate": 3.6915497456815e-05, + "loss": 9.7848, + "step": 21800 + }, + { + "combined_loss": 2.2639806270599365, + "distill_loss": 1.3241550922393799, + "epoch": 0.7985933035387208, + "step": 21800, + "student_mlm_loss": 3.203806161880493 + }, + { + "epoch": 0.8022565755733021, + "grad_norm": 28.212512969970703, + "learning_rate": 3.685406786741038e-05, + "loss": 2.7595, + "step": 21900 + }, + { + "combined_loss": 1.9249264001846313, + "distill_loss": 1.337939739227295, + "epoch": 0.8022565755733021, + "step": 21900, + "student_mlm_loss": 2.5119130611419678 + }, + { + "epoch": 0.8059198476078834, + "grad_norm": 5.998919486999512, + "learning_rate": 3.6792638278005756e-05, + "loss": 5.9041, + "step": 22000 + }, + { + "epoch": 0.8059198476078834, + "eval_loss": 3.310230016708374, + "eval_runtime": 1.9252, + "eval_samples_per_second": 3633.98, + "eval_steps_per_second": 14.544, + "step": 22000 + }, + { + "combined_loss": 2.208944320678711, + "distill_loss": 1.2883169651031494, + "epoch": 0.8059198476078834, + "step": 22000, + "student_mlm_loss": 3.1295716762542725 + }, + { + "epoch": 0.8095831196424647, + "grad_norm": 42.16996383666992, + "learning_rate": 3.673120868860113e-05, + "loss": 10.4166, + "step": 22100 + }, + { + "combined_loss": 2.089421510696411, + "distill_loss": 1.3541114330291748, + "epoch": 0.8095831196424647, + "step": 22100, + "student_mlm_loss": 2.8247315883636475 + }, + { + "epoch": 0.8132463916770459, + "grad_norm": 10.702394485473633, + "learning_rate": 3.6669779099196505e-05, + "loss": 3.5812, + "step": 22200 + }, + { + "combined_loss": 1.8974239826202393, + "distill_loss": 1.3954590559005737, + "epoch": 0.8132463916770459, + "step": 22200, + "student_mlm_loss": 2.3993890285491943 + }, + { + "epoch": 0.8169096637116272, + "grad_norm": 149.82179260253906, + "learning_rate": 3.6608349509791876e-05, + "loss": 3.229, + "step": 22300 + }, + { + "combined_loss": 2.0663747787475586, + "distill_loss": 1.3880882263183594, + "epoch": 0.8169096637116272, + "step": 22300, + "student_mlm_loss": 2.7446610927581787 + }, + { + "epoch": 0.8205729357462085, + "grad_norm": 5.735169410705566, + "learning_rate": 3.6546919920387253e-05, + "loss": 13.0135, + "step": 22400 + }, + { + "combined_loss": 2.3801686763763428, + "distill_loss": 1.2296876907348633, + "epoch": 0.8205729357462085, + "step": 22400, + "student_mlm_loss": 3.5306496620178223 + }, + { + "epoch": 0.8242362077807898, + "grad_norm": 3.9154951572418213, + "learning_rate": 3.648549033098263e-05, + "loss": 3.0256, + "step": 22500 + }, + { + "combined_loss": 2.619138240814209, + "distill_loss": 1.369718313217163, + "epoch": 0.8242362077807898, + "step": 22500, + "student_mlm_loss": 3.868557929992676 + }, + { + "epoch": 0.827899479815371, + "grad_norm": 6.706686019897461, + "learning_rate": 3.6424060741578e-05, + "loss": 6.8373, + "step": 22600 + }, + { + "combined_loss": 3.571559429168701, + "distill_loss": 1.360285758972168, + "epoch": 0.827899479815371, + "step": 22600, + "student_mlm_loss": 5.782833099365234 + }, + { + "epoch": 0.8315627518499524, + "grad_norm": 63.70609664916992, + "learning_rate": 3.636263115217338e-05, + "loss": 3.1874, + "step": 22700 + }, + { + "combined_loss": 6.645792007446289, + "distill_loss": 1.3381716012954712, + "epoch": 0.8315627518499524, + "step": 22700, + "student_mlm_loss": 11.953412055969238 + }, + { + "epoch": 0.8352260238845337, + "grad_norm": 112.02607727050781, + "learning_rate": 3.630120156276876e-05, + "loss": 4.1698, + "step": 22800 + }, + { + "combined_loss": 2.399282455444336, + "distill_loss": 1.2190183401107788, + "epoch": 0.8352260238845337, + "step": 22800, + "student_mlm_loss": 3.5795464515686035 + }, + { + "epoch": 0.838889295919115, + "grad_norm": 319.05230712890625, + "learning_rate": 3.6239771973364136e-05, + "loss": 3.351, + "step": 22900 + }, + { + "combined_loss": 5.626018047332764, + "distill_loss": 1.3532286882400513, + "epoch": 0.838889295919115, + "step": 22900, + "student_mlm_loss": 9.898807525634766 + }, + { + "epoch": 0.8425525679536963, + "grad_norm": 4.46912956237793, + "learning_rate": 3.617834238395951e-05, + "loss": 3.1926, + "step": 23000 + }, + { + "combined_loss": 1.8462562561035156, + "distill_loss": 1.339337944984436, + "epoch": 0.8425525679536963, + "step": 23000, + "student_mlm_loss": 2.3531746864318848 + }, + { + "epoch": 0.8462158399882775, + "grad_norm": 15.756026268005371, + "learning_rate": 3.611691279455488e-05, + "loss": 11.7086, + "step": 23100 + }, + { + "combined_loss": 3.4101529121398926, + "distill_loss": 1.3407546281814575, + "epoch": 0.8462158399882775, + "step": 23100, + "student_mlm_loss": 5.479551315307617 + }, + { + "epoch": 0.8498791120228588, + "grad_norm": 12.350069046020508, + "learning_rate": 3.6055483205150256e-05, + "loss": 3.1203, + "step": 23200 + }, + { + "combined_loss": 2.5675039291381836, + "distill_loss": 1.2296205759048462, + "epoch": 0.8498791120228588, + "step": 23200, + "student_mlm_loss": 3.9053874015808105 + }, + { + "epoch": 0.8535423840574401, + "grad_norm": 11.17212963104248, + "learning_rate": 3.5994053615745634e-05, + "loss": 6.2935, + "step": 23300 + }, + { + "combined_loss": 2.901674270629883, + "distill_loss": 1.318871021270752, + "epoch": 0.8535423840574401, + "step": 23300, + "student_mlm_loss": 4.484477519989014 + }, + { + "epoch": 0.8572056560920214, + "grad_norm": 11.69430160522461, + "learning_rate": 3.593262402634101e-05, + "loss": 6.1123, + "step": 23400 + }, + { + "combined_loss": 1.962475061416626, + "distill_loss": 1.3837331533432007, + "epoch": 0.8572056560920214, + "step": 23400, + "student_mlm_loss": 2.541217088699341 + }, + { + "epoch": 0.8608689281266027, + "grad_norm": 6.221428394317627, + "learning_rate": 3.587119443693638e-05, + "loss": 5.0621, + "step": 23500 + }, + { + "combined_loss": 2.3063066005706787, + "distill_loss": 1.364685297012329, + "epoch": 0.8608689281266027, + "step": 23500, + "student_mlm_loss": 3.2479279041290283 + }, + { + "epoch": 0.8645322001611839, + "grad_norm": 3.200302839279175, + "learning_rate": 3.580976484753176e-05, + "loss": 3.1679, + "step": 23600 + }, + { + "combined_loss": 14.653901100158691, + "distill_loss": 1.3521461486816406, + "epoch": 0.8645322001611839, + "step": 23600, + "student_mlm_loss": 27.955656051635742 + }, + { + "epoch": 0.8681954721957652, + "grad_norm": 18.003841400146484, + "learning_rate": 3.574833525812714e-05, + "loss": 4.2524, + "step": 23700 + }, + { + "combined_loss": 2.05013108253479, + "distill_loss": 1.473749041557312, + "epoch": 0.8681954721957652, + "step": 23700, + "student_mlm_loss": 2.6265130043029785 + }, + { + "epoch": 0.8718587442303466, + "grad_norm": 16.64165687561035, + "learning_rate": 3.5686905668722516e-05, + "loss": 3.4139, + "step": 23800 + }, + { + "combined_loss": 3.8039913177490234, + "distill_loss": 1.3022387027740479, + "epoch": 0.8718587442303466, + "step": 23800, + "student_mlm_loss": 6.305744171142578 + }, + { + "epoch": 0.8755220162649279, + "grad_norm": 6.90595817565918, + "learning_rate": 3.562547607931789e-05, + "loss": 5.4512, + "step": 23900 + }, + { + "combined_loss": 2.0175633430480957, + "distill_loss": 1.2362921237945557, + "epoch": 0.8755220162649279, + "step": 23900, + "student_mlm_loss": 2.7988343238830566 + }, + { + "epoch": 0.8791852882995091, + "grad_norm": 26.792980194091797, + "learning_rate": 3.556404648991326e-05, + "loss": 6.622, + "step": 24000 + }, + { + "epoch": 0.8791852882995091, + "eval_loss": 3.643918991088867, + "eval_runtime": 1.9198, + "eval_samples_per_second": 3644.043, + "eval_steps_per_second": 14.585, + "step": 24000 + }, + { + "combined_loss": 2.1716020107269287, + "distill_loss": 1.3234556913375854, + "epoch": 0.8791852882995091, + "step": 24000, + "student_mlm_loss": 3.0197484493255615 + }, + { + "epoch": 0.8828485603340904, + "grad_norm": 4.8087568283081055, + "learning_rate": 3.550261690050864e-05, + "loss": 4.0542, + "step": 24100 + }, + { + "combined_loss": 13.035262107849121, + "distill_loss": 1.353433609008789, + "epoch": 0.8828485603340904, + "step": 24100, + "student_mlm_loss": 24.717090606689453 + }, + { + "epoch": 0.8865118323686717, + "grad_norm": 10.60560417175293, + "learning_rate": 3.5441187311104014e-05, + "loss": 3.1068, + "step": 24200 + }, + { + "combined_loss": 1.8867456912994385, + "distill_loss": 1.2289210557937622, + "epoch": 0.8865118323686717, + "step": 24200, + "student_mlm_loss": 2.544570207595825 + }, + { + "epoch": 0.890175104403253, + "grad_norm": 11.34473705291748, + "learning_rate": 3.537975772169939e-05, + "loss": 2.9801, + "step": 24300 + }, + { + "combined_loss": 1.7472858428955078, + "distill_loss": 1.229453206062317, + "epoch": 0.890175104403253, + "step": 24300, + "student_mlm_loss": 2.265118360519409 + }, + { + "epoch": 0.8938383764378343, + "grad_norm": 17.742507934570312, + "learning_rate": 3.531832813229476e-05, + "loss": 4.6617, + "step": 24400 + }, + { + "combined_loss": 1.9173786640167236, + "distill_loss": 1.3212807178497314, + "epoch": 0.8938383764378343, + "step": 24400, + "student_mlm_loss": 2.513476610183716 + }, + { + "epoch": 0.8975016484724155, + "grad_norm": 14.223791122436523, + "learning_rate": 3.525689854289014e-05, + "loss": 3.0537, + "step": 24500 + }, + { + "combined_loss": 1.7878549098968506, + "distill_loss": 1.2908958196640015, + "epoch": 0.8975016484724155, + "step": 24500, + "student_mlm_loss": 2.28481388092041 + }, + { + "epoch": 0.9011649205069968, + "grad_norm": 4.241771697998047, + "learning_rate": 3.519546895348552e-05, + "loss": 7.9255, + "step": 24600 + }, + { + "combined_loss": 1.8853719234466553, + "distill_loss": 1.3350555896759033, + "epoch": 0.9011649205069968, + "step": 24600, + "student_mlm_loss": 2.4356882572174072 + }, + { + "epoch": 0.9048281925415781, + "grad_norm": 5.793640613555908, + "learning_rate": 3.513403936408089e-05, + "loss": 2.9971, + "step": 24700 + }, + { + "combined_loss": 9.072087287902832, + "distill_loss": 1.2805593013763428, + "epoch": 0.9048281925415781, + "step": 24700, + "student_mlm_loss": 16.863615036010742 + }, + { + "epoch": 0.9084914645761595, + "grad_norm": 4.500351905822754, + "learning_rate": 3.507260977467627e-05, + "loss": 2.9841, + "step": 24800 + }, + { + "combined_loss": 4.229645252227783, + "distill_loss": 1.231893539428711, + "epoch": 0.9084914645761595, + "step": 24800, + "student_mlm_loss": 7.2273969650268555 + }, + { + "epoch": 0.9121547366107408, + "grad_norm": 24.93678855895996, + "learning_rate": 3.501118018527164e-05, + "loss": 5.2865, + "step": 24900 + }, + { + "combined_loss": 4.519498825073242, + "distill_loss": 1.35053288936615, + "epoch": 0.9121547366107408, + "step": 24900, + "student_mlm_loss": 7.688465118408203 + }, + { + "epoch": 0.915818008645322, + "grad_norm": 9.416017532348633, + "learning_rate": 3.494975059586702e-05, + "loss": 2.9688, + "step": 25000 + }, + { + "combined_loss": 4.33969783782959, + "distill_loss": 1.2811079025268555, + "epoch": 0.915818008645322, + "step": 25000, + "student_mlm_loss": 7.398288249969482 + }, + { + "epoch": 0.9194812806799033, + "grad_norm": 41.79585266113281, + "learning_rate": 3.4888321006462394e-05, + "loss": 12.352, + "step": 25100 + }, + { + "combined_loss": 2.398942232131958, + "distill_loss": 1.3129199743270874, + "epoch": 0.9194812806799033, + "step": 25100, + "student_mlm_loss": 3.484964609146118 + }, + { + "epoch": 0.9231445527144846, + "grad_norm": 27.67843246459961, + "learning_rate": 3.482689141705777e-05, + "loss": 4.6291, + "step": 25200 + }, + { + "combined_loss": 1.8275630474090576, + "distill_loss": 1.1290583610534668, + "epoch": 0.9231445527144846, + "step": 25200, + "student_mlm_loss": 2.5260677337646484 + }, + { + "epoch": 0.9268078247490659, + "grad_norm": 57.03019332885742, + "learning_rate": 3.476546182765314e-05, + "loss": 3.8226, + "step": 25300 + }, + { + "combined_loss": 1.8621808290481567, + "distill_loss": 1.3249785900115967, + "epoch": 0.9268078247490659, + "step": 25300, + "student_mlm_loss": 2.399383068084717 + }, + { + "epoch": 0.9304710967836471, + "grad_norm": 5.4275007247924805, + "learning_rate": 3.470403223824852e-05, + "loss": 3.7803, + "step": 25400 + }, + { + "combined_loss": 5.317490100860596, + "distill_loss": 1.3810964822769165, + "epoch": 0.9304710967836471, + "step": 25400, + "student_mlm_loss": 9.253883361816406 + }, + { + "epoch": 0.9341343688182284, + "grad_norm": 6.36318302154541, + "learning_rate": 3.46426026488439e-05, + "loss": 17.9114, + "step": 25500 + }, + { + "combined_loss": 4.816742897033691, + "distill_loss": 1.274537444114685, + "epoch": 0.9341343688182284, + "step": 25500, + "student_mlm_loss": 8.358948707580566 + }, + { + "epoch": 0.9377976408528097, + "grad_norm": 4.670822620391846, + "learning_rate": 3.458117305943927e-05, + "loss": 3.4352, + "step": 25600 + }, + { + "combined_loss": 1.7166364192962646, + "distill_loss": 1.2876447439193726, + "epoch": 0.9377976408528097, + "step": 25600, + "student_mlm_loss": 2.145627975463867 + }, + { + "epoch": 0.941460912887391, + "grad_norm": 16.301795959472656, + "learning_rate": 3.451974347003465e-05, + "loss": 2.591, + "step": 25700 + }, + { + "combined_loss": 1.8349076509475708, + "distill_loss": 1.3192713260650635, + "epoch": 0.941460912887391, + "step": 25700, + "student_mlm_loss": 2.350543975830078 + }, + { + "epoch": 0.9451241849219723, + "grad_norm": 4.464934349060059, + "learning_rate": 3.4458313880630025e-05, + "loss": 5.3202, + "step": 25800 + }, + { + "combined_loss": 2.022656202316284, + "distill_loss": 1.4582451581954956, + "epoch": 0.9451241849219723, + "step": 25800, + "student_mlm_loss": 2.587067127227783 + }, + { + "epoch": 0.9487874569565536, + "grad_norm": 13.280508041381836, + "learning_rate": 3.43968842912254e-05, + "loss": 3.2685, + "step": 25900 + }, + { + "combined_loss": 1.7409727573394775, + "distill_loss": 1.2449432611465454, + "epoch": 0.9487874569565536, + "step": 25900, + "student_mlm_loss": 2.23700213432312 + }, + { + "epoch": 0.9524507289911349, + "grad_norm": 34.54155349731445, + "learning_rate": 3.4335454701820774e-05, + "loss": 4.4614, + "step": 26000 + }, + { + "epoch": 0.9524507289911349, + "eval_loss": 3.371135950088501, + "eval_runtime": 1.9026, + "eval_samples_per_second": 3677.064, + "eval_steps_per_second": 14.717, + "step": 26000 + }, + { + "combined_loss": 2.1200222969055176, + "distill_loss": 1.4147942066192627, + "epoch": 0.9524507289911349, + "step": 26000, + "student_mlm_loss": 2.8252503871917725 + }, + { + "epoch": 0.9561140010257162, + "grad_norm": 12.063314437866211, + "learning_rate": 3.427402511241615e-05, + "loss": 3.8605, + "step": 26100 + }, + { + "combined_loss": 2.440842866897583, + "distill_loss": 1.4115891456604004, + "epoch": 0.9561140010257162, + "step": 26100, + "student_mlm_loss": 3.4700965881347656 + }, + { + "epoch": 0.9597772730602975, + "grad_norm": 3.154322862625122, + "learning_rate": 3.421259552301152e-05, + "loss": 3.4216, + "step": 26200 + }, + { + "combined_loss": 2.0511860847473145, + "distill_loss": 1.2086646556854248, + "epoch": 0.9597772730602975, + "step": 26200, + "student_mlm_loss": 2.893707752227783 + }, + { + "epoch": 0.9634405450948788, + "grad_norm": 4.469895839691162, + "learning_rate": 3.41511659336069e-05, + "loss": 8.4313, + "step": 26300 + }, + { + "combined_loss": 1.9184556007385254, + "distill_loss": 1.311684489250183, + "epoch": 0.9634405450948788, + "step": 26300, + "student_mlm_loss": 2.525226593017578 + }, + { + "epoch": 0.96710381712946, + "grad_norm": 37.47445297241211, + "learning_rate": 3.408973634420228e-05, + "loss": 3.33, + "step": 26400 + }, + { + "combined_loss": 1.8568530082702637, + "distill_loss": 1.3435510396957397, + "epoch": 0.96710381712946, + "step": 26400, + "student_mlm_loss": 2.370154857635498 + }, + { + "epoch": 0.9707670891640413, + "grad_norm": 5.385250091552734, + "learning_rate": 3.402830675479765e-05, + "loss": 3.0353, + "step": 26500 + }, + { + "combined_loss": 2.078137159347534, + "distill_loss": 1.4688613414764404, + "epoch": 0.9707670891640413, + "step": 26500, + "student_mlm_loss": 2.687412977218628 + }, + { + "epoch": 0.9744303611986226, + "grad_norm": 20.363506317138672, + "learning_rate": 3.396687716539303e-05, + "loss": 5.5902, + "step": 26600 + }, + { + "combined_loss": 2.420652151107788, + "distill_loss": 1.3566147089004517, + "epoch": 0.9744303611986226, + "step": 26600, + "student_mlm_loss": 3.484689474105835 + }, + { + "epoch": 0.9780936332332039, + "grad_norm": 5.678069591522217, + "learning_rate": 3.3905447575988405e-05, + "loss": 3.1063, + "step": 26700 + }, + { + "combined_loss": 2.2643003463745117, + "distill_loss": 1.3446204662322998, + "epoch": 0.9780936332332039, + "step": 26700, + "student_mlm_loss": 3.1839799880981445 + }, + { + "epoch": 0.9817569052677851, + "grad_norm": 8.722668647766113, + "learning_rate": 3.384401798658378e-05, + "loss": 9.3685, + "step": 26800 + }, + { + "combined_loss": 8.34331226348877, + "distill_loss": 1.3864542245864868, + "epoch": 0.9817569052677851, + "step": 26800, + "student_mlm_loss": 15.3001708984375 + }, + { + "epoch": 0.9854201773023665, + "grad_norm": 5.101404190063477, + "learning_rate": 3.3782588397179154e-05, + "loss": 3.1112, + "step": 26900 + }, + { + "combined_loss": 30.241453170776367, + "distill_loss": 1.3818217515945435, + "epoch": 0.9854201773023665, + "step": 26900, + "student_mlm_loss": 59.1010856628418 + }, + { + "epoch": 0.9890834493369478, + "grad_norm": 3.8359858989715576, + "learning_rate": 3.3721158807774525e-05, + "loss": 3.348, + "step": 27000 + }, + { + "combined_loss": 1.8264105319976807, + "distill_loss": 1.2956147193908691, + "epoch": 0.9890834493369478, + "step": 27000, + "student_mlm_loss": 2.357206344604492 + }, + { + "epoch": 0.9927467213715291, + "grad_norm": 33.43736267089844, + "learning_rate": 3.36597292183699e-05, + "loss": 3.5437, + "step": 27100 + }, + { + "combined_loss": 2.331777572631836, + "distill_loss": 1.3274433612823486, + "epoch": 0.9927467213715291, + "step": 27100, + "student_mlm_loss": 3.3361120223999023 + }, + { + "epoch": 0.9964099934061104, + "grad_norm": 2.9736690521240234, + "learning_rate": 3.359829962896528e-05, + "loss": 2.828, + "step": 27200 + }, + { + "combined_loss": 2.0438201427459717, + "distill_loss": 1.334372639656067, + "epoch": 0.9964099934061104, + "step": 27200, + "student_mlm_loss": 2.753267526626587 + }, + { + "epoch": 1.0000732654406916, + "grad_norm": 3.6774871349334717, + "learning_rate": 3.353687003956066e-05, + "loss": 3.168, + "step": 27300 + }, + { + "combined_loss": 3.4676733016967773, + "distill_loss": 1.2681790590286255, + "epoch": 1.0000732654406916, + "step": 27300, + "student_mlm_loss": 5.667167663574219 + }, + { + "epoch": 1.003736537475273, + "grad_norm": 20.265796661376953, + "learning_rate": 3.347544045015603e-05, + "loss": 4.9071, + "step": 27400 + }, + { + "combined_loss": 1.740236520767212, + "distill_loss": 1.1595730781555176, + "epoch": 1.003736537475273, + "step": 27400, + "student_mlm_loss": 2.3208999633789062 + }, + { + "epoch": 1.0073998095098542, + "grad_norm": 14.427675247192383, + "learning_rate": 3.341401086075141e-05, + "loss": 3.1375, + "step": 27500 + }, + { + "combined_loss": 2.0229873657226562, + "distill_loss": 1.3961925506591797, + "epoch": 1.0073998095098542, + "step": 27500, + "student_mlm_loss": 2.6497819423675537 + }, + { + "epoch": 1.0110630815444355, + "grad_norm": 3.032438039779663, + "learning_rate": 3.3352581271346786e-05, + "loss": 2.7581, + "step": 27600 + }, + { + "combined_loss": 1.9314367771148682, + "distill_loss": 1.2618595361709595, + "epoch": 1.0110630815444355, + "step": 27600, + "student_mlm_loss": 2.6010141372680664 + }, + { + "epoch": 1.0147263535790167, + "grad_norm": 6.167496681213379, + "learning_rate": 3.3291151681942163e-05, + "loss": 6.7788, + "step": 27700 + }, + { + "combined_loss": 2.247697353363037, + "distill_loss": 1.4385483264923096, + "epoch": 1.0147263535790167, + "step": 27700, + "student_mlm_loss": 3.0568461418151855 + }, + { + "epoch": 1.018389625613598, + "grad_norm": 4.82693338394165, + "learning_rate": 3.3229722092537534e-05, + "loss": 5.9229, + "step": 27800 + }, + { + "combined_loss": 3.4328160285949707, + "distill_loss": 1.319059133529663, + "epoch": 1.018389625613598, + "step": 27800, + "student_mlm_loss": 5.546572685241699 + }, + { + "epoch": 1.0220528976481793, + "grad_norm": 13.18911361694336, + "learning_rate": 3.3168292503132906e-05, + "loss": 3.5041, + "step": 27900 + }, + { + "combined_loss": 3.720487594604492, + "distill_loss": 1.233067274093628, + "epoch": 1.0220528976481793, + "step": 27900, + "student_mlm_loss": 6.207907676696777 + }, + { + "epoch": 1.0257161696827606, + "grad_norm": 10.725250244140625, + "learning_rate": 3.310686291372829e-05, + "loss": 2.9279, + "step": 28000 + }, + { + "epoch": 1.0257161696827606, + "eval_loss": 3.3177244663238525, + "eval_runtime": 2.0821, + "eval_samples_per_second": 3360.034, + "eval_steps_per_second": 13.448, + "step": 28000 + }, + { + "combined_loss": 2.0106987953186035, + "distill_loss": 1.3163011074066162, + "epoch": 1.0257161696827606, + "step": 28000, + "student_mlm_loss": 2.70509672164917 + }, + { + "epoch": 1.0293794417173419, + "grad_norm": 5.406506538391113, + "learning_rate": 3.304543332432366e-05, + "loss": 3.2149, + "step": 28100 + }, + { + "combined_loss": 2.042628288269043, + "distill_loss": 1.3173636198043823, + "epoch": 1.0293794417173419, + "step": 28100, + "student_mlm_loss": 2.767892837524414 + }, + { + "epoch": 1.0330427137519231, + "grad_norm": 3.2733256816864014, + "learning_rate": 3.298400373491904e-05, + "loss": 6.3856, + "step": 28200 + }, + { + "combined_loss": 1.9145760536193848, + "distill_loss": 1.438834309577942, + "epoch": 1.0330427137519231, + "step": 28200, + "student_mlm_loss": 2.390317916870117 + }, + { + "epoch": 1.0367059857865044, + "grad_norm": 10.546121597290039, + "learning_rate": 3.292257414551441e-05, + "loss": 3.5422, + "step": 28300 + }, + { + "combined_loss": 2.6431736946105957, + "distill_loss": 1.367489218711853, + "epoch": 1.0367059857865044, + "step": 28300, + "student_mlm_loss": 3.918858289718628 + }, + { + "epoch": 1.0403692578210857, + "grad_norm": 25.674352645874023, + "learning_rate": 3.286114455610979e-05, + "loss": 6.2258, + "step": 28400 + }, + { + "combined_loss": 1.8416577577590942, + "distill_loss": 1.2867157459259033, + "epoch": 1.0403692578210857, + "step": 28400, + "student_mlm_loss": 2.396599769592285 + }, + { + "epoch": 1.044032529855667, + "grad_norm": 3.6745688915252686, + "learning_rate": 3.2799714966705166e-05, + "loss": 5.0647, + "step": 28500 + }, + { + "combined_loss": 1.9693520069122314, + "distill_loss": 1.3039644956588745, + "epoch": 1.044032529855667, + "step": 28500, + "student_mlm_loss": 2.634739637374878 + }, + { + "epoch": 1.0476958018902485, + "grad_norm": 40.79129409790039, + "learning_rate": 3.273828537730054e-05, + "loss": 2.6424, + "step": 28600 + }, + { + "combined_loss": 2.4251365661621094, + "distill_loss": 1.3121291399002075, + "epoch": 1.0476958018902485, + "step": 28600, + "student_mlm_loss": 3.5381438732147217 + }, + { + "epoch": 1.0513590739248297, + "grad_norm": 7.185906410217285, + "learning_rate": 3.2676855787895915e-05, + "loss": 2.9095, + "step": 28700 + }, + { + "combined_loss": 5.781175136566162, + "distill_loss": 1.3236074447631836, + "epoch": 1.0513590739248297, + "step": 28700, + "student_mlm_loss": 10.23874282836914 + }, + { + "epoch": 1.055022345959411, + "grad_norm": 7.2639079093933105, + "learning_rate": 3.2615426198491286e-05, + "loss": 3.0536, + "step": 28800 + }, + { + "combined_loss": 1.8534462451934814, + "distill_loss": 1.433970332145691, + "epoch": 1.055022345959411, + "step": 28800, + "student_mlm_loss": 2.2729220390319824 + }, + { + "epoch": 1.0586856179939923, + "grad_norm": 82.9974365234375, + "learning_rate": 3.255399660908667e-05, + "loss": 3.4605, + "step": 28900 + }, + { + "combined_loss": 2.385720729827881, + "distill_loss": 1.319982647895813, + "epoch": 1.0586856179939923, + "step": 28900, + "student_mlm_loss": 3.4514589309692383 + }, + { + "epoch": 1.0623488900285736, + "grad_norm": 8.101861000061035, + "learning_rate": 3.249256701968204e-05, + "loss": 2.9531, + "step": 29000 + }, + { + "combined_loss": 1.9569958448410034, + "distill_loss": 1.350255012512207, + "epoch": 1.0623488900285736, + "step": 29000, + "student_mlm_loss": 2.5637366771698 + }, + { + "epoch": 1.0660121620631549, + "grad_norm": 42.843135833740234, + "learning_rate": 3.243113743027742e-05, + "loss": 3.5336, + "step": 29100 + }, + { + "combined_loss": 2.0199599266052246, + "distill_loss": 1.1558183431625366, + "epoch": 1.0660121620631549, + "step": 29100, + "student_mlm_loss": 2.884101390838623 + }, + { + "epoch": 1.0696754340977361, + "grad_norm": 10.401261329650879, + "learning_rate": 3.236970784087279e-05, + "loss": 2.6909, + "step": 29200 + }, + { + "combined_loss": 1.898897409439087, + "distill_loss": 1.2361267805099487, + "epoch": 1.0696754340977361, + "step": 29200, + "student_mlm_loss": 2.5616679191589355 + }, + { + "epoch": 1.0733387061323174, + "grad_norm": 13.08026123046875, + "learning_rate": 3.230827825146817e-05, + "loss": 10.7499, + "step": 29300 + }, + { + "combined_loss": 2.385263442993164, + "distill_loss": 1.2960166931152344, + "epoch": 1.0733387061323174, + "step": 29300, + "student_mlm_loss": 3.4745099544525146 + }, + { + "epoch": 1.0770019781668987, + "grad_norm": 6.8822431564331055, + "learning_rate": 3.2246848662063546e-05, + "loss": 3.0651, + "step": 29400 + }, + { + "combined_loss": 2.1257505416870117, + "distill_loss": 1.3224972486495972, + "epoch": 1.0770019781668987, + "step": 29400, + "student_mlm_loss": 2.929003953933716 + }, + { + "epoch": 1.08066525020148, + "grad_norm": 3.4312744140625, + "learning_rate": 3.218541907265892e-05, + "loss": 3.1323, + "step": 29500 + }, + { + "combined_loss": 2.0117716789245605, + "distill_loss": 1.2447552680969238, + "epoch": 1.08066525020148, + "step": 29500, + "student_mlm_loss": 2.7787880897521973 + }, + { + "epoch": 1.0843285222360612, + "grad_norm": 3.970820426940918, + "learning_rate": 3.2123989483254295e-05, + "loss": 3.7427, + "step": 29600 + }, + { + "combined_loss": 2.493256092071533, + "distill_loss": 1.27970290184021, + "epoch": 1.0843285222360612, + "step": 29600, + "student_mlm_loss": 3.7068092823028564 + }, + { + "epoch": 1.0879917942706425, + "grad_norm": 5.8632426261901855, + "learning_rate": 3.206255989384967e-05, + "loss": 3.0698, + "step": 29700 + }, + { + "combined_loss": 2.017867088317871, + "distill_loss": 1.408115029335022, + "epoch": 1.0879917942706425, + "step": 29700, + "student_mlm_loss": 2.6276190280914307 + }, + { + "epoch": 1.0916550663052238, + "grad_norm": 7.350955963134766, + "learning_rate": 3.200113030444505e-05, + "loss": 10.1517, + "step": 29800 + }, + { + "combined_loss": 3.020230770111084, + "distill_loss": 1.1870992183685303, + "epoch": 1.0916550663052238, + "step": 29800, + "student_mlm_loss": 4.853362083435059 + }, + { + "epoch": 1.095318338339805, + "grad_norm": 14.347647666931152, + "learning_rate": 3.193970071504042e-05, + "loss": 2.8345, + "step": 29900 + }, + { + "combined_loss": 1.8037035465240479, + "distill_loss": 1.2421637773513794, + "epoch": 1.095318338339805, + "step": 29900, + "student_mlm_loss": 2.365243434906006 + }, + { + "epoch": 1.0989816103743864, + "grad_norm": 8.716060638427734, + "learning_rate": 3.18782711256358e-05, + "loss": 4.9073, + "step": 30000 + }, + { + "epoch": 1.0989816103743864, + "eval_loss": 3.289705753326416, + "eval_runtime": 2.6398, + "eval_samples_per_second": 2650.179, + "eval_steps_per_second": 10.607, + "step": 30000 + }, + { + "combined_loss": 3.3838839530944824, + "distill_loss": 1.2657897472381592, + "epoch": 1.0989816103743864, + "step": 30000, + "student_mlm_loss": 5.501977920532227 + }, + { + "epoch": 1.1026448824089676, + "grad_norm": 9.78013801574707, + "learning_rate": 3.181684153623117e-05, + "loss": 6.1366, + "step": 30100 + }, + { + "combined_loss": 1.8116616010665894, + "distill_loss": 1.3585631847381592, + "epoch": 1.1026448824089676, + "step": 30100, + "student_mlm_loss": 2.2647600173950195 + }, + { + "epoch": 1.106308154443549, + "grad_norm": 20.41010856628418, + "learning_rate": 3.175541194682655e-05, + "loss": 4.7028, + "step": 30200 + }, + { + "combined_loss": 1.9074151515960693, + "distill_loss": 1.119224190711975, + "epoch": 1.106308154443549, + "step": 30200, + "student_mlm_loss": 2.695605993270874 + }, + { + "epoch": 1.1099714264781302, + "grad_norm": 7.005733966827393, + "learning_rate": 3.1693982357421926e-05, + "loss": 4.9073, + "step": 30300 + }, + { + "combined_loss": 1.7690558433532715, + "distill_loss": 1.2762707471847534, + "epoch": 1.1099714264781302, + "step": 30300, + "student_mlm_loss": 2.2618408203125 + }, + { + "epoch": 1.1136346985127115, + "grad_norm": 4.290195465087891, + "learning_rate": 3.16325527680173e-05, + "loss": 4.1257, + "step": 30400 + }, + { + "combined_loss": 15.505983352661133, + "distill_loss": 1.252361536026001, + "epoch": 1.1136346985127115, + "step": 30400, + "student_mlm_loss": 29.759605407714844 + }, + { + "epoch": 1.1172979705472927, + "grad_norm": 27.59025764465332, + "learning_rate": 3.1571123178612675e-05, + "loss": 3.6319, + "step": 30500 + }, + { + "combined_loss": 3.190175771713257, + "distill_loss": 1.237632155418396, + "epoch": 1.1172979705472927, + "step": 30500, + "student_mlm_loss": 5.142719268798828 + }, + { + "epoch": 1.120961242581874, + "grad_norm": 35.681365966796875, + "learning_rate": 3.150969358920805e-05, + "loss": 5.2866, + "step": 30600 + }, + { + "combined_loss": 2.1486501693725586, + "distill_loss": 1.3570821285247803, + "epoch": 1.120961242581874, + "step": 30600, + "student_mlm_loss": 2.940218448638916 + }, + { + "epoch": 1.1246245146164555, + "grad_norm": 28.920949935913086, + "learning_rate": 3.144826399980343e-05, + "loss": 11.35, + "step": 30700 + }, + { + "combined_loss": 3.544619560241699, + "distill_loss": 1.3219174146652222, + "epoch": 1.1246245146164555, + "step": 30700, + "student_mlm_loss": 5.767321586608887 + }, + { + "epoch": 1.1282877866510368, + "grad_norm": 36.29865264892578, + "learning_rate": 3.13868344103988e-05, + "loss": 8.8748, + "step": 30800 + }, + { + "combined_loss": 3.136960744857788, + "distill_loss": 1.4069170951843262, + "epoch": 1.1282877866510368, + "step": 30800, + "student_mlm_loss": 4.86700439453125 + }, + { + "epoch": 1.131951058685618, + "grad_norm": 8.498424530029297, + "learning_rate": 3.132540482099417e-05, + "loss": 2.6175, + "step": 30900 + }, + { + "combined_loss": 2.584123373031616, + "distill_loss": 1.3318666219711304, + "epoch": 1.131951058685618, + "step": 30900, + "student_mlm_loss": 3.8363800048828125 + }, + { + "epoch": 1.1356143307201993, + "grad_norm": 8.784627914428711, + "learning_rate": 3.126397523158955e-05, + "loss": 3.7912, + "step": 31000 + }, + { + "combined_loss": 4.065792083740234, + "distill_loss": 1.279055118560791, + "epoch": 1.1356143307201993, + "step": 31000, + "student_mlm_loss": 6.8525285720825195 + }, + { + "epoch": 1.1392776027547806, + "grad_norm": 15.763399124145508, + "learning_rate": 3.120254564218493e-05, + "loss": 7.3671, + "step": 31100 + }, + { + "combined_loss": 1.9532334804534912, + "distill_loss": 1.2137418985366821, + "epoch": 1.1392776027547806, + "step": 31100, + "student_mlm_loss": 2.6927249431610107 + }, + { + "epoch": 1.142940874789362, + "grad_norm": 6.777341842651367, + "learning_rate": 3.1141116052780306e-05, + "loss": 2.8877, + "step": 31200 + }, + { + "combined_loss": 3.5847015380859375, + "distill_loss": 1.3712694644927979, + "epoch": 1.142940874789362, + "step": 31200, + "student_mlm_loss": 5.798133850097656 + }, + { + "epoch": 1.1466041468239432, + "grad_norm": 6.115112781524658, + "learning_rate": 3.107968646337568e-05, + "loss": 3.3763, + "step": 31300 + }, + { + "combined_loss": 1.899533748626709, + "distill_loss": 1.2805981636047363, + "epoch": 1.1466041468239432, + "step": 31300, + "student_mlm_loss": 2.5184693336486816 + }, + { + "epoch": 1.1502674188585245, + "grad_norm": 3.3896713256835938, + "learning_rate": 3.1018256873971055e-05, + "loss": 3.2932, + "step": 31400 + }, + { + "combined_loss": 1.9794254302978516, + "distill_loss": 1.3896270990371704, + "epoch": 1.1502674188585245, + "step": 31400, + "student_mlm_loss": 2.5692238807678223 + }, + { + "epoch": 1.1539306908931057, + "grad_norm": 12.824034690856934, + "learning_rate": 3.095682728456643e-05, + "loss": 3.5341, + "step": 31500 + }, + { + "combined_loss": 2.5983529090881348, + "distill_loss": 1.2135576009750366, + "epoch": 1.1539306908931057, + "step": 31500, + "student_mlm_loss": 3.9831480979919434 + }, + { + "epoch": 1.157593962927687, + "grad_norm": 73.47982025146484, + "learning_rate": 3.089539769516181e-05, + "loss": 2.9879, + "step": 31600 + }, + { + "combined_loss": 1.8584779500961304, + "distill_loss": 1.3214514255523682, + "epoch": 1.157593962927687, + "step": 31600, + "student_mlm_loss": 2.3955044746398926 + }, + { + "epoch": 1.1612572349622683, + "grad_norm": 5.6778340339660645, + "learning_rate": 3.083396810575718e-05, + "loss": 2.9781, + "step": 31700 + }, + { + "combined_loss": 4.854001045227051, + "distill_loss": 1.2088978290557861, + "epoch": 1.1612572349622683, + "step": 31700, + "student_mlm_loss": 8.499104499816895 + }, + { + "epoch": 1.1649205069968496, + "grad_norm": 17.93754768371582, + "learning_rate": 3.077253851635255e-05, + "loss": 3.5773, + "step": 31800 + }, + { + "combined_loss": 1.9064607620239258, + "distill_loss": 1.363638997077942, + "epoch": 1.1649205069968496, + "step": 31800, + "student_mlm_loss": 2.449282646179199 + }, + { + "epoch": 1.1685837790314308, + "grad_norm": 8.912027359008789, + "learning_rate": 3.071110892694794e-05, + "loss": 3.0949, + "step": 31900 + }, + { + "combined_loss": 1.9666361808776855, + "distill_loss": 1.3997029066085815, + "epoch": 1.1685837790314308, + "step": 31900, + "student_mlm_loss": 2.5335693359375 + }, + { + "epoch": 1.1722470510660121, + "grad_norm": 21.05866050720215, + "learning_rate": 3.064967933754331e-05, + "loss": 2.965, + "step": 32000 + }, + { + "epoch": 1.1722470510660121, + "eval_loss": 3.516061544418335, + "eval_runtime": 2.6391, + "eval_samples_per_second": 2650.903, + "eval_steps_per_second": 10.61, + "step": 32000 + }, + { + "combined_loss": 2.466904640197754, + "distill_loss": 1.2619636058807373, + "epoch": 1.1722470510660121, + "step": 32000, + "student_mlm_loss": 3.6718459129333496 + }, + { + "epoch": 1.1759103231005934, + "grad_norm": 14.288066864013672, + "learning_rate": 3.0588249748138686e-05, + "loss": 6.5656, + "step": 32100 + }, + { + "combined_loss": 5.987391471862793, + "distill_loss": 1.3964972496032715, + "epoch": 1.1759103231005934, + "step": 32100, + "student_mlm_loss": 10.578286170959473 + }, + { + "epoch": 1.1795735951351747, + "grad_norm": 10.953961372375488, + "learning_rate": 3.052682015873406e-05, + "loss": 7.1246, + "step": 32200 + }, + { + "combined_loss": 1.758845567703247, + "distill_loss": 1.2731348276138306, + "epoch": 1.1795735951351747, + "step": 32200, + "student_mlm_loss": 2.244556188583374 + }, + { + "epoch": 1.183236867169756, + "grad_norm": 17.076087951660156, + "learning_rate": 3.046539056932944e-05, + "loss": 7.3734, + "step": 32300 + }, + { + "combined_loss": 1.7941749095916748, + "distill_loss": 1.282630205154419, + "epoch": 1.183236867169756, + "step": 32300, + "student_mlm_loss": 2.3057196140289307 + }, + { + "epoch": 1.1869001392043372, + "grad_norm": 11.33812427520752, + "learning_rate": 3.040396097992481e-05, + "loss": 5.4979, + "step": 32400 + }, + { + "combined_loss": 2.379426956176758, + "distill_loss": 1.2975032329559326, + "epoch": 1.1869001392043372, + "step": 32400, + "student_mlm_loss": 3.461350917816162 + }, + { + "epoch": 1.1905634112389185, + "grad_norm": 3.6378591060638428, + "learning_rate": 3.0342531390520184e-05, + "loss": 5.077, + "step": 32500 + }, + { + "combined_loss": 1.835166573524475, + "distill_loss": 1.294168472290039, + "epoch": 1.1905634112389185, + "step": 32500, + "student_mlm_loss": 2.376164674758911 + }, + { + "epoch": 1.1942266832735, + "grad_norm": 23.017444610595703, + "learning_rate": 3.0281101801115562e-05, + "loss": 3.1428, + "step": 32600 + }, + { + "combined_loss": 1.8867619037628174, + "distill_loss": 1.2372292280197144, + "epoch": 1.1942266832735, + "step": 32600, + "student_mlm_loss": 2.536294460296631 + }, + { + "epoch": 1.197889955308081, + "grad_norm": 7.055652141571045, + "learning_rate": 3.0219672211710937e-05, + "loss": 8.7118, + "step": 32700 + }, + { + "combined_loss": 6.59044075012207, + "distill_loss": 1.3554973602294922, + "epoch": 1.197889955308081, + "step": 32700, + "student_mlm_loss": 11.825384140014648 + }, + { + "epoch": 1.2015532273426626, + "grad_norm": 6.935373783111572, + "learning_rate": 3.0158242622306314e-05, + "loss": 7.5763, + "step": 32800 + }, + { + "combined_loss": 2.4971964359283447, + "distill_loss": 1.2960432767868042, + "epoch": 1.2015532273426626, + "step": 32800, + "student_mlm_loss": 3.698349714279175 + }, + { + "epoch": 1.2052164993772438, + "grad_norm": 19.48725700378418, + "learning_rate": 3.009681303290169e-05, + "loss": 5.1993, + "step": 32900 + }, + { + "combined_loss": 2.639206886291504, + "distill_loss": 1.2536990642547607, + "epoch": 1.2052164993772438, + "step": 32900, + "student_mlm_loss": 4.024714469909668 + }, + { + "epoch": 1.2088797714118251, + "grad_norm": 215.4875946044922, + "learning_rate": 3.0035383443497067e-05, + "loss": 3.9297, + "step": 33000 + }, + { + "combined_loss": 2.1888670921325684, + "distill_loss": 1.4587746858596802, + "epoch": 1.2088797714118251, + "step": 33000, + "student_mlm_loss": 2.918959379196167 + }, + { + "epoch": 1.2125430434464064, + "grad_norm": 5.346382141113281, + "learning_rate": 2.997395385409244e-05, + "loss": 3.3704, + "step": 33100 + }, + { + "combined_loss": 2.5722949504852295, + "distill_loss": 1.2250982522964478, + "epoch": 1.2125430434464064, + "step": 33100, + "student_mlm_loss": 3.9194915294647217 + }, + { + "epoch": 1.2162063154809877, + "grad_norm": 21.193038940429688, + "learning_rate": 2.991252426468782e-05, + "loss": 3.22, + "step": 33200 + }, + { + "combined_loss": 1.8822517395019531, + "distill_loss": 1.264020323753357, + "epoch": 1.2162063154809877, + "step": 33200, + "student_mlm_loss": 2.5004830360412598 + }, + { + "epoch": 1.219869587515569, + "grad_norm": 8.840603828430176, + "learning_rate": 2.9851094675283193e-05, + "loss": 13.091, + "step": 33300 + }, + { + "combined_loss": 2.0461645126342773, + "distill_loss": 1.3376085758209229, + "epoch": 1.219869587515569, + "step": 33300, + "student_mlm_loss": 2.7547202110290527 + }, + { + "epoch": 1.2235328595501502, + "grad_norm": 16.414852142333984, + "learning_rate": 2.9789665085878564e-05, + "loss": 3.6096, + "step": 33400 + }, + { + "combined_loss": 1.8437246084213257, + "distill_loss": 1.2731173038482666, + "epoch": 1.2235328595501502, + "step": 33400, + "student_mlm_loss": 2.4143319129943848 + }, + { + "epoch": 1.2271961315847315, + "grad_norm": 5.047356605529785, + "learning_rate": 2.9728235496473946e-05, + "loss": 10.6014, + "step": 33500 + }, + { + "combined_loss": 2.0613672733306885, + "distill_loss": 1.1784592866897583, + "epoch": 1.2271961315847315, + "step": 33500, + "student_mlm_loss": 2.944275140762329 + }, + { + "epoch": 1.2308594036193128, + "grad_norm": 8.502574920654297, + "learning_rate": 2.9666805907069317e-05, + "loss": 12.6532, + "step": 33600 + }, + { + "combined_loss": 2.301725149154663, + "distill_loss": 1.2482868432998657, + "epoch": 1.2308594036193128, + "step": 33600, + "student_mlm_loss": 3.355163335800171 + }, + { + "epoch": 1.234522675653894, + "grad_norm": 25.97445297241211, + "learning_rate": 2.9605376317664695e-05, + "loss": 3.1296, + "step": 33700 + }, + { + "combined_loss": 1.8135402202606201, + "distill_loss": 1.309229850769043, + "epoch": 1.234522675653894, + "step": 33700, + "student_mlm_loss": 2.3178505897521973 + }, + { + "epoch": 1.2381859476884753, + "grad_norm": 7.912507057189941, + "learning_rate": 2.954394672826007e-05, + "loss": 2.9749, + "step": 33800 + }, + { + "combined_loss": 1.9506487846374512, + "distill_loss": 1.3808802366256714, + "epoch": 1.2381859476884753, + "step": 33800, + "student_mlm_loss": 2.5204174518585205 + }, + { + "epoch": 1.2418492197230566, + "grad_norm": 28.239988327026367, + "learning_rate": 2.9482517138855447e-05, + "loss": 5.7527, + "step": 33900 + }, + { + "combined_loss": 1.881349802017212, + "distill_loss": 1.3489292860031128, + "epoch": 1.2418492197230566, + "step": 33900, + "student_mlm_loss": 2.4137701988220215 + }, + { + "epoch": 1.245512491757638, + "grad_norm": 25.953353881835938, + "learning_rate": 2.942108754945082e-05, + "loss": 4.0339, + "step": 34000 + }, + { + "epoch": 1.245512491757638, + "eval_loss": 3.297154188156128, + "eval_runtime": 2.3826, + "eval_samples_per_second": 2936.248, + "eval_steps_per_second": 11.752, + "step": 34000 + }, + { + "combined_loss": 2.5429787635803223, + "distill_loss": 1.2718520164489746, + "epoch": 1.245512491757638, + "step": 34000, + "student_mlm_loss": 3.814105272293091 + }, + { + "epoch": 1.2491757637922192, + "grad_norm": 48.45500183105469, + "learning_rate": 2.9359657960046196e-05, + "loss": 6.1408, + "step": 34100 + }, + { + "combined_loss": 4.794422626495361, + "distill_loss": 1.3052036762237549, + "epoch": 1.2491757637922192, + "step": 34100, + "student_mlm_loss": 8.283641815185547 + }, + { + "epoch": 1.2528390358268005, + "grad_norm": 6.028234004974365, + "learning_rate": 2.9298228370641574e-05, + "loss": 2.9116, + "step": 34200 + }, + { + "combined_loss": 2.125443458557129, + "distill_loss": 1.25053071975708, + "epoch": 1.2528390358268005, + "step": 34200, + "student_mlm_loss": 3.0003561973571777 + }, + { + "epoch": 1.2565023078613817, + "grad_norm": 15.824817657470703, + "learning_rate": 2.9236798781236945e-05, + "loss": 3.5834, + "step": 34300 + }, + { + "combined_loss": 2.156796932220459, + "distill_loss": 1.1805670261383057, + "epoch": 1.2565023078613817, + "step": 34300, + "student_mlm_loss": 3.1330268383026123 + }, + { + "epoch": 1.260165579895963, + "grad_norm": 8.438326835632324, + "learning_rate": 2.9175369191832326e-05, + "loss": 5.0724, + "step": 34400 + }, + { + "combined_loss": 3.144615888595581, + "distill_loss": 1.2467416524887085, + "epoch": 1.260165579895963, + "step": 34400, + "student_mlm_loss": 5.042490005493164 + }, + { + "epoch": 1.2638288519305443, + "grad_norm": 3.7252449989318848, + "learning_rate": 2.9113939602427697e-05, + "loss": 2.9306, + "step": 34500 + }, + { + "combined_loss": 4.309004783630371, + "distill_loss": 1.2629985809326172, + "epoch": 1.2638288519305443, + "step": 34500, + "student_mlm_loss": 7.355010986328125 + }, + { + "epoch": 1.2674921239651256, + "grad_norm": 14.86426067352295, + "learning_rate": 2.9052510013023078e-05, + "loss": 3.059, + "step": 34600 + }, + { + "combined_loss": 2.128227472305298, + "distill_loss": 1.3674236536026, + "epoch": 1.2674921239651256, + "step": 34600, + "student_mlm_loss": 2.889031171798706 + }, + { + "epoch": 1.271155395999707, + "grad_norm": 14.947731018066406, + "learning_rate": 2.899108042361845e-05, + "loss": 3.0461, + "step": 34700 + }, + { + "combined_loss": 1.9557018280029297, + "distill_loss": 1.3122907876968384, + "epoch": 1.271155395999707, + "step": 34700, + "student_mlm_loss": 2.5991127490997314 + }, + { + "epoch": 1.2748186680342881, + "grad_norm": 4.714714527130127, + "learning_rate": 2.8929650834213824e-05, + "loss": 3.0221, + "step": 34800 + }, + { + "combined_loss": 1.7830932140350342, + "distill_loss": 1.278725028038025, + "epoch": 1.2748186680342881, + "step": 34800, + "student_mlm_loss": 2.287461519241333 + }, + { + "epoch": 1.2784819400688696, + "grad_norm": 13.885130882263184, + "learning_rate": 2.88682212448092e-05, + "loss": 8.529, + "step": 34900 + }, + { + "combined_loss": 4.974426746368408, + "distill_loss": 1.4173694849014282, + "epoch": 1.2784819400688696, + "step": 34900, + "student_mlm_loss": 8.53148365020752 + }, + { + "epoch": 1.2821452121034507, + "grad_norm": 6.786545753479004, + "learning_rate": 2.8806791655404576e-05, + "loss": 3.563, + "step": 35000 + }, + { + "combined_loss": 1.7134695053100586, + "distill_loss": 1.2251827716827393, + "epoch": 1.2821452121034507, + "step": 35000, + "student_mlm_loss": 2.201756238937378 + }, + { + "epoch": 1.2858084841380322, + "grad_norm": 18.235891342163086, + "learning_rate": 2.8745362065999954e-05, + "loss": 6.9188, + "step": 35100 + }, + { + "combined_loss": 6.00921106338501, + "distill_loss": 1.3103188276290894, + "epoch": 1.2858084841380322, + "step": 35100, + "student_mlm_loss": 10.70810317993164 + }, + { + "epoch": 1.2894717561726134, + "grad_norm": 6.3708696365356445, + "learning_rate": 2.8683932476595328e-05, + "loss": 6.7695, + "step": 35200 + }, + { + "combined_loss": 2.2400052547454834, + "distill_loss": 1.3289698362350464, + "epoch": 1.2894717561726134, + "step": 35200, + "student_mlm_loss": 3.151040554046631 + }, + { + "epoch": 1.2931350282071947, + "grad_norm": 7.5602946281433105, + "learning_rate": 2.8622502887190706e-05, + "loss": 9.8005, + "step": 35300 + }, + { + "combined_loss": 1.848390817642212, + "distill_loss": 1.2897430658340454, + "epoch": 1.2931350282071947, + "step": 35300, + "student_mlm_loss": 2.407038688659668 + }, + { + "epoch": 1.296798300241776, + "grad_norm": 24.799640655517578, + "learning_rate": 2.8561073297786077e-05, + "loss": 3.2996, + "step": 35400 + }, + { + "combined_loss": 4.894403457641602, + "distill_loss": 1.282358169555664, + "epoch": 1.296798300241776, + "step": 35400, + "student_mlm_loss": 8.506448745727539 + }, + { + "epoch": 1.3004615722763573, + "grad_norm": 34.4364013671875, + "learning_rate": 2.849964370838146e-05, + "loss": 3.399, + "step": 35500 + }, + { + "combined_loss": 1.7965787649154663, + "distill_loss": 1.3232142925262451, + "epoch": 1.3004615722763573, + "step": 35500, + "student_mlm_loss": 2.2699432373046875 + }, + { + "epoch": 1.3041248443109386, + "grad_norm": 7.9551825523376465, + "learning_rate": 2.843821411897683e-05, + "loss": 3.1887, + "step": 35600 + }, + { + "combined_loss": 1.855729579925537, + "distill_loss": 1.2217527627944946, + "epoch": 1.3041248443109386, + "step": 35600, + "student_mlm_loss": 2.48970627784729 + }, + { + "epoch": 1.3077881163455198, + "grad_norm": 5.838754177093506, + "learning_rate": 2.8376784529572204e-05, + "loss": 3.1524, + "step": 35700 + }, + { + "combined_loss": 2.3417129516601562, + "distill_loss": 1.2872867584228516, + "epoch": 1.3077881163455198, + "step": 35700, + "student_mlm_loss": 3.39613938331604 + }, + { + "epoch": 1.3114513883801011, + "grad_norm": 4.118559837341309, + "learning_rate": 2.831535494016758e-05, + "loss": 7.9754, + "step": 35800 + }, + { + "combined_loss": 3.906961679458618, + "distill_loss": 1.2905327081680298, + "epoch": 1.3114513883801011, + "step": 35800, + "student_mlm_loss": 6.523390769958496 + }, + { + "epoch": 1.3151146604146824, + "grad_norm": 5.229255199432373, + "learning_rate": 2.8253925350762956e-05, + "loss": 3.6586, + "step": 35900 + }, + { + "combined_loss": 2.6259002685546875, + "distill_loss": 1.217278003692627, + "epoch": 1.3151146604146824, + "step": 35900, + "student_mlm_loss": 4.034522533416748 + }, + { + "epoch": 1.3187779324492637, + "grad_norm": 9.182631492614746, + "learning_rate": 2.8192495761358334e-05, + "loss": 8.5789, + "step": 36000 + }, + { + "epoch": 1.3187779324492637, + "eval_loss": 3.3097567558288574, + "eval_runtime": 1.9861, + "eval_samples_per_second": 3522.525, + "eval_steps_per_second": 14.098, + "step": 36000 + }, + { + "combined_loss": 15.921034812927246, + "distill_loss": 1.2575896978378296, + "epoch": 1.3187779324492637, + "step": 36000, + "student_mlm_loss": 30.58448028564453 + }, + { + "epoch": 1.322441204483845, + "grad_norm": 5.999209880828857, + "learning_rate": 2.813106617195371e-05, + "loss": 3.6109, + "step": 36100 + }, + { + "combined_loss": 204.92184448242188, + "distill_loss": 1.2291535139083862, + "epoch": 1.322441204483845, + "step": 36100, + "student_mlm_loss": 408.6145324707031 + }, + { + "epoch": 1.3261044765184262, + "grad_norm": 8.351846694946289, + "learning_rate": 2.8069636582549086e-05, + "loss": 5.9753, + "step": 36200 + }, + { + "combined_loss": 3.7332310676574707, + "distill_loss": 1.377110481262207, + "epoch": 1.3261044765184262, + "step": 36200, + "student_mlm_loss": 6.089351654052734 + }, + { + "epoch": 1.3297677485530075, + "grad_norm": 4.738751411437988, + "learning_rate": 2.800820699314446e-05, + "loss": 2.8706, + "step": 36300 + }, + { + "combined_loss": 1.949210286140442, + "distill_loss": 1.1820151805877686, + "epoch": 1.3297677485530075, + "step": 36300, + "student_mlm_loss": 2.7164053916931152 + }, + { + "epoch": 1.3334310205875888, + "grad_norm": 3.7835421562194824, + "learning_rate": 2.7946777403739832e-05, + "loss": 3.5794, + "step": 36400 + }, + { + "combined_loss": 1.7922800779342651, + "distill_loss": 1.2455928325653076, + "epoch": 1.3334310205875888, + "step": 36400, + "student_mlm_loss": 2.3389673233032227 + }, + { + "epoch": 1.33709429262217, + "grad_norm": 22.528881072998047, + "learning_rate": 2.788534781433521e-05, + "loss": 3.8623, + "step": 36500 + }, + { + "combined_loss": 1.788147211074829, + "distill_loss": 1.2254056930541992, + "epoch": 1.33709429262217, + "step": 36500, + "student_mlm_loss": 2.350888729095459 + }, + { + "epoch": 1.3407575646567513, + "grad_norm": 5.876169681549072, + "learning_rate": 2.7823918224930584e-05, + "loss": 8.4137, + "step": 36600 + }, + { + "combined_loss": 2.0377962589263916, + "distill_loss": 1.2204126119613647, + "epoch": 1.3407575646567513, + "step": 36600, + "student_mlm_loss": 2.855179786682129 + }, + { + "epoch": 1.3444208366913326, + "grad_norm": 20.921276092529297, + "learning_rate": 2.7762488635525962e-05, + "loss": 3.5857, + "step": 36700 + }, + { + "combined_loss": 1.9521321058273315, + "distill_loss": 1.249513864517212, + "epoch": 1.3444208366913326, + "step": 36700, + "student_mlm_loss": 2.654750347137451 + }, + { + "epoch": 1.348084108725914, + "grad_norm": 13.851704597473145, + "learning_rate": 2.7701059046121336e-05, + "loss": 3.8678, + "step": 36800 + }, + { + "combined_loss": 2.2560389041900635, + "distill_loss": 1.2315130233764648, + "epoch": 1.348084108725914, + "step": 36800, + "student_mlm_loss": 3.280564785003662 + }, + { + "epoch": 1.3517473807604952, + "grad_norm": 16.56214714050293, + "learning_rate": 2.7639629456716714e-05, + "loss": 3.3998, + "step": 36900 + }, + { + "combined_loss": 3.098896026611328, + "distill_loss": 1.3377043008804321, + "epoch": 1.3517473807604952, + "step": 36900, + "student_mlm_loss": 4.860087871551514 + }, + { + "epoch": 1.3554106527950767, + "grad_norm": 35.91291809082031, + "learning_rate": 2.757819986731209e-05, + "loss": 3.761, + "step": 37000 + }, + { + "combined_loss": 1.9794631004333496, + "distill_loss": 1.3087836503982544, + "epoch": 1.3554106527950767, + "step": 37000, + "student_mlm_loss": 2.6501426696777344 + }, + { + "epoch": 1.3590739248296577, + "grad_norm": 11.776296615600586, + "learning_rate": 2.7516770277907466e-05, + "loss": 3.9886, + "step": 37100 + }, + { + "combined_loss": 2.3107573986053467, + "distill_loss": 1.268768310546875, + "epoch": 1.3590739248296577, + "step": 37100, + "student_mlm_loss": 3.3527464866638184 + }, + { + "epoch": 1.3627371968642392, + "grad_norm": 13.237029075622559, + "learning_rate": 2.745534068850284e-05, + "loss": 5.3161, + "step": 37200 + }, + { + "combined_loss": 4.210747718811035, + "distill_loss": 1.4009877443313599, + "epoch": 1.3627371968642392, + "step": 37200, + "student_mlm_loss": 7.0205078125 + }, + { + "epoch": 1.3664004688988205, + "grad_norm": 18.256624221801758, + "learning_rate": 2.7393911099098212e-05, + "loss": 3.3122, + "step": 37300 + }, + { + "combined_loss": 2.467655658721924, + "distill_loss": 1.3313319683074951, + "epoch": 1.3664004688988205, + "step": 37300, + "student_mlm_loss": 3.6039793491363525 + }, + { + "epoch": 1.3700637409334018, + "grad_norm": 3.6821129322052, + "learning_rate": 2.7332481509693593e-05, + "loss": 2.5638, + "step": 37400 + }, + { + "combined_loss": 4.0961503982543945, + "distill_loss": 1.2590566873550415, + "epoch": 1.3700637409334018, + "step": 37400, + "student_mlm_loss": 6.933243751525879 + }, + { + "epoch": 1.373727012967983, + "grad_norm": 9.491351127624512, + "learning_rate": 2.7271051920288964e-05, + "loss": 5.2572, + "step": 37500 + }, + { + "combined_loss": 1.8323596715927124, + "distill_loss": 1.2323403358459473, + "epoch": 1.373727012967983, + "step": 37500, + "student_mlm_loss": 2.4323790073394775 + }, + { + "epoch": 1.3773902850025643, + "grad_norm": 10.13337516784668, + "learning_rate": 2.7209622330884342e-05, + "loss": 2.9805, + "step": 37600 + }, + { + "combined_loss": 2.7236733436584473, + "distill_loss": 1.2598845958709717, + "epoch": 1.3773902850025643, + "step": 37600, + "student_mlm_loss": 4.187462329864502 + }, + { + "epoch": 1.3810535570371456, + "grad_norm": 22.098358154296875, + "learning_rate": 2.7148192741479716e-05, + "loss": 3.1095, + "step": 37700 + }, + { + "combined_loss": 1.7910634279251099, + "distill_loss": 1.271672010421753, + "epoch": 1.3810535570371456, + "step": 37700, + "student_mlm_loss": 2.310454845428467 + }, + { + "epoch": 1.3847168290717269, + "grad_norm": 233.01779174804688, + "learning_rate": 2.7086763152075094e-05, + "loss": 3.0334, + "step": 37800 + }, + { + "combined_loss": 2.449730396270752, + "distill_loss": 1.343329906463623, + "epoch": 1.3847168290717269, + "step": 37800, + "student_mlm_loss": 3.556130886077881 + }, + { + "epoch": 1.3883801011063082, + "grad_norm": 7.459797382354736, + "learning_rate": 2.702533356267047e-05, + "loss": 5.0088, + "step": 37900 + }, + { + "combined_loss": 2.047302722930908, + "distill_loss": 1.2358465194702148, + "epoch": 1.3883801011063082, + "step": 37900, + "student_mlm_loss": 2.8587586879730225 + }, + { + "epoch": 1.3920433731408894, + "grad_norm": 3.9627275466918945, + "learning_rate": 2.6963903973265843e-05, + "loss": 2.7476, + "step": 38000 + }, + { + "epoch": 1.3920433731408894, + "eval_loss": 4.346156120300293, + "eval_runtime": 1.974, + "eval_samples_per_second": 3544.088, + "eval_steps_per_second": 14.184, + "step": 38000 + }, + { + "combined_loss": 2.4468555450439453, + "distill_loss": 1.166190505027771, + "epoch": 1.3920433731408894, + "step": 38000, + "student_mlm_loss": 3.72752046585083 + }, + { + "epoch": 1.3957066451754707, + "grad_norm": 11.812987327575684, + "learning_rate": 2.690247438386122e-05, + "loss": 3.8226, + "step": 38100 + }, + { + "combined_loss": 2.274935245513916, + "distill_loss": 1.3503799438476562, + "epoch": 1.3957066451754707, + "step": 38100, + "student_mlm_loss": 3.199490785598755 + }, + { + "epoch": 1.399369917210052, + "grad_norm": 6.545460224151611, + "learning_rate": 2.6841044794456592e-05, + "loss": 4.1598, + "step": 38200 + }, + { + "combined_loss": 2.1577343940734863, + "distill_loss": 1.2623993158340454, + "epoch": 1.399369917210052, + "step": 38200, + "student_mlm_loss": 3.0530693531036377 + }, + { + "epoch": 1.4030331892446333, + "grad_norm": 7.286951541900635, + "learning_rate": 2.6779615205051973e-05, + "loss": 3.8211, + "step": 38300 + }, + { + "combined_loss": 2.479806900024414, + "distill_loss": 1.2152717113494873, + "epoch": 1.4030331892446333, + "step": 38300, + "student_mlm_loss": 3.74434232711792 + }, + { + "epoch": 1.4066964612792145, + "grad_norm": 18.360294342041016, + "learning_rate": 2.6718185615647344e-05, + "loss": 3.3871, + "step": 38400 + }, + { + "combined_loss": 1.7289254665374756, + "distill_loss": 1.3171356916427612, + "epoch": 1.4066964612792145, + "step": 38400, + "student_mlm_loss": 2.1407151222229004 + }, + { + "epoch": 1.4103597333137958, + "grad_norm": 8.086026191711426, + "learning_rate": 2.6656756026242726e-05, + "loss": 2.6337, + "step": 38500 + }, + { + "combined_loss": 1.9621633291244507, + "distill_loss": 1.3215687274932861, + "epoch": 1.4103597333137958, + "step": 38500, + "student_mlm_loss": 2.6027579307556152 + }, + { + "epoch": 1.414023005348377, + "grad_norm": 13.378824234008789, + "learning_rate": 2.6595326436838097e-05, + "loss": 3.4032, + "step": 38600 + }, + { + "combined_loss": 37.448326110839844, + "distill_loss": 1.2198776006698608, + "epoch": 1.414023005348377, + "step": 38600, + "student_mlm_loss": 73.67677307128906 + }, + { + "epoch": 1.4176862773829584, + "grad_norm": 5.834230422973633, + "learning_rate": 2.653389684743347e-05, + "loss": 6.724, + "step": 38700 + }, + { + "combined_loss": 1.8702625036239624, + "distill_loss": 1.2802906036376953, + "epoch": 1.4176862773829584, + "step": 38700, + "student_mlm_loss": 2.4602344036102295 + }, + { + "epoch": 1.4213495494175397, + "grad_norm": 3.5685741901397705, + "learning_rate": 2.647246725802885e-05, + "loss": 3.2721, + "step": 38800 + }, + { + "combined_loss": 1.7411483526229858, + "distill_loss": 1.285083532333374, + "epoch": 1.4213495494175397, + "step": 38800, + "student_mlm_loss": 2.1972131729125977 + }, + { + "epoch": 1.4250128214521212, + "grad_norm": 8.644251823425293, + "learning_rate": 2.6411037668624223e-05, + "loss": 13.6859, + "step": 38900 + }, + { + "combined_loss": 3.234241008758545, + "distill_loss": 1.2654619216918945, + "epoch": 1.4250128214521212, + "step": 38900, + "student_mlm_loss": 5.203020095825195 + }, + { + "epoch": 1.4286760934867022, + "grad_norm": 15.043992042541504, + "learning_rate": 2.63496080792196e-05, + "loss": 4.3161, + "step": 39000 + }, + { + "combined_loss": 2.013312339782715, + "distill_loss": 1.2555652856826782, + "epoch": 1.4286760934867022, + "step": 39000, + "student_mlm_loss": 2.771059274673462 + }, + { + "epoch": 1.4323393655212837, + "grad_norm": 35.315345764160156, + "learning_rate": 2.6288178489814976e-05, + "loss": 6.3089, + "step": 39100 + }, + { + "combined_loss": 1.7854509353637695, + "distill_loss": 1.2994376420974731, + "epoch": 1.4323393655212837, + "step": 39100, + "student_mlm_loss": 2.2714641094207764 + }, + { + "epoch": 1.4360026375558648, + "grad_norm": 8.155647277832031, + "learning_rate": 2.6226748900410353e-05, + "loss": 3.3881, + "step": 39200 + }, + { + "combined_loss": 1.8790473937988281, + "distill_loss": 1.2656193971633911, + "epoch": 1.4360026375558648, + "step": 39200, + "student_mlm_loss": 2.4924752712249756 + }, + { + "epoch": 1.4396659095904463, + "grad_norm": 4.777060508728027, + "learning_rate": 2.6165319311005725e-05, + "loss": 3.0181, + "step": 39300 + }, + { + "combined_loss": 2.2714784145355225, + "distill_loss": 1.2724400758743286, + "epoch": 1.4396659095904463, + "step": 39300, + "student_mlm_loss": 3.270516872406006 + }, + { + "epoch": 1.4433291816250275, + "grad_norm": 3.7660317420959473, + "learning_rate": 2.6103889721601106e-05, + "loss": 3.3045, + "step": 39400 + }, + { + "combined_loss": 1.9759800434112549, + "distill_loss": 1.1767717599868774, + "epoch": 1.4433291816250275, + "step": 39400, + "student_mlm_loss": 2.775188446044922 + }, + { + "epoch": 1.4469924536596088, + "grad_norm": 55.78919982910156, + "learning_rate": 2.6042460132196477e-05, + "loss": 3.5094, + "step": 39500 + }, + { + "combined_loss": 2.5586395263671875, + "distill_loss": 1.3177176713943481, + "epoch": 1.4469924536596088, + "step": 39500, + "student_mlm_loss": 3.7995612621307373 + }, + { + "epoch": 1.45065572569419, + "grad_norm": 11.648473739624023, + "learning_rate": 2.598103054279185e-05, + "loss": 6.3066, + "step": 39600 + }, + { + "combined_loss": 1.8263496160507202, + "distill_loss": 1.2649195194244385, + "epoch": 1.45065572569419, + "step": 39600, + "student_mlm_loss": 2.387779712677002 + }, + { + "epoch": 1.4543189977287714, + "grad_norm": 4.982020378112793, + "learning_rate": 2.591960095338723e-05, + "loss": 3.1475, + "step": 39700 + }, + { + "combined_loss": 4.95673131942749, + "distill_loss": 1.2415388822555542, + "epoch": 1.4543189977287714, + "step": 39700, + "student_mlm_loss": 8.671923637390137 + }, + { + "epoch": 1.4579822697633527, + "grad_norm": 4.551340103149414, + "learning_rate": 2.5858171363982604e-05, + "loss": 6.0043, + "step": 39800 + }, + { + "combined_loss": 2.124246597290039, + "distill_loss": 1.197386384010315, + "epoch": 1.4579822697633527, + "step": 39800, + "student_mlm_loss": 3.0511069297790527 + }, + { + "epoch": 1.461645541797934, + "grad_norm": 41.217533111572266, + "learning_rate": 2.579674177457798e-05, + "loss": 2.7216, + "step": 39900 + }, + { + "combined_loss": 1.8579926490783691, + "distill_loss": 1.1948734521865845, + "epoch": 1.461645541797934, + "step": 39900, + "student_mlm_loss": 2.5211119651794434 + }, + { + "epoch": 1.4653088138325152, + "grad_norm": 3.3428897857666016, + "learning_rate": 2.5735312185173356e-05, + "loss": 3.5888, + "step": 40000 + }, + { + "epoch": 1.4653088138325152, + "eval_loss": 3.433469295501709, + "eval_runtime": 2.0987, + "eval_samples_per_second": 3333.452, + "eval_steps_per_second": 13.341, + "step": 40000 + }, + { + "combined_loss": 3.9790029525756836, + "distill_loss": 1.2571158409118652, + "epoch": 1.4653088138325152, + "step": 40000, + "student_mlm_loss": 6.700890064239502 + }, + { + "epoch": 1.4689720858670965, + "grad_norm": 24.387128829956055, + "learning_rate": 2.5673882595768734e-05, + "loss": 3.3546, + "step": 40100 + }, + { + "combined_loss": 2.113370418548584, + "distill_loss": 1.2904696464538574, + "epoch": 1.4689720858670965, + "step": 40100, + "student_mlm_loss": 2.9362711906433105 + }, + { + "epoch": 1.4726353579016778, + "grad_norm": 11.271422386169434, + "learning_rate": 2.5612453006364108e-05, + "loss": 9.1182, + "step": 40200 + }, + { + "combined_loss": 1.7249795198440552, + "distill_loss": 1.2220125198364258, + "epoch": 1.4726353579016778, + "step": 40200, + "student_mlm_loss": 2.2279465198516846 + }, + { + "epoch": 1.476298629936259, + "grad_norm": 88.92086029052734, + "learning_rate": 2.555102341695948e-05, + "loss": 5.5622, + "step": 40300 + }, + { + "combined_loss": 3.5107364654541016, + "distill_loss": 1.2663298845291138, + "epoch": 1.476298629936259, + "step": 40300, + "student_mlm_loss": 5.755143165588379 + }, + { + "epoch": 1.4799619019708403, + "grad_norm": 4.677048683166504, + "learning_rate": 2.5489593827554857e-05, + "loss": 5.3278, + "step": 40400 + }, + { + "combined_loss": 3.5298116207122803, + "distill_loss": 1.1846145391464233, + "epoch": 1.4799619019708403, + "step": 40400, + "student_mlm_loss": 5.875008583068848 + }, + { + "epoch": 1.4836251740054216, + "grad_norm": 21.207704544067383, + "learning_rate": 2.542816423815023e-05, + "loss": 2.9588, + "step": 40500 + }, + { + "combined_loss": 2.6109657287597656, + "distill_loss": 1.2608091831207275, + "epoch": 1.4836251740054216, + "step": 40500, + "student_mlm_loss": 3.9611220359802246 + }, + { + "epoch": 1.4872884460400029, + "grad_norm": 7.7415876388549805, + "learning_rate": 2.536673464874561e-05, + "loss": 2.706, + "step": 40600 + }, + { + "combined_loss": 2.455023765563965, + "distill_loss": 1.3175585269927979, + "epoch": 1.4872884460400029, + "step": 40600, + "student_mlm_loss": 3.5924887657165527 + }, + { + "epoch": 1.4909517180745842, + "grad_norm": 19.366378784179688, + "learning_rate": 2.5305305059340984e-05, + "loss": 2.7981, + "step": 40700 + }, + { + "combined_loss": 3.624007225036621, + "distill_loss": 1.1402699947357178, + "epoch": 1.4909517180745842, + "step": 40700, + "student_mlm_loss": 6.1077446937561035 + }, + { + "epoch": 1.4946149901091654, + "grad_norm": 7.310671806335449, + "learning_rate": 2.524387546993636e-05, + "loss": 29.272, + "step": 40800 + }, + { + "combined_loss": 2.2329726219177246, + "distill_loss": 1.303555965423584, + "epoch": 1.4946149901091654, + "step": 40800, + "student_mlm_loss": 3.1623895168304443 + }, + { + "epoch": 1.4982782621437467, + "grad_norm": 48.7297477722168, + "learning_rate": 2.5182445880531736e-05, + "loss": 3.1319, + "step": 40900 + }, + { + "combined_loss": 1.8255285024642944, + "distill_loss": 1.1643202304840088, + "epoch": 1.4982782621437467, + "step": 40900, + "student_mlm_loss": 2.48673677444458 + }, + { + "epoch": 1.5019415341783282, + "grad_norm": 32.60409927368164, + "learning_rate": 2.5121016291127114e-05, + "loss": 8.524, + "step": 41000 + }, + { + "combined_loss": 2.896923542022705, + "distill_loss": 1.3571655750274658, + "epoch": 1.5019415341783282, + "step": 41000, + "student_mlm_loss": 4.436681747436523 + }, + { + "epoch": 1.5056048062129093, + "grad_norm": 4.127974510192871, + "learning_rate": 2.5059586701722488e-05, + "loss": 6.3087, + "step": 41100 + }, + { + "combined_loss": 2.145819664001465, + "distill_loss": 1.2983198165893555, + "epoch": 1.5056048062129093, + "step": 41100, + "student_mlm_loss": 2.993319511413574 + }, + { + "epoch": 1.5092680782474908, + "grad_norm": 3.873206853866577, + "learning_rate": 2.4998157112317863e-05, + "loss": 5.279, + "step": 41200 + }, + { + "combined_loss": 4.8266730308532715, + "distill_loss": 1.1676665544509888, + "epoch": 1.5092680782474908, + "step": 41200, + "student_mlm_loss": 8.485679626464844 + }, + { + "epoch": 1.5129313502820718, + "grad_norm": 6.902312755584717, + "learning_rate": 2.493672752291324e-05, + "loss": 5.3583, + "step": 41300 + }, + { + "combined_loss": 1.7068848609924316, + "distill_loss": 1.1335561275482178, + "epoch": 1.5129313502820718, + "step": 41300, + "student_mlm_loss": 2.2802135944366455 + }, + { + "epoch": 1.5165946223166533, + "grad_norm": 17.415306091308594, + "learning_rate": 2.487529793350861e-05, + "loss": 2.8319, + "step": 41400 + }, + { + "combined_loss": 1.5696630477905273, + "distill_loss": 1.152633786201477, + "epoch": 1.5165946223166533, + "step": 41400, + "student_mlm_loss": 1.9866924285888672 + }, + { + "epoch": 1.5202578943512344, + "grad_norm": 11.67779541015625, + "learning_rate": 2.481386834410399e-05, + "loss": 3.0117, + "step": 41500 + }, + { + "combined_loss": 1.9209272861480713, + "distill_loss": 1.2611881494522095, + "epoch": 1.5202578943512344, + "step": 41500, + "student_mlm_loss": 2.5806663036346436 + }, + { + "epoch": 1.5239211663858159, + "grad_norm": 9.814743041992188, + "learning_rate": 2.4752438754699364e-05, + "loss": 2.8479, + "step": 41600 + }, + { + "combined_loss": 4.1822404861450195, + "distill_loss": 1.254117488861084, + "epoch": 1.5239211663858159, + "step": 41600, + "student_mlm_loss": 7.110363960266113 + }, + { + "epoch": 1.5275844384203972, + "grad_norm": 11.7344970703125, + "learning_rate": 2.4691009165294742e-05, + "loss": 3.2502, + "step": 41700 + }, + { + "combined_loss": 1.7558622360229492, + "distill_loss": 1.1821727752685547, + "epoch": 1.5275844384203972, + "step": 41700, + "student_mlm_loss": 2.3295516967773438 + }, + { + "epoch": 1.5312477104549784, + "grad_norm": 8.426025390625, + "learning_rate": 2.4629579575890116e-05, + "loss": 3.3169, + "step": 41800 + }, + { + "combined_loss": 1.843000054359436, + "distill_loss": 1.1456735134124756, + "epoch": 1.5312477104549784, + "step": 41800, + "student_mlm_loss": 2.5403265953063965 + }, + { + "epoch": 1.5349109824895597, + "grad_norm": 3.654872417449951, + "learning_rate": 2.456814998648549e-05, + "loss": 2.6259, + "step": 41900 + }, + { + "combined_loss": 1.7651002407073975, + "distill_loss": 1.1741529703140259, + "epoch": 1.5349109824895597, + "step": 41900, + "student_mlm_loss": 2.3560476303100586 + }, + { + "epoch": 1.538574254524141, + "grad_norm": 18.605615615844727, + "learning_rate": 2.450672039708087e-05, + "loss": 2.4854, + "step": 42000 + }, + { + "epoch": 1.538574254524141, + "eval_loss": 3.4032058715820312, + "eval_runtime": 1.8747, + "eval_samples_per_second": 3731.788, + "eval_steps_per_second": 14.936, + "step": 42000 + }, + { + "combined_loss": 2.60400390625, + "distill_loss": 1.2034615278244019, + "epoch": 1.538574254524141, + "step": 42000, + "student_mlm_loss": 4.004546165466309 + }, + { + "epoch": 1.5422375265587223, + "grad_norm": 6.775146484375, + "learning_rate": 2.4445290807676243e-05, + "loss": 2.8405, + "step": 42100 + }, + { + "combined_loss": 1.7485601902008057, + "distill_loss": 1.1682909727096558, + "epoch": 1.5422375265587223, + "step": 42100, + "student_mlm_loss": 2.328829288482666 + }, + { + "epoch": 1.5459007985933035, + "grad_norm": 24.79000473022461, + "learning_rate": 2.4383861218271617e-05, + "loss": 2.9811, + "step": 42200 + }, + { + "combined_loss": 2.2294323444366455, + "distill_loss": 1.262848138809204, + "epoch": 1.5459007985933035, + "step": 42200, + "student_mlm_loss": 3.196016550064087 + }, + { + "epoch": 1.5495640706278848, + "grad_norm": 11.027627944946289, + "learning_rate": 2.4322431628866992e-05, + "loss": 3.7109, + "step": 42300 + }, + { + "combined_loss": 1.8129802942276, + "distill_loss": 1.205324411392212, + "epoch": 1.5495640706278848, + "step": 42300, + "student_mlm_loss": 2.4206361770629883 + }, + { + "epoch": 1.553227342662466, + "grad_norm": 6.328401565551758, + "learning_rate": 2.426100203946237e-05, + "loss": 31.168, + "step": 42400 + }, + { + "combined_loss": 2.391860246658325, + "distill_loss": 1.1356655359268188, + "epoch": 1.553227342662466, + "step": 42400, + "student_mlm_loss": 3.648054838180542 + }, + { + "epoch": 1.5568906146970474, + "grad_norm": 26.61184310913086, + "learning_rate": 2.4199572450057744e-05, + "loss": 6.4259, + "step": 42500 + }, + { + "combined_loss": 3.222200870513916, + "distill_loss": 1.3243845701217651, + "epoch": 1.5568906146970474, + "step": 42500, + "student_mlm_loss": 5.120017051696777 + }, + { + "epoch": 1.5605538867316286, + "grad_norm": 78.89910888671875, + "learning_rate": 2.4138142860653122e-05, + "loss": 3.3441, + "step": 42600 + }, + { + "combined_loss": 1.7442145347595215, + "distill_loss": 1.282542109489441, + "epoch": 1.5605538867316286, + "step": 42600, + "student_mlm_loss": 2.2058870792388916 + }, + { + "epoch": 1.56421715876621, + "grad_norm": 88.92566680908203, + "learning_rate": 2.4076713271248496e-05, + "loss": 2.8234, + "step": 42700 + }, + { + "combined_loss": 2.366835117340088, + "distill_loss": 1.1711124181747437, + "epoch": 1.56421715876621, + "step": 42700, + "student_mlm_loss": 3.5625579357147217 + }, + { + "epoch": 1.5678804308007912, + "grad_norm": 6.83758544921875, + "learning_rate": 2.4015283681843874e-05, + "loss": 5.4491, + "step": 42800 + }, + { + "combined_loss": 4.174956798553467, + "distill_loss": 1.0669249296188354, + "epoch": 1.5678804308007912, + "step": 42800, + "student_mlm_loss": 7.282988548278809 + }, + { + "epoch": 1.5715437028353727, + "grad_norm": 5.723924160003662, + "learning_rate": 2.395385409243925e-05, + "loss": 3.1108, + "step": 42900 + }, + { + "combined_loss": 2.3197238445281982, + "distill_loss": 1.2763570547103882, + "epoch": 1.5715437028353727, + "step": 42900, + "student_mlm_loss": 3.3630905151367188 + }, + { + "epoch": 1.5752069748699538, + "grad_norm": 14.807353973388672, + "learning_rate": 2.3892424503034623e-05, + "loss": 6.4113, + "step": 43000 + }, + { + "combined_loss": 1.7868092060089111, + "distill_loss": 1.1304634809494019, + "epoch": 1.5752069748699538, + "step": 43000, + "student_mlm_loss": 2.44315505027771 + }, + { + "epoch": 1.5788702469045353, + "grad_norm": 8.68276596069336, + "learning_rate": 2.3830994913629998e-05, + "loss": 5.1213, + "step": 43100 + }, + { + "combined_loss": 19.46100425720215, + "distill_loss": 1.259545087814331, + "epoch": 1.5788702469045353, + "step": 43100, + "student_mlm_loss": 37.6624641418457 + }, + { + "epoch": 1.5825335189391163, + "grad_norm": 4.91242790222168, + "learning_rate": 2.3769565324225372e-05, + "loss": 3.2674, + "step": 43200 + }, + { + "combined_loss": 1.797656536102295, + "distill_loss": 1.3039189577102661, + "epoch": 1.5825335189391163, + "step": 43200, + "student_mlm_loss": 2.2913942337036133 + }, + { + "epoch": 1.5861967909736978, + "grad_norm": 52.68294906616211, + "learning_rate": 2.370813573482075e-05, + "loss": 3.7711, + "step": 43300 + }, + { + "combined_loss": 1.8017528057098389, + "distill_loss": 1.1734706163406372, + "epoch": 1.5861967909736978, + "step": 43300, + "student_mlm_loss": 2.43003511428833 + }, + { + "epoch": 1.5898600630082789, + "grad_norm": 11.869544982910156, + "learning_rate": 2.3646706145416124e-05, + "loss": 9.8177, + "step": 43400 + }, + { + "combined_loss": 2.760119915008545, + "distill_loss": 1.2446471452713013, + "epoch": 1.5898600630082789, + "step": 43400, + "student_mlm_loss": 4.275592803955078 + }, + { + "epoch": 1.5935233350428604, + "grad_norm": 3.7819387912750244, + "learning_rate": 2.3585276556011502e-05, + "loss": 4.6552, + "step": 43500 + }, + { + "combined_loss": 4.660012245178223, + "distill_loss": 1.1187530755996704, + "epoch": 1.5935233350428604, + "step": 43500, + "student_mlm_loss": 8.201271057128906 + }, + { + "epoch": 1.5971866070774414, + "grad_norm": 21.269559860229492, + "learning_rate": 2.3523846966606877e-05, + "loss": 8.5404, + "step": 43600 + }, + { + "combined_loss": 2.3045759201049805, + "distill_loss": 1.3545589447021484, + "epoch": 1.5971866070774414, + "step": 43600, + "student_mlm_loss": 3.2545931339263916 + }, + { + "epoch": 1.600849879112023, + "grad_norm": 8.289508819580078, + "learning_rate": 2.3462417377202254e-05, + "loss": 2.7135, + "step": 43700 + }, + { + "combined_loss": 3.0867691040039062, + "distill_loss": 1.1124651432037354, + "epoch": 1.600849879112023, + "step": 43700, + "student_mlm_loss": 5.061073303222656 + }, + { + "epoch": 1.6045131511466042, + "grad_norm": 22.303661346435547, + "learning_rate": 2.3400987787797625e-05, + "loss": 3.6364, + "step": 43800 + }, + { + "combined_loss": 1.7930564880371094, + "distill_loss": 1.2114512920379639, + "epoch": 1.6045131511466042, + "step": 43800, + "student_mlm_loss": 2.374661684036255 + }, + { + "epoch": 1.6081764231811855, + "grad_norm": 4.351790904998779, + "learning_rate": 2.3339558198393003e-05, + "loss": 5.6887, + "step": 43900 + }, + { + "combined_loss": 1.7365663051605225, + "distill_loss": 1.2089755535125732, + "epoch": 1.6081764231811855, + "step": 43900, + "student_mlm_loss": 2.2641570568084717 + }, + { + "epoch": 1.6118396952157668, + "grad_norm": 13.450850486755371, + "learning_rate": 2.3278128608988378e-05, + "loss": 3.6702, + "step": 44000 + }, + { + "epoch": 1.6118396952157668, + "eval_loss": 3.194415330886841, + "eval_runtime": 1.9274, + "eval_samples_per_second": 3629.828, + "eval_steps_per_second": 14.528, + "step": 44000 + }, + { + "combined_loss": 1.760496735572815, + "distill_loss": 1.1514201164245605, + "epoch": 1.6118396952157668, + "step": 44000, + "student_mlm_loss": 2.3695733547210693 + }, + { + "epoch": 1.615502967250348, + "grad_norm": 7.381774425506592, + "learning_rate": 2.3216699019583756e-05, + "loss": 2.9269, + "step": 44100 + }, + { + "combined_loss": 4.663776397705078, + "distill_loss": 1.307958722114563, + "epoch": 1.615502967250348, + "step": 44100, + "student_mlm_loss": 8.019594192504883 + }, + { + "epoch": 1.6191662392849293, + "grad_norm": 10.999051094055176, + "learning_rate": 2.315526943017913e-05, + "loss": 3.0334, + "step": 44200 + }, + { + "combined_loss": 1.9191560745239258, + "distill_loss": 1.3481658697128296, + "epoch": 1.6191662392849293, + "step": 44200, + "student_mlm_loss": 2.4901461601257324 + }, + { + "epoch": 1.6228295113195106, + "grad_norm": 6.187446594238281, + "learning_rate": 2.3093839840774504e-05, + "loss": 30.6923, + "step": 44300 + }, + { + "combined_loss": 12.122703552246094, + "distill_loss": 1.1659897565841675, + "epoch": 1.6228295113195106, + "step": 44300, + "student_mlm_loss": 23.079418182373047 + }, + { + "epoch": 1.6264927833540919, + "grad_norm": 6.142828941345215, + "learning_rate": 2.3032410251369882e-05, + "loss": 7.4162, + "step": 44400 + }, + { + "combined_loss": 1.9456160068511963, + "distill_loss": 1.257858157157898, + "epoch": 1.6264927833540919, + "step": 44400, + "student_mlm_loss": 2.633373737335205 + }, + { + "epoch": 1.6301560553886731, + "grad_norm": 15.393942832946777, + "learning_rate": 2.2970980661965257e-05, + "loss": 4.8003, + "step": 44500 + }, + { + "combined_loss": 2.7578635215759277, + "distill_loss": 1.1640808582305908, + "epoch": 1.6301560553886731, + "step": 44500, + "student_mlm_loss": 4.351646423339844 + }, + { + "epoch": 1.6338193274232544, + "grad_norm": 18.73512077331543, + "learning_rate": 2.290955107256063e-05, + "loss": 5.3592, + "step": 44600 + }, + { + "combined_loss": 3.758654832839966, + "distill_loss": 1.260606288909912, + "epoch": 1.6338193274232544, + "step": 44600, + "student_mlm_loss": 6.2567033767700195 + }, + { + "epoch": 1.6374825994578357, + "grad_norm": 6.1570048332214355, + "learning_rate": 2.2848121483156006e-05, + "loss": 10.8594, + "step": 44700 + }, + { + "combined_loss": 3.205047845840454, + "distill_loss": 1.1495074033737183, + "epoch": 1.6374825994578357, + "step": 44700, + "student_mlm_loss": 5.2605881690979 + }, + { + "epoch": 1.641145871492417, + "grad_norm": 8.748614311218262, + "learning_rate": 2.2786691893751383e-05, + "loss": 2.611, + "step": 44800 + }, + { + "combined_loss": 2.7548794746398926, + "distill_loss": 1.153849482536316, + "epoch": 1.641145871492417, + "step": 44800, + "student_mlm_loss": 4.35590934753418 + }, + { + "epoch": 1.6448091435269983, + "grad_norm": 9.594339370727539, + "learning_rate": 2.2725262304346758e-05, + "loss": 3.621, + "step": 44900 + }, + { + "combined_loss": 2.63676381111145, + "distill_loss": 1.144437313079834, + "epoch": 1.6448091435269983, + "step": 44900, + "student_mlm_loss": 4.129090309143066 + }, + { + "epoch": 1.6484724155615798, + "grad_norm": 8.756010055541992, + "learning_rate": 2.2663832714942136e-05, + "loss": 5.0762, + "step": 45000 + }, + { + "combined_loss": 2.0047507286071777, + "distill_loss": 1.203262209892273, + "epoch": 1.6484724155615798, + "step": 45000, + "student_mlm_loss": 2.806239366531372 + }, + { + "epoch": 1.6521356875961608, + "grad_norm": 16.163911819458008, + "learning_rate": 2.260240312553751e-05, + "loss": 3.1675, + "step": 45100 + }, + { + "combined_loss": 1.822305679321289, + "distill_loss": 1.187317967414856, + "epoch": 1.6521356875961608, + "step": 45100, + "student_mlm_loss": 2.4572935104370117 + }, + { + "epoch": 1.6557989596307423, + "grad_norm": 4.047428607940674, + "learning_rate": 2.2540973536132888e-05, + "loss": 2.6406, + "step": 45200 + }, + { + "combined_loss": 2.431349039077759, + "distill_loss": 1.2643455266952515, + "epoch": 1.6557989596307423, + "step": 45200, + "student_mlm_loss": 3.5983526706695557 + }, + { + "epoch": 1.6594622316653234, + "grad_norm": 28.598485946655273, + "learning_rate": 2.247954394672826e-05, + "loss": 3.7667, + "step": 45300 + }, + { + "combined_loss": 2.274944543838501, + "distill_loss": 1.266087293624878, + "epoch": 1.6594622316653234, + "step": 45300, + "student_mlm_loss": 3.283801794052124 + }, + { + "epoch": 1.6631255036999049, + "grad_norm": 11.642946243286133, + "learning_rate": 2.2418114357323637e-05, + "loss": 3.0131, + "step": 45400 + }, + { + "combined_loss": 2.064805507659912, + "distill_loss": 1.2423893213272095, + "epoch": 1.6631255036999049, + "step": 45400, + "student_mlm_loss": 2.8872218132019043 + }, + { + "epoch": 1.666788775734486, + "grad_norm": 7.227854251861572, + "learning_rate": 2.235668476791901e-05, + "loss": 7.556, + "step": 45500 + }, + { + "combined_loss": 1.8626993894577026, + "distill_loss": 1.153686761856079, + "epoch": 1.666788775734486, + "step": 45500, + "student_mlm_loss": 2.571712017059326 + }, + { + "epoch": 1.6704520477690674, + "grad_norm": 11.972105026245117, + "learning_rate": 2.229525517851439e-05, + "loss": 3.9606, + "step": 45600 + }, + { + "combined_loss": 1.7529842853546143, + "distill_loss": 1.2637630701065063, + "epoch": 1.6704520477690674, + "step": 45600, + "student_mlm_loss": 2.2422056198120117 + }, + { + "epoch": 1.6741153198036485, + "grad_norm": 4.263253211975098, + "learning_rate": 2.2233825589109764e-05, + "loss": 3.0922, + "step": 45700 + }, + { + "combined_loss": 2.6089985370635986, + "distill_loss": 1.2136098146438599, + "epoch": 1.6741153198036485, + "step": 45700, + "student_mlm_loss": 4.004387378692627 + }, + { + "epoch": 1.67777859183823, + "grad_norm": 24.4074764251709, + "learning_rate": 2.2172395999705138e-05, + "loss": 3.2329, + "step": 45800 + }, + { + "combined_loss": 1.6919562816619873, + "distill_loss": 1.139168381690979, + "epoch": 1.67777859183823, + "step": 45800, + "student_mlm_loss": 2.244744300842285 + }, + { + "epoch": 1.6814418638728112, + "grad_norm": 5.1518778800964355, + "learning_rate": 2.2110966410300516e-05, + "loss": 9.4019, + "step": 45900 + }, + { + "combined_loss": 2.1822292804718018, + "distill_loss": 1.3423482179641724, + "epoch": 1.6814418638728112, + "step": 45900, + "student_mlm_loss": 3.0221104621887207 + }, + { + "epoch": 1.6851051359073925, + "grad_norm": 18.045368194580078, + "learning_rate": 2.204953682089589e-05, + "loss": 3.3662, + "step": 46000 + }, + { + "epoch": 1.6851051359073925, + "eval_loss": 3.070533275604248, + "eval_runtime": 1.9768, + "eval_samples_per_second": 3539.063, + "eval_steps_per_second": 14.164, + "step": 46000 + }, + { + "combined_loss": 1.8376495838165283, + "distill_loss": 1.261283278465271, + "epoch": 1.6851051359073925, + "step": 46000, + "student_mlm_loss": 2.414015769958496 + }, + { + "epoch": 1.6887684079419738, + "grad_norm": 5.69982385635376, + "learning_rate": 2.1988107231491265e-05, + "loss": 3.3451, + "step": 46100 + }, + { + "combined_loss": 1.7916219234466553, + "distill_loss": 1.2525031566619873, + "epoch": 1.6887684079419738, + "step": 46100, + "student_mlm_loss": 2.3307406902313232 + }, + { + "epoch": 1.692431679976555, + "grad_norm": 27.134151458740234, + "learning_rate": 2.192667764208664e-05, + "loss": 9.1006, + "step": 46200 + }, + { + "combined_loss": 59.0687141418457, + "distill_loss": 1.1848413944244385, + "epoch": 1.692431679976555, + "step": 46200, + "student_mlm_loss": 116.95258331298828 + }, + { + "epoch": 1.6960949520111364, + "grad_norm": 6.624229431152344, + "learning_rate": 2.1865248052682017e-05, + "loss": 3.0016, + "step": 46300 + }, + { + "combined_loss": 2.7997608184814453, + "distill_loss": 1.1524275541305542, + "epoch": 1.6960949520111364, + "step": 46300, + "student_mlm_loss": 4.447093963623047 + }, + { + "epoch": 1.6997582240457176, + "grad_norm": 5.472049236297607, + "learning_rate": 2.180381846327739e-05, + "loss": 20.0915, + "step": 46400 + }, + { + "combined_loss": 1.7153997421264648, + "distill_loss": 1.237658143043518, + "epoch": 1.6997582240457176, + "step": 46400, + "student_mlm_loss": 2.193141460418701 + }, + { + "epoch": 1.703421496080299, + "grad_norm": 14.290247917175293, + "learning_rate": 2.174238887387277e-05, + "loss": 4.5936, + "step": 46500 + }, + { + "combined_loss": 1.709627628326416, + "distill_loss": 1.2791212797164917, + "epoch": 1.703421496080299, + "step": 46500, + "student_mlm_loss": 2.140133857727051 + }, + { + "epoch": 1.7070847681148802, + "grad_norm": 17.962997436523438, + "learning_rate": 2.1680959284468144e-05, + "loss": 3.3627, + "step": 46600 + }, + { + "combined_loss": 7.8201751708984375, + "distill_loss": 1.3012824058532715, + "epoch": 1.7070847681148802, + "step": 46600, + "student_mlm_loss": 14.339067459106445 + }, + { + "epoch": 1.7107480401494615, + "grad_norm": 6.800339698791504, + "learning_rate": 2.161952969506352e-05, + "loss": 6.7955, + "step": 46700 + }, + { + "combined_loss": 1.809753656387329, + "distill_loss": 1.2891262769699097, + "epoch": 1.7107480401494615, + "step": 46700, + "student_mlm_loss": 2.330381155014038 + }, + { + "epoch": 1.7144113121840427, + "grad_norm": 12.281099319458008, + "learning_rate": 2.1558100105658896e-05, + "loss": 10.3436, + "step": 46800 + }, + { + "combined_loss": 3.3808600902557373, + "distill_loss": 1.2777303457260132, + "epoch": 1.7144113121840427, + "step": 46800, + "student_mlm_loss": 5.483989715576172 + }, + { + "epoch": 1.718074584218624, + "grad_norm": 3.3210408687591553, + "learning_rate": 2.149667051625427e-05, + "loss": 2.8055, + "step": 46900 + }, + { + "combined_loss": 2.1092348098754883, + "distill_loss": 1.2058593034744263, + "epoch": 1.718074584218624, + "step": 46900, + "student_mlm_loss": 3.0126101970672607 + }, + { + "epoch": 1.7217378562532053, + "grad_norm": 11.694738388061523, + "learning_rate": 2.1435240926849645e-05, + "loss": 4.6311, + "step": 47000 + }, + { + "combined_loss": 2.2222890853881836, + "distill_loss": 1.218597173690796, + "epoch": 1.7217378562532053, + "step": 47000, + "student_mlm_loss": 3.2259812355041504 + }, + { + "epoch": 1.7254011282877868, + "grad_norm": 23.036334991455078, + "learning_rate": 2.137381133744502e-05, + "loss": 2.5923, + "step": 47100 + }, + { + "combined_loss": 1.882810354232788, + "distill_loss": 1.2441027164459229, + "epoch": 1.7254011282877868, + "step": 47100, + "student_mlm_loss": 2.5215179920196533 + }, + { + "epoch": 1.7290644003223679, + "grad_norm": 65.06354522705078, + "learning_rate": 2.1312381748040397e-05, + "loss": 3.3375, + "step": 47200 + }, + { + "combined_loss": 1.84983229637146, + "distill_loss": 1.224557876586914, + "epoch": 1.7290644003223679, + "step": 47200, + "student_mlm_loss": 2.475106716156006 + }, + { + "epoch": 1.7327276723569494, + "grad_norm": 9.202945709228516, + "learning_rate": 2.1250952158635772e-05, + "loss": 3.0094, + "step": 47300 + }, + { + "combined_loss": 1.6417255401611328, + "distill_loss": 1.2296794652938843, + "epoch": 1.7327276723569494, + "step": 47300, + "student_mlm_loss": 2.053771734237671 + }, + { + "epoch": 1.7363909443915304, + "grad_norm": 7.1568193435668945, + "learning_rate": 2.118952256923115e-05, + "loss": 3.3413, + "step": 47400 + }, + { + "combined_loss": 2.165384531021118, + "distill_loss": 1.2572156190872192, + "epoch": 1.7363909443915304, + "step": 47400, + "student_mlm_loss": 3.0735535621643066 + }, + { + "epoch": 1.740054216426112, + "grad_norm": 39.054439544677734, + "learning_rate": 2.1128092979826524e-05, + "loss": 4.8522, + "step": 47500 + }, + { + "combined_loss": 2.6122236251831055, + "distill_loss": 1.1487023830413818, + "epoch": 1.740054216426112, + "step": 47500, + "student_mlm_loss": 4.07574462890625 + }, + { + "epoch": 1.743717488460693, + "grad_norm": 3.18758487701416, + "learning_rate": 2.1066663390421902e-05, + "loss": 4.3993, + "step": 47600 + }, + { + "combined_loss": 6.344114303588867, + "distill_loss": 1.1341725587844849, + "epoch": 1.743717488460693, + "step": 47600, + "student_mlm_loss": 11.554056167602539 + }, + { + "epoch": 1.7473807604952745, + "grad_norm": 9.418896675109863, + "learning_rate": 2.1005233801017273e-05, + "loss": 8.7279, + "step": 47700 + }, + { + "combined_loss": 2.8721518516540527, + "distill_loss": 1.2175838947296143, + "epoch": 1.7473807604952745, + "step": 47700, + "student_mlm_loss": 4.526719570159912 + }, + { + "epoch": 1.7510440325298555, + "grad_norm": 4.730939865112305, + "learning_rate": 2.094380421161265e-05, + "loss": 2.74, + "step": 47800 + }, + { + "combined_loss": 1.8483730554580688, + "distill_loss": 1.2789607048034668, + "epoch": 1.7510440325298555, + "step": 47800, + "student_mlm_loss": 2.417785406112671 + }, + { + "epoch": 1.754707304564437, + "grad_norm": 4.566458225250244, + "learning_rate": 2.0882374622208025e-05, + "loss": 2.63, + "step": 47900 + }, + { + "combined_loss": 1.8073049783706665, + "distill_loss": 1.3073413372039795, + "epoch": 1.754707304564437, + "step": 47900, + "student_mlm_loss": 2.3072686195373535 + }, + { + "epoch": 1.7583705765990183, + "grad_norm": 14.967068672180176, + "learning_rate": 2.0820945032803403e-05, + "loss": 2.5821, + "step": 48000 + }, + { + "epoch": 1.7583705765990183, + "eval_loss": 3.2400870323181152, + "eval_runtime": 1.8322, + "eval_samples_per_second": 3818.29, + "eval_steps_per_second": 15.282, + "step": 48000 + } + ], + "logging_steps": 100, + "max_steps": 81894, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7150683130961408e+16, + "train_batch_size": 256, + "trial_name": null, + "trial_params": null +}