diff --git "a/checkpoint-4257/trainer_state.json" "b/checkpoint-4257/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4257/trainer_state.json" @@ -0,0 +1,4284 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 28.00657894736842, + "eval_steps": 500, + "global_step": 4257, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.06578947368421052, + "grad_norm": 5.228871822357178, + "learning_rate": 9e-06, + "loss": 3.9986, + "memory/device_mem_reserved(gib)": 66.83, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 10 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 2.1954143047332764, + "learning_rate": 1.9e-05, + "loss": 0.8823, + "memory/device_mem_reserved(gib)": 66.83, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 20 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 2.786663055419922, + "learning_rate": 2.9e-05, + "loss": 0.8943, + "memory/device_mem_reserved(gib)": 66.83, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 30 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 1.4439328908920288, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.6571, + "memory/device_mem_reserved(gib)": 66.83, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 40 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 1.8948026895523071, + "learning_rate": 4.9e-05, + "loss": 0.7597, + "memory/device_mem_reserved(gib)": 66.84, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 50 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.6593940258026123, + "learning_rate": 5.9e-05, + "loss": 0.7014, + "memory/device_mem_reserved(gib)": 66.84, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 60 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 2.095798969268799, + "learning_rate": 6.9e-05, + "loss": 0.7159, + "memory/device_mem_reserved(gib)": 66.84, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 70 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.5991102457046509, + "learning_rate": 7.900000000000001e-05, + "loss": 0.6052, + "memory/device_mem_reserved(gib)": 66.84, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 80 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 1.9454739093780518, + "learning_rate": 8.900000000000001e-05, + "loss": 0.6825, + "memory/device_mem_reserved(gib)": 66.84, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 90 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 2.4410903453826904, + "learning_rate": 9.900000000000001e-05, + "loss": 0.6352, + "memory/device_mem_reserved(gib)": 66.84, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 100 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 3.553440570831299, + "learning_rate": 0.000109, + "loss": 0.6316, + "memory/device_mem_reserved(gib)": 66.84, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 110 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 3.3301632404327393, + "learning_rate": 0.000119, + "loss": 0.6096, + "memory/device_mem_reserved(gib)": 66.84, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 120 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 3.497053384780884, + "learning_rate": 0.00012900000000000002, + "loss": 0.6476, + "memory/device_mem_reserved(gib)": 66.84, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 130 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 2.0716304779052734, + "learning_rate": 0.000139, + "loss": 0.6442, + "memory/device_mem_reserved(gib)": 66.84, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 140 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 3.281266450881958, + "learning_rate": 0.00014900000000000002, + "loss": 0.6593, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 150 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 3.6694233417510986, + "learning_rate": 0.00015900000000000002, + "loss": 0.6109, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 160 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 3.227562665939331, + "learning_rate": 0.00016900000000000002, + "loss": 0.5442, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 170 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 3.710609197616577, + "learning_rate": 0.00017900000000000001, + "loss": 0.5785, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 180 + }, + { + "epoch": 1.25, + "grad_norm": 5.2679877281188965, + "learning_rate": 0.00018899999999999999, + "loss": 0.5875, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 190 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 3.2948834896087646, + "learning_rate": 0.000199, + "loss": 0.5709, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 200 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 3.0435378551483154, + "learning_rate": 0.00019999830003086864, + "loss": 0.612, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 210 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 5.632095813751221, + "learning_rate": 0.00019999242370037052, + "loss": 0.5554, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 220 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 2.9409236907958984, + "learning_rate": 0.0001999823503735144, + "loss": 0.5954, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 230 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 4.222487926483154, + "learning_rate": 0.00019996808065433287, + "loss": 0.6102, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 240 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 3.6207034587860107, + "learning_rate": 0.00019994961539848957, + "loss": 0.6663, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 250 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 6.34915828704834, + "learning_rate": 0.00019992695571322724, + "loss": 0.5956, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 260 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 5.599079608917236, + "learning_rate": 0.00019990010295730176, + "loss": 0.6437, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 270 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 3.9812982082366943, + "learning_rate": 0.00019986905874090063, + "loss": 0.6788, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 280 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 3.97013783454895, + "learning_rate": 0.00019983382492554607, + "loss": 0.5463, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 290 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 4.474316120147705, + "learning_rate": 0.00019979440362398397, + "loss": 0.6562, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 300 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 6.513801574707031, + "learning_rate": 0.00019975079720005665, + "loss": 0.5757, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 310 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 4.47428560256958, + "learning_rate": 0.00019970300826856145, + "loss": 0.4933, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 320 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 3.6297032833099365, + "learning_rate": 0.00019965103969509385, + "loss": 0.5623, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 330 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 2.9947564601898193, + "learning_rate": 0.00019959489459587558, + "loss": 0.5077, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 340 + }, + { + "epoch": 2.3026315789473686, + "grad_norm": 5.7996039390563965, + "learning_rate": 0.00019953457633756786, + "loss": 0.5758, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 350 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 7.004776954650879, + "learning_rate": 0.0001994700885370694, + "loss": 0.5046, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 360 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 2.668243169784546, + "learning_rate": 0.00019940143506129974, + "loss": 0.5202, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 370 + }, + { + "epoch": 2.5, + "grad_norm": 5.5414252281188965, + "learning_rate": 0.00019932862002696702, + "loss": 0.5763, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 380 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 4.762598514556885, + "learning_rate": 0.00019925164780032145, + "loss": 0.5329, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 390 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 5.675711154937744, + "learning_rate": 0.00019917052299689344, + "loss": 0.5617, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 400 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 5.017973899841309, + "learning_rate": 0.00019908525048121666, + "loss": 0.5775, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 410 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 4.787014961242676, + "learning_rate": 0.00019899583536653648, + "loss": 0.5534, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 420 + }, + { + "epoch": 2.8289473684210527, + "grad_norm": 4.4121856689453125, + "learning_rate": 0.0001989022830145034, + "loss": 0.5598, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 430 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 5.0906982421875, + "learning_rate": 0.0001988045990348514, + "loss": 0.5637, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 440 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 5.442751884460449, + "learning_rate": 0.0001987027892850617, + "loss": 0.5529, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 450 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 3.2587528228759766, + "learning_rate": 0.0001985968598700115, + "loss": 0.5539, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 460 + }, + { + "epoch": 3.0921052631578947, + "grad_norm": 5.267582893371582, + "learning_rate": 0.0001984868171416078, + "loss": 0.4498, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 470 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 6.26557731628418, + "learning_rate": 0.0001983726676984067, + "loss": 0.5543, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 480 + }, + { + "epoch": 3.223684210526316, + "grad_norm": 7.339848518371582, + "learning_rate": 0.00019825441838521753, + "loss": 0.5068, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 490 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 7.684560298919678, + "learning_rate": 0.0001981320762926925, + "loss": 0.4447, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 500 + }, + { + "epoch": 3.3552631578947367, + "grad_norm": 5.45398473739624, + "learning_rate": 0.00019800564875690167, + "loss": 0.5575, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 510 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 8.519457817077637, + "learning_rate": 0.0001978751433588927, + "loss": 0.4763, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 520 + }, + { + "epoch": 3.486842105263158, + "grad_norm": 6.082020282745361, + "learning_rate": 0.00019774056792423665, + "loss": 0.4817, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 530 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 5.932907581329346, + "learning_rate": 0.0001976019305225584, + "loss": 0.5368, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 540 + }, + { + "epoch": 3.6184210526315788, + "grad_norm": 4.9030561447143555, + "learning_rate": 0.0001974592394670531, + "loss": 0.5254, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 550 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 8.374964714050293, + "learning_rate": 0.00019731250331398733, + "loss": 0.5555, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 560 + }, + { + "epoch": 3.75, + "grad_norm": 10.392874717712402, + "learning_rate": 0.00019716173086218626, + "loss": 0.5351, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 570 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 5.0130815505981445, + "learning_rate": 0.00019700693115250607, + "loss": 0.5059, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 580 + }, + { + "epoch": 3.8815789473684212, + "grad_norm": 4.398721694946289, + "learning_rate": 0.00019684811346729156, + "loss": 0.5064, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 590 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 5.850337505340576, + "learning_rate": 0.0001966852873298199, + "loss": 0.5199, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 600 + }, + { + "epoch": 4.0131578947368425, + "grad_norm": 5.074413776397705, + "learning_rate": 0.00019651846250372937, + "loss": 0.4722, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 610 + }, + { + "epoch": 4.078947368421052, + "grad_norm": 8.62219524383545, + "learning_rate": 0.00019634764899243389, + "loss": 0.4188, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 620 + }, + { + "epoch": 4.144736842105263, + "grad_norm": 5.095092296600342, + "learning_rate": 0.00019617285703852323, + "loss": 0.4477, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 630 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 4.571897506713867, + "learning_rate": 0.0001959940971231489, + "loss": 0.4309, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 640 + }, + { + "epoch": 4.276315789473684, + "grad_norm": 6.1334075927734375, + "learning_rate": 0.00019581137996539552, + "loss": 0.4363, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 650 + }, + { + "epoch": 4.342105263157895, + "grad_norm": 5.199427127838135, + "learning_rate": 0.00019562471652163815, + "loss": 0.4562, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 660 + }, + { + "epoch": 4.407894736842105, + "grad_norm": 5.7770676612854, + "learning_rate": 0.00019543411798488532, + "loss": 0.4621, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 670 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 5.296002388000488, + "learning_rate": 0.00019523959578410772, + "loss": 0.4868, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 680 + }, + { + "epoch": 4.5394736842105265, + "grad_norm": 7.707890510559082, + "learning_rate": 0.00019504116158355314, + "loss": 0.4507, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 690 + }, + { + "epoch": 4.605263157894737, + "grad_norm": 5.631321907043457, + "learning_rate": 0.00019483882728204682, + "loss": 0.5316, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 700 + }, + { + "epoch": 4.671052631578947, + "grad_norm": 6.96210241317749, + "learning_rate": 0.000194632605012278, + "loss": 0.4924, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 710 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 6.5153937339782715, + "learning_rate": 0.00019442250714007234, + "loss": 0.4652, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 720 + }, + { + "epoch": 4.802631578947368, + "grad_norm": 7.671054363250732, + "learning_rate": 0.00019420854626365065, + "loss": 0.5399, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 730 + }, + { + "epoch": 4.868421052631579, + "grad_norm": 6.234044075012207, + "learning_rate": 0.00019399073521287333, + "loss": 0.5311, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 740 + }, + { + "epoch": 4.934210526315789, + "grad_norm": 4.996755599975586, + "learning_rate": 0.00019376908704847083, + "loss": 0.5329, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 750 + }, + { + "epoch": 5.0, + "grad_norm": 9.81963062286377, + "learning_rate": 0.0001935436150612608, + "loss": 0.4912, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 760 + }, + { + "epoch": 5.065789473684211, + "grad_norm": 5.93031644821167, + "learning_rate": 0.00019331433277135096, + "loss": 0.4343, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 770 + }, + { + "epoch": 5.131578947368421, + "grad_norm": 7.9363837242126465, + "learning_rate": 0.00019308125392732847, + "loss": 0.4316, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 780 + }, + { + "epoch": 5.197368421052632, + "grad_norm": 5.575252532958984, + "learning_rate": 0.00019284439250543542, + "loss": 0.4041, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 790 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 6.127432346343994, + "learning_rate": 0.00019260376270873087, + "loss": 0.4205, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 800 + }, + { + "epoch": 5.328947368421053, + "grad_norm": 8.333614349365234, + "learning_rate": 0.00019235937896623905, + "loss": 0.4604, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 810 + }, + { + "epoch": 5.394736842105263, + "grad_norm": 6.921279430389404, + "learning_rate": 0.0001921112559320843, + "loss": 0.416, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 820 + }, + { + "epoch": 5.4605263157894735, + "grad_norm": 8.031194686889648, + "learning_rate": 0.00019185940848461225, + "loss": 0.4593, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 830 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 5.994675636291504, + "learning_rate": 0.00019160385172549757, + "loss": 0.5126, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 840 + }, + { + "epoch": 5.592105263157895, + "grad_norm": 8.621824264526367, + "learning_rate": 0.0001913446009788388, + "loss": 0.4761, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 850 + }, + { + "epoch": 5.657894736842105, + "grad_norm": 7.873433589935303, + "learning_rate": 0.00019108167179023893, + "loss": 0.4399, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 860 + }, + { + "epoch": 5.723684210526316, + "grad_norm": 7.9271650314331055, + "learning_rate": 0.0001908150799258737, + "loss": 0.4469, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 870 + }, + { + "epoch": 5.7894736842105265, + "grad_norm": 7.980473041534424, + "learning_rate": 0.0001905448413715459, + "loss": 0.4781, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 880 + }, + { + "epoch": 5.855263157894737, + "grad_norm": 5.033918380737305, + "learning_rate": 0.00019027097233172694, + "loss": 0.5237, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 890 + }, + { + "epoch": 5.921052631578947, + "grad_norm": 5.501688003540039, + "learning_rate": 0.00018999348922858514, + "loss": 0.4594, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 900 + }, + { + "epoch": 5.9868421052631575, + "grad_norm": 5.770474910736084, + "learning_rate": 0.00018971240870100094, + "loss": 0.4845, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 910 + }, + { + "epoch": 6.052631578947368, + "grad_norm": 6.346399784088135, + "learning_rate": 0.00018942774760356934, + "loss": 0.4128, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 920 + }, + { + "epoch": 6.118421052631579, + "grad_norm": 6.847306728363037, + "learning_rate": 0.00018913952300558907, + "loss": 0.3971, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 930 + }, + { + "epoch": 6.184210526315789, + "grad_norm": 4.9199347496032715, + "learning_rate": 0.00018884775219003909, + "loss": 0.3764, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 940 + }, + { + "epoch": 6.25, + "grad_norm": 8.841605186462402, + "learning_rate": 0.00018855245265254227, + "loss": 0.4349, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 950 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 7.917416095733643, + "learning_rate": 0.00018825364210031635, + "loss": 0.4043, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 960 + }, + { + "epoch": 6.381578947368421, + "grad_norm": 9.34174919128418, + "learning_rate": 0.00018795133845111206, + "loss": 0.3961, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 970 + }, + { + "epoch": 6.447368421052632, + "grad_norm": 7.267455101013184, + "learning_rate": 0.00018764555983213864, + "loss": 0.436, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 980 + }, + { + "epoch": 6.5131578947368425, + "grad_norm": 8.763919830322266, + "learning_rate": 0.00018733632457897708, + "loss": 0.4639, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 990 + }, + { + "epoch": 6.578947368421053, + "grad_norm": 6.768899917602539, + "learning_rate": 0.00018702365123448046, + "loss": 0.4811, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1000 + }, + { + "epoch": 6.644736842105263, + "grad_norm": 6.540329456329346, + "learning_rate": 0.0001867075585476622, + "loss": 0.4476, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1010 + }, + { + "epoch": 6.7105263157894735, + "grad_norm": 5.938429355621338, + "learning_rate": 0.00018638806547257168, + "loss": 0.4052, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1020 + }, + { + "epoch": 6.776315789473684, + "grad_norm": 6.720884799957275, + "learning_rate": 0.00018606519116715772, + "loss": 0.4525, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1030 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 8.531013488769531, + "learning_rate": 0.0001857389549921198, + "loss": 0.4441, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1040 + }, + { + "epoch": 6.907894736842105, + "grad_norm": 6.624144077301025, + "learning_rate": 0.00018540937650974728, + "loss": 0.4791, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1050 + }, + { + "epoch": 6.973684210526316, + "grad_norm": 8.459912300109863, + "learning_rate": 0.000185076475482746, + "loss": 0.4243, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1060 + }, + { + "epoch": 7.0394736842105265, + "grad_norm": 8.084807395935059, + "learning_rate": 0.00018474027187305377, + "loss": 0.4047, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1070 + }, + { + "epoch": 7.105263157894737, + "grad_norm": 6.393621921539307, + "learning_rate": 0.0001844007858406428, + "loss": 0.377, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1080 + }, + { + "epoch": 7.171052631578948, + "grad_norm": 9.367650985717773, + "learning_rate": 0.0001840580377423113, + "loss": 0.3671, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1090 + }, + { + "epoch": 7.2368421052631575, + "grad_norm": 7.191049098968506, + "learning_rate": 0.00018371204813046258, + "loss": 0.3986, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1100 + }, + { + "epoch": 7.302631578947368, + "grad_norm": 5.127690315246582, + "learning_rate": 0.00018336283775187266, + "loss": 0.3936, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1110 + }, + { + "epoch": 7.368421052631579, + "grad_norm": 8.498778343200684, + "learning_rate": 0.00018301042754644638, + "loss": 0.3985, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1120 + }, + { + "epoch": 7.434210526315789, + "grad_norm": 7.945418357849121, + "learning_rate": 0.00018265483864596154, + "loss": 0.4012, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1130 + }, + { + "epoch": 7.5, + "grad_norm": 9.021563529968262, + "learning_rate": 0.00018229609237280196, + "loss": 0.4443, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1140 + }, + { + "epoch": 7.565789473684211, + "grad_norm": 14.01905632019043, + "learning_rate": 0.00018193421023867878, + "loss": 0.4273, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1150 + }, + { + "epoch": 7.631578947368421, + "grad_norm": 7.097718238830566, + "learning_rate": 0.0001815692139433406, + "loss": 0.3867, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1160 + }, + { + "epoch": 7.697368421052632, + "grad_norm": 6.583855628967285, + "learning_rate": 0.00018120112537327234, + "loss": 0.4731, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1170 + }, + { + "epoch": 7.7631578947368425, + "grad_norm": 8.54875373840332, + "learning_rate": 0.00018082996660038266, + "loss": 0.378, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1180 + }, + { + "epoch": 7.828947368421053, + "grad_norm": 7.399291515350342, + "learning_rate": 0.00018045575988068072, + "loss": 0.4387, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1190 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 9.295549392700195, + "learning_rate": 0.00018007852765294135, + "loss": 0.4581, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1200 + }, + { + "epoch": 7.9605263157894735, + "grad_norm": 9.612926483154297, + "learning_rate": 0.00017969829253735976, + "loss": 0.4128, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1210 + }, + { + "epoch": 8.026315789473685, + "grad_norm": 7.819753170013428, + "learning_rate": 0.00017931507733419508, + "loss": 0.4026, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1220 + }, + { + "epoch": 8.092105263157896, + "grad_norm": 6.5006632804870605, + "learning_rate": 0.0001789289050224031, + "loss": 0.3644, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1230 + }, + { + "epoch": 8.157894736842104, + "grad_norm": 7.953379154205322, + "learning_rate": 0.00017853979875825848, + "loss": 0.3859, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1240 + }, + { + "epoch": 8.223684210526315, + "grad_norm": 5.973986625671387, + "learning_rate": 0.00017814778187396613, + "loss": 0.4247, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1250 + }, + { + "epoch": 8.289473684210526, + "grad_norm": 8.134764671325684, + "learning_rate": 0.00017775287787626223, + "loss": 0.3536, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1260 + }, + { + "epoch": 8.355263157894736, + "grad_norm": 7.2093963623046875, + "learning_rate": 0.00017735511044500454, + "loss": 0.3697, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1270 + }, + { + "epoch": 8.421052631578947, + "grad_norm": 5.207829475402832, + "learning_rate": 0.0001769545034317526, + "loss": 0.376, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1280 + }, + { + "epoch": 8.486842105263158, + "grad_norm": 6.704354763031006, + "learning_rate": 0.0001765510808583375, + "loss": 0.3702, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1290 + }, + { + "epoch": 8.552631578947368, + "grad_norm": 7.463472366333008, + "learning_rate": 0.00017614486691542122, + "loss": 0.4381, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1300 + }, + { + "epoch": 8.618421052631579, + "grad_norm": 9.245990753173828, + "learning_rate": 0.00017573588596104647, + "loss": 0.4001, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1310 + }, + { + "epoch": 8.68421052631579, + "grad_norm": 8.084155082702637, + "learning_rate": 0.00017532416251917572, + "loss": 0.437, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1320 + }, + { + "epoch": 8.75, + "grad_norm": 8.936637878417969, + "learning_rate": 0.00017490972127822088, + "loss": 0.4072, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1330 + }, + { + "epoch": 8.81578947368421, + "grad_norm": 8.653999328613281, + "learning_rate": 0.00017449258708956283, + "loss": 0.3979, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1340 + }, + { + "epoch": 8.881578947368421, + "grad_norm": 6.477110862731934, + "learning_rate": 0.00017407278496606122, + "loss": 0.4, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1350 + }, + { + "epoch": 8.947368421052632, + "grad_norm": 8.848350524902344, + "learning_rate": 0.00017365034008055467, + "loss": 0.3951, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1360 + }, + { + "epoch": 9.013157894736842, + "grad_norm": 3.7075564861297607, + "learning_rate": 0.00017322527776435115, + "loss": 0.3813, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1370 + }, + { + "epoch": 9.078947368421053, + "grad_norm": 7.235719680786133, + "learning_rate": 0.00017279762350570925, + "loss": 0.3373, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1380 + }, + { + "epoch": 9.144736842105264, + "grad_norm": 11.8607177734375, + "learning_rate": 0.00017236740294830969, + "loss": 0.3858, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1390 + }, + { + "epoch": 9.210526315789474, + "grad_norm": 9.025496482849121, + "learning_rate": 0.00017193464188971767, + "loss": 0.3341, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1400 + }, + { + "epoch": 9.276315789473685, + "grad_norm": 6.856828212738037, + "learning_rate": 0.00017149936627983595, + "loss": 0.3887, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1410 + }, + { + "epoch": 9.342105263157896, + "grad_norm": 10.805776596069336, + "learning_rate": 0.0001710616022193487, + "loss": 0.3375, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1420 + }, + { + "epoch": 9.407894736842104, + "grad_norm": 6.890063285827637, + "learning_rate": 0.00017062137595815657, + "loss": 0.3546, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1430 + }, + { + "epoch": 9.473684210526315, + "grad_norm": 9.32254695892334, + "learning_rate": 0.00017017871389380255, + "loss": 0.3875, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1440 + }, + { + "epoch": 9.539473684210526, + "grad_norm": 8.32424545288086, + "learning_rate": 0.00016973364256988918, + "loss": 0.416, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1450 + }, + { + "epoch": 9.605263157894736, + "grad_norm": 6.700080871582031, + "learning_rate": 0.00016928618867448675, + "loss": 0.3982, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1460 + }, + { + "epoch": 9.671052631578947, + "grad_norm": 9.059173583984375, + "learning_rate": 0.0001688363790385331, + "loss": 0.3322, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1470 + }, + { + "epoch": 9.736842105263158, + "grad_norm": 7.265253067016602, + "learning_rate": 0.0001683842406342247, + "loss": 0.3597, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1480 + }, + { + "epoch": 9.802631578947368, + "grad_norm": 9.619430541992188, + "learning_rate": 0.00016792980057339936, + "loss": 0.3976, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1490 + }, + { + "epoch": 9.868421052631579, + "grad_norm": 7.571810245513916, + "learning_rate": 0.00016747308610591034, + "loss": 0.3728, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1500 + }, + { + "epoch": 9.93421052631579, + "grad_norm": 8.444099426269531, + "learning_rate": 0.00016701412461799254, + "loss": 0.3842, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1510 + }, + { + "epoch": 10.0, + "grad_norm": 6.8055291175842285, + "learning_rate": 0.00016655294363062035, + "loss": 0.3844, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1520 + }, + { + "epoch": 10.06578947368421, + "grad_norm": 9.351649284362793, + "learning_rate": 0.00016608957079785713, + "loss": 0.3382, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1530 + }, + { + "epoch": 10.131578947368421, + "grad_norm": 9.218072891235352, + "learning_rate": 0.00016562403390519724, + "loss": 0.3175, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1540 + }, + { + "epoch": 10.197368421052632, + "grad_norm": 8.207752227783203, + "learning_rate": 0.0001651563608678997, + "loss": 0.3422, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1550 + }, + { + "epoch": 10.263157894736842, + "grad_norm": 7.536536693572998, + "learning_rate": 0.0001646865797293146, + "loss": 0.3548, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1560 + }, + { + "epoch": 10.328947368421053, + "grad_norm": 6.550157070159912, + "learning_rate": 0.00016421471865920112, + "loss": 0.3501, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1570 + }, + { + "epoch": 10.394736842105264, + "grad_norm": 7.151719570159912, + "learning_rate": 0.00016374080595203883, + "loss": 0.3693, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1580 + }, + { + "epoch": 10.460526315789474, + "grad_norm": 7.3648810386657715, + "learning_rate": 0.00016326487002533058, + "loss": 0.3312, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1590 + }, + { + "epoch": 10.526315789473685, + "grad_norm": 7.90741491317749, + "learning_rate": 0.00016278693941789877, + "loss": 0.3586, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1600 + }, + { + "epoch": 10.592105263157894, + "grad_norm": 6.969632148742676, + "learning_rate": 0.00016230704278817406, + "loss": 0.3492, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1610 + }, + { + "epoch": 10.657894736842106, + "grad_norm": 9.677424430847168, + "learning_rate": 0.00016182520891247685, + "loss": 0.3514, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1620 + }, + { + "epoch": 10.723684210526315, + "grad_norm": 7.527509689331055, + "learning_rate": 0.00016134146668329166, + "loss": 0.3368, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1630 + }, + { + "epoch": 10.789473684210526, + "grad_norm": 7.857253074645996, + "learning_rate": 0.00016085584510753474, + "loss": 0.3755, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1640 + }, + { + "epoch": 10.855263157894736, + "grad_norm": 5.162316799163818, + "learning_rate": 0.00016036837330481477, + "loss": 0.3805, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1650 + }, + { + "epoch": 10.921052631578947, + "grad_norm": 7.910579204559326, + "learning_rate": 0.0001598790805056866, + "loss": 0.3351, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1660 + }, + { + "epoch": 10.986842105263158, + "grad_norm": 7.014622688293457, + "learning_rate": 0.00015938799604989852, + "loss": 0.3842, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1670 + }, + { + "epoch": 11.052631578947368, + "grad_norm": 9.553169250488281, + "learning_rate": 0.00015889514938463304, + "loss": 0.308, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1680 + }, + { + "epoch": 11.118421052631579, + "grad_norm": 10.32496452331543, + "learning_rate": 0.000158400570062741, + "loss": 0.3324, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1690 + }, + { + "epoch": 11.18421052631579, + "grad_norm": 13.280173301696777, + "learning_rate": 0.00015790428774096953, + "loss": 0.3678, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1700 + }, + { + "epoch": 11.25, + "grad_norm": 10.031278610229492, + "learning_rate": 0.00015740633217818382, + "loss": 0.3176, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1710 + }, + { + "epoch": 11.31578947368421, + "grad_norm": 6.93234920501709, + "learning_rate": 0.00015690673323358253, + "loss": 0.3413, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1720 + }, + { + "epoch": 11.381578947368421, + "grad_norm": 8.427581787109375, + "learning_rate": 0.0001564055208649073, + "loss": 0.3147, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1730 + }, + { + "epoch": 11.447368421052632, + "grad_norm": 7.225602626800537, + "learning_rate": 0.00015590272512664655, + "loss": 0.2998, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1740 + }, + { + "epoch": 11.513157894736842, + "grad_norm": 8.522515296936035, + "learning_rate": 0.00015539837616823326, + "loss": 0.2984, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1750 + }, + { + "epoch": 11.578947368421053, + "grad_norm": 11.115621566772461, + "learning_rate": 0.0001548925042322369, + "loss": 0.3423, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1760 + }, + { + "epoch": 11.644736842105264, + "grad_norm": 6.074847221374512, + "learning_rate": 0.00015438513965255023, + "loss": 0.3423, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1770 + }, + { + "epoch": 11.710526315789474, + "grad_norm": 9.569510459899902, + "learning_rate": 0.00015387631285257021, + "loss": 0.3416, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1780 + }, + { + "epoch": 11.776315789473685, + "grad_norm": 8.04273509979248, + "learning_rate": 0.00015336605434337387, + "loss": 0.3376, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1790 + }, + { + "epoch": 11.842105263157894, + "grad_norm": 9.288858413696289, + "learning_rate": 0.0001528543947218886, + "loss": 0.3668, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1800 + }, + { + "epoch": 11.907894736842106, + "grad_norm": 9.00793743133545, + "learning_rate": 0.0001523413646690574, + "loss": 0.322, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1810 + }, + { + "epoch": 11.973684210526315, + "grad_norm": 6.323388576507568, + "learning_rate": 0.0001518269949479994, + "loss": 0.3419, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1820 + }, + { + "epoch": 12.039473684210526, + "grad_norm": 7.900981903076172, + "learning_rate": 0.0001513113164021649, + "loss": 0.3268, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1830 + }, + { + "epoch": 12.105263157894736, + "grad_norm": 8.535825729370117, + "learning_rate": 0.00015079435995348615, + "loss": 0.2922, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1840 + }, + { + "epoch": 12.171052631578947, + "grad_norm": 10.261241912841797, + "learning_rate": 0.0001502761566005229, + "loss": 0.316, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1850 + }, + { + "epoch": 12.236842105263158, + "grad_norm": 9.959452629089355, + "learning_rate": 0.0001497567374166038, + "loss": 0.2672, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1860 + }, + { + "epoch": 12.302631578947368, + "grad_norm": 8.756327629089355, + "learning_rate": 0.00014923613354796313, + "loss": 0.2825, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1870 + }, + { + "epoch": 12.368421052631579, + "grad_norm": 11.042543411254883, + "learning_rate": 0.00014871437621187306, + "loss": 0.348, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1880 + }, + { + "epoch": 12.43421052631579, + "grad_norm": 8.125487327575684, + "learning_rate": 0.00014819149669477178, + "loss": 0.2827, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1890 + }, + { + "epoch": 12.5, + "grad_norm": 6.272602081298828, + "learning_rate": 0.0001476675263503875, + "loss": 0.331, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1900 + }, + { + "epoch": 12.56578947368421, + "grad_norm": 9.739338874816895, + "learning_rate": 0.00014714249659785833, + "loss": 0.3421, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1910 + }, + { + "epoch": 12.631578947368421, + "grad_norm": 10.085533142089844, + "learning_rate": 0.0001466164389198483, + "loss": 0.3315, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1920 + }, + { + "epoch": 12.697368421052632, + "grad_norm": 10.818131446838379, + "learning_rate": 0.00014608938486065953, + "loss": 0.3479, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1930 + }, + { + "epoch": 12.763157894736842, + "grad_norm": 9.922698974609375, + "learning_rate": 0.00014556136602434064, + "loss": 0.3279, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1940 + }, + { + "epoch": 12.828947368421053, + "grad_norm": 6.283480167388916, + "learning_rate": 0.00014503241407279194, + "loss": 0.3254, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1950 + }, + { + "epoch": 12.894736842105264, + "grad_norm": 6.850347995758057, + "learning_rate": 0.00014450256072386645, + "loss": 0.3053, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1960 + }, + { + "epoch": 12.960526315789474, + "grad_norm": 7.839858055114746, + "learning_rate": 0.00014397183774946833, + "loss": 0.296, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1970 + }, + { + "epoch": 13.026315789473685, + "grad_norm": 8.421372413635254, + "learning_rate": 0.0001434402769736476, + "loss": 0.3125, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1980 + }, + { + "epoch": 13.092105263157896, + "grad_norm": 8.362040519714355, + "learning_rate": 0.00014290791027069176, + "loss": 0.2749, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 1990 + }, + { + "epoch": 13.157894736842104, + "grad_norm": 10.712233543395996, + "learning_rate": 0.00014237476956321468, + "loss": 0.2971, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2000 + }, + { + "epoch": 13.223684210526315, + "grad_norm": 7.288019180297852, + "learning_rate": 0.00014184088682024233, + "loss": 0.3226, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2010 + }, + { + "epoch": 13.289473684210526, + "grad_norm": 12.341687202453613, + "learning_rate": 0.00014130629405529566, + "loss": 0.2857, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2020 + }, + { + "epoch": 13.355263157894736, + "grad_norm": 8.097869873046875, + "learning_rate": 0.00014077102332447122, + "loss": 0.2739, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2030 + }, + { + "epoch": 13.421052631578947, + "grad_norm": 10.796974182128906, + "learning_rate": 0.0001402351067245187, + "loss": 0.2883, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2040 + }, + { + "epoch": 13.486842105263158, + "grad_norm": 10.561753273010254, + "learning_rate": 0.00013969857639091653, + "loss": 0.2911, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2050 + }, + { + "epoch": 13.552631578947368, + "grad_norm": 7.319301128387451, + "learning_rate": 0.00013916146449594472, + "loss": 0.322, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2060 + }, + { + "epoch": 13.618421052631579, + "grad_norm": 6.3976850509643555, + "learning_rate": 0.0001386238032467558, + "loss": 0.2861, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2070 + }, + { + "epoch": 13.68421052631579, + "grad_norm": 8.518961906433105, + "learning_rate": 0.00013808562488344363, + "loss": 0.293, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2080 + }, + { + "epoch": 13.75, + "grad_norm": 9.818099975585938, + "learning_rate": 0.00013754696167710993, + "loss": 0.3276, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2090 + }, + { + "epoch": 13.81578947368421, + "grad_norm": 10.728662490844727, + "learning_rate": 0.00013700784592792948, + "loss": 0.3067, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2100 + }, + { + "epoch": 13.881578947368421, + "grad_norm": 9.833274841308594, + "learning_rate": 0.0001364683099632131, + "loss": 0.2844, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2110 + }, + { + "epoch": 13.947368421052632, + "grad_norm": 9.362689971923828, + "learning_rate": 0.0001359283861354692, + "loss": 0.2611, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2120 + }, + { + "epoch": 14.013157894736842, + "grad_norm": 7.0610857009887695, + "learning_rate": 0.0001353881068204639, + "loss": 0.2892, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2130 + }, + { + "epoch": 14.078947368421053, + "grad_norm": 6.5400495529174805, + "learning_rate": 0.00013484750441527957, + "loss": 0.2665, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2140 + }, + { + "epoch": 14.144736842105264, + "grad_norm": 7.0163469314575195, + "learning_rate": 0.0001343066113363723, + "loss": 0.2517, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2150 + }, + { + "epoch": 14.210526315789474, + "grad_norm": 6.757364273071289, + "learning_rate": 0.00013376546001762793, + "loss": 0.2383, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2160 + }, + { + "epoch": 14.276315789473685, + "grad_norm": 8.389124870300293, + "learning_rate": 0.00013322408290841734, + "loss": 0.2944, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2170 + }, + { + "epoch": 14.342105263157896, + "grad_norm": 11.034085273742676, + "learning_rate": 0.00013268251247165055, + "loss": 0.2619, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2180 + }, + { + "epoch": 14.407894736842104, + "grad_norm": 8.859076499938965, + "learning_rate": 0.00013214078118183031, + "loss": 0.2655, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2190 + }, + { + "epoch": 14.473684210526315, + "grad_norm": 7.588623046875, + "learning_rate": 0.0001315989215231045, + "loss": 0.2493, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2200 + }, + { + "epoch": 14.539473684210526, + "grad_norm": 11.013096809387207, + "learning_rate": 0.0001310569659873187, + "loss": 0.2765, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2210 + }, + { + "epoch": 14.605263157894736, + "grad_norm": 9.266365051269531, + "learning_rate": 0.00013051494707206742, + "loss": 0.2694, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2220 + }, + { + "epoch": 14.671052631578947, + "grad_norm": 7.71904993057251, + "learning_rate": 0.00012997289727874578, + "loss": 0.2444, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2230 + }, + { + "epoch": 14.736842105263158, + "grad_norm": 7.048625946044922, + "learning_rate": 0.00012943084911060034, + "loss": 0.2539, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2240 + }, + { + "epoch": 14.802631578947368, + "grad_norm": 9.421672821044922, + "learning_rate": 0.00012888883507078028, + "loss": 0.2633, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2250 + }, + { + "epoch": 14.868421052631579, + "grad_norm": 8.31679916381836, + "learning_rate": 0.0001283468876603883, + "loss": 0.2918, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2260 + }, + { + "epoch": 14.93421052631579, + "grad_norm": 10.346576690673828, + "learning_rate": 0.00012780503937653175, + "loss": 0.3205, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2270 + }, + { + "epoch": 15.0, + "grad_norm": 10.213838577270508, + "learning_rate": 0.00012726332271037407, + "loss": 0.2688, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2280 + }, + { + "epoch": 15.06578947368421, + "grad_norm": 7.39166784286499, + "learning_rate": 0.00012672177014518628, + "loss": 0.2079, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2290 + }, + { + "epoch": 15.131578947368421, + "grad_norm": 9.744684219360352, + "learning_rate": 0.0001261804141543995, + "loss": 0.2397, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2300 + }, + { + "epoch": 15.197368421052632, + "grad_norm": 8.710470199584961, + "learning_rate": 0.00012563928719965743, + "loss": 0.2277, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2310 + }, + { + "epoch": 15.263157894736842, + "grad_norm": 7.7291998863220215, + "learning_rate": 0.00012509842172886995, + "loss": 0.2373, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2320 + }, + { + "epoch": 15.328947368421053, + "grad_norm": 8.500285148620605, + "learning_rate": 0.00012455785017426743, + "loss": 0.2387, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2330 + }, + { + "epoch": 15.394736842105264, + "grad_norm": 12.7296781539917, + "learning_rate": 0.00012401760495045608, + "loss": 0.2405, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2340 + }, + { + "epoch": 15.460526315789474, + "grad_norm": 8.660082817077637, + "learning_rate": 0.000123477718452474, + "loss": 0.2075, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2350 + }, + { + "epoch": 15.526315789473685, + "grad_norm": 7.300625324249268, + "learning_rate": 0.00012293822305384886, + "loss": 0.2772, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2360 + }, + { + "epoch": 15.592105263157894, + "grad_norm": 8.595913887023926, + "learning_rate": 0.00012239915110465668, + "loss": 0.2583, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2370 + }, + { + "epoch": 15.657894736842106, + "grad_norm": 7.006585121154785, + "learning_rate": 0.00012186053492958178, + "loss": 0.2493, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2380 + }, + { + "epoch": 15.723684210526315, + "grad_norm": 7.5067267417907715, + "learning_rate": 0.00012132240682597882, + "loss": 0.2329, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2390 + }, + { + "epoch": 15.789473684210526, + "grad_norm": 9.166180610656738, + "learning_rate": 0.00012078479906193579, + "loss": 0.2474, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2400 + }, + { + "epoch": 15.855263157894736, + "grad_norm": 12.06090259552002, + "learning_rate": 0.00012024774387433933, + "loss": 0.2599, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2410 + }, + { + "epoch": 15.921052631578947, + "grad_norm": 8.838837623596191, + "learning_rate": 0.00011971127346694164, + "loss": 0.2822, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2420 + }, + { + "epoch": 15.986842105263158, + "grad_norm": 10.12195873260498, + "learning_rate": 0.00011917542000842932, + "loss": 0.2762, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2430 + }, + { + "epoch": 16.05263157894737, + "grad_norm": 7.118005275726318, + "learning_rate": 0.00011864021563049461, + "loss": 0.2289, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2440 + }, + { + "epoch": 16.11842105263158, + "grad_norm": 9.466529846191406, + "learning_rate": 0.0001181056924259085, + "loss": 0.2096, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2450 + }, + { + "epoch": 16.18421052631579, + "grad_norm": 6.640699863433838, + "learning_rate": 0.00011757188244659633, + "loss": 0.2044, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2460 + }, + { + "epoch": 16.25, + "grad_norm": 6.390682220458984, + "learning_rate": 0.000117038817701716, + "loss": 0.2199, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2470 + }, + { + "epoch": 16.31578947368421, + "grad_norm": 6.348979949951172, + "learning_rate": 0.00011650653015573847, + "loss": 0.2124, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2480 + }, + { + "epoch": 16.38157894736842, + "grad_norm": 6.506417274475098, + "learning_rate": 0.000115975051726531, + "loss": 0.2165, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2490 + }, + { + "epoch": 16.44736842105263, + "grad_norm": 9.003459930419922, + "learning_rate": 0.00011544441428344337, + "loss": 0.2201, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2500 + }, + { + "epoch": 16.513157894736842, + "grad_norm": 8.156404495239258, + "learning_rate": 0.00011491464964539678, + "loss": 0.2297, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2510 + }, + { + "epoch": 16.57894736842105, + "grad_norm": 6.955758571624756, + "learning_rate": 0.00011438578957897598, + "loss": 0.226, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2520 + }, + { + "epoch": 16.644736842105264, + "grad_norm": 5.890193462371826, + "learning_rate": 0.00011385786579652431, + "loss": 0.2266, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2530 + }, + { + "epoch": 16.710526315789473, + "grad_norm": 9.810389518737793, + "learning_rate": 0.0001133309099542422, + "loss": 0.2323, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2540 + }, + { + "epoch": 16.776315789473685, + "grad_norm": 8.91415023803711, + "learning_rate": 0.00011280495365028885, + "loss": 0.226, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2550 + }, + { + "epoch": 16.842105263157894, + "grad_norm": 7.513998985290527, + "learning_rate": 0.00011228002842288768, + "loss": 0.2223, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2560 + }, + { + "epoch": 16.907894736842106, + "grad_norm": 4.712993144989014, + "learning_rate": 0.00011175616574843499, + "loss": 0.2319, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2570 + }, + { + "epoch": 16.973684210526315, + "grad_norm": 7.016789436340332, + "learning_rate": 0.00011123339703961262, + "loss": 0.2256, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2580 + }, + { + "epoch": 17.039473684210527, + "grad_norm": 9.788738250732422, + "learning_rate": 0.00011071175364350432, + "loss": 0.2108, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2590 + }, + { + "epoch": 17.105263157894736, + "grad_norm": 7.460390567779541, + "learning_rate": 0.00011019126683971603, + "loss": 0.171, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2600 + }, + { + "epoch": 17.17105263157895, + "grad_norm": 8.930059432983398, + "learning_rate": 0.00010967196783850033, + "loss": 0.1919, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2610 + }, + { + "epoch": 17.236842105263158, + "grad_norm": 6.457809925079346, + "learning_rate": 0.00010915388777888482, + "loss": 0.2081, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2620 + }, + { + "epoch": 17.30263157894737, + "grad_norm": 7.385796070098877, + "learning_rate": 0.0001086370577268051, + "loss": 0.1974, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2630 + }, + { + "epoch": 17.36842105263158, + "grad_norm": 7.0241007804870605, + "learning_rate": 0.00010812150867324176, + "loss": 0.2199, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2640 + }, + { + "epoch": 17.43421052631579, + "grad_norm": 12.928653717041016, + "learning_rate": 0.00010760727153236215, + "loss": 0.2327, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2650 + }, + { + "epoch": 17.5, + "grad_norm": 7.8260884284973145, + "learning_rate": 0.00010709437713966664, + "loss": 0.2002, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2660 + }, + { + "epoch": 17.56578947368421, + "grad_norm": 5.859843730926514, + "learning_rate": 0.00010658285625013966, + "loss": 0.2215, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2670 + }, + { + "epoch": 17.63157894736842, + "grad_norm": 8.312082290649414, + "learning_rate": 0.00010607273953640542, + "loss": 0.194, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2680 + }, + { + "epoch": 17.69736842105263, + "grad_norm": 8.437788009643555, + "learning_rate": 0.0001055640575868887, + "loss": 0.225, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2690 + }, + { + "epoch": 17.763157894736842, + "grad_norm": 6.751978874206543, + "learning_rate": 0.00010505684090398072, + "loss": 0.189, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2700 + }, + { + "epoch": 17.82894736842105, + "grad_norm": 5.854010581970215, + "learning_rate": 0.00010455111990221009, + "loss": 0.2041, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2710 + }, + { + "epoch": 17.894736842105264, + "grad_norm": 9.748807907104492, + "learning_rate": 0.00010404692490641896, + "loss": 0.2077, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2720 + }, + { + "epoch": 17.960526315789473, + "grad_norm": 7.232575416564941, + "learning_rate": 0.00010354428614994476, + "loss": 0.211, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2730 + }, + { + "epoch": 18.026315789473685, + "grad_norm": 6.184597492218018, + "learning_rate": 0.00010304323377280715, + "loss": 0.1707, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2740 + }, + { + "epoch": 18.092105263157894, + "grad_norm": 5.879486083984375, + "learning_rate": 0.00010254379781990091, + "loss": 0.1556, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2750 + }, + { + "epoch": 18.157894736842106, + "grad_norm": 6.144423484802246, + "learning_rate": 0.00010204600823919419, + "loss": 0.1882, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2760 + }, + { + "epoch": 18.223684210526315, + "grad_norm": 7.17083740234375, + "learning_rate": 0.00010154989487993272, + "loss": 0.1742, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2770 + }, + { + "epoch": 18.289473684210527, + "grad_norm": 5.142153739929199, + "learning_rate": 0.00010105548749085007, + "loss": 0.1545, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2780 + }, + { + "epoch": 18.355263157894736, + "grad_norm": 4.805744171142578, + "learning_rate": 0.00010056281571838369, + "loss": 0.1752, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2790 + }, + { + "epoch": 18.42105263157895, + "grad_norm": 9.351277351379395, + "learning_rate": 0.00010007190910489725, + "loss": 0.1819, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2800 + }, + { + "epoch": 18.486842105263158, + "grad_norm": 7.704065799713135, + "learning_rate": 9.958279708690913e-05, + "loss": 0.1963, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2810 + }, + { + "epoch": 18.55263157894737, + "grad_norm": 7.071595191955566, + "learning_rate": 9.90955089933274e-05, + "loss": 0.1861, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2820 + }, + { + "epoch": 18.61842105263158, + "grad_norm": 6.073451042175293, + "learning_rate": 9.861007404369105e-05, + "loss": 0.1752, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2830 + }, + { + "epoch": 18.68421052631579, + "grad_norm": 10.440926551818848, + "learning_rate": 9.812652134641792e-05, + "loss": 0.2065, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2840 + }, + { + "epoch": 18.75, + "grad_norm": 4.4701924324035645, + "learning_rate": 9.764487989705931e-05, + "loss": 0.1853, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2850 + }, + { + "epoch": 18.81578947368421, + "grad_norm": 9.070950508117676, + "learning_rate": 9.716517857656117e-05, + "loss": 0.1932, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2860 + }, + { + "epoch": 18.88157894736842, + "grad_norm": 6.689568519592285, + "learning_rate": 9.668744614953243e-05, + "loss": 0.2051, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2870 + }, + { + "epoch": 18.94736842105263, + "grad_norm": 6.0212883949279785, + "learning_rate": 9.62117112625201e-05, + "loss": 0.181, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2880 + }, + { + "epoch": 19.013157894736842, + "grad_norm": 7.055004596710205, + "learning_rate": 9.573800244229162e-05, + "loss": 0.1926, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2890 + }, + { + "epoch": 19.07894736842105, + "grad_norm": 6.019172668457031, + "learning_rate": 9.526634809412406e-05, + "loss": 0.1522, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2900 + }, + { + "epoch": 19.144736842105264, + "grad_norm": 5.405515193939209, + "learning_rate": 9.479677650010104e-05, + "loss": 0.1477, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2910 + }, + { + "epoch": 19.210526315789473, + "grad_norm": 9.99826717376709, + "learning_rate": 9.432931581741688e-05, + "loss": 0.145, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2920 + }, + { + "epoch": 19.276315789473685, + "grad_norm": 5.589150428771973, + "learning_rate": 9.3863994076688e-05, + "loss": 0.1454, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2930 + }, + { + "epoch": 19.342105263157894, + "grad_norm": 5.293956756591797, + "learning_rate": 9.340083918027221e-05, + "loss": 0.1687, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2940 + }, + { + "epoch": 19.407894736842106, + "grad_norm": 8.118623733520508, + "learning_rate": 9.29398789005956e-05, + "loss": 0.1781, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2950 + }, + { + "epoch": 19.473684210526315, + "grad_norm": 5.828381538391113, + "learning_rate": 9.248114087848716e-05, + "loss": 0.1443, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2960 + }, + { + "epoch": 19.539473684210527, + "grad_norm": 9.651313781738281, + "learning_rate": 9.20246526215214e-05, + "loss": 0.1752, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2970 + }, + { + "epoch": 19.605263157894736, + "grad_norm": 6.652307510375977, + "learning_rate": 9.15704415023688e-05, + "loss": 0.1744, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2980 + }, + { + "epoch": 19.67105263157895, + "grad_norm": 6.448312282562256, + "learning_rate": 9.111853475715455e-05, + "loss": 0.1738, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 2990 + }, + { + "epoch": 19.736842105263158, + "grad_norm": 9.422768592834473, + "learning_rate": 9.06689594838253e-05, + "loss": 0.1546, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3000 + }, + { + "epoch": 19.80263157894737, + "grad_norm": 12.273077964782715, + "learning_rate": 9.022174264052431e-05, + "loss": 0.1998, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3010 + }, + { + "epoch": 19.86842105263158, + "grad_norm": 7.415524959564209, + "learning_rate": 8.977691104397492e-05, + "loss": 0.1873, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3020 + }, + { + "epoch": 19.93421052631579, + "grad_norm": 5.000813961029053, + "learning_rate": 8.93344913678725e-05, + "loss": 0.1546, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3030 + }, + { + "epoch": 20.0, + "grad_norm": 6.139503479003906, + "learning_rate": 8.889451014128516e-05, + "loss": 0.1721, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3040 + }, + { + "epoch": 20.06578947368421, + "grad_norm": 7.5522050857543945, + "learning_rate": 8.845699374706267e-05, + "loss": 0.1317, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3050 + }, + { + "epoch": 20.13157894736842, + "grad_norm": 6.387609958648682, + "learning_rate": 8.802196842025477e-05, + "loss": 0.1129, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3060 + }, + { + "epoch": 20.19736842105263, + "grad_norm": 9.34554672241211, + "learning_rate": 8.758946024653779e-05, + "loss": 0.1597, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3070 + }, + { + "epoch": 20.263157894736842, + "grad_norm": 3.530388593673706, + "learning_rate": 8.715949516065058e-05, + "loss": 0.1303, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3080 + }, + { + "epoch": 20.32894736842105, + "grad_norm": 5.443478107452393, + "learning_rate": 8.673209894483935e-05, + "loss": 0.1251, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3090 + }, + { + "epoch": 20.394736842105264, + "grad_norm": 5.092613220214844, + "learning_rate": 8.630729722731162e-05, + "loss": 0.1425, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3100 + }, + { + "epoch": 20.460526315789473, + "grad_norm": 9.21035385131836, + "learning_rate": 8.588511548069953e-05, + "loss": 0.155, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3110 + }, + { + "epoch": 20.526315789473685, + "grad_norm": 7.470590114593506, + "learning_rate": 8.54655790205324e-05, + "loss": 0.133, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3120 + }, + { + "epoch": 20.592105263157894, + "grad_norm": 6.920080184936523, + "learning_rate": 8.504871300371868e-05, + "loss": 0.1545, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3130 + }, + { + "epoch": 20.657894736842106, + "grad_norm": 5.543353080749512, + "learning_rate": 8.463454242703748e-05, + "loss": 0.1469, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3140 + }, + { + "epoch": 20.723684210526315, + "grad_norm": 7.444575786590576, + "learning_rate": 8.422309212563966e-05, + "loss": 0.1523, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3150 + }, + { + "epoch": 20.789473684210527, + "grad_norm": 9.51788330078125, + "learning_rate": 8.381438677155862e-05, + "loss": 0.1535, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3160 + }, + { + "epoch": 20.855263157894736, + "grad_norm": 5.5765862464904785, + "learning_rate": 8.34084508722309e-05, + "loss": 0.1747, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3170 + }, + { + "epoch": 20.92105263157895, + "grad_norm": 7.758854389190674, + "learning_rate": 8.300530876902665e-05, + "loss": 0.1366, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3180 + }, + { + "epoch": 20.986842105263158, + "grad_norm": 6.344717502593994, + "learning_rate": 8.260498463578997e-05, + "loss": 0.1446, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3190 + }, + { + "epoch": 21.05263157894737, + "grad_norm": 5.9779276847839355, + "learning_rate": 8.22075024773894e-05, + "loss": 0.1268, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3200 + }, + { + "epoch": 21.11842105263158, + "grad_norm": 3.6732325553894043, + "learning_rate": 8.181288612827847e-05, + "loss": 0.1181, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3210 + }, + { + "epoch": 21.18421052631579, + "grad_norm": 7.801024436950684, + "learning_rate": 8.142115925106652e-05, + "loss": 0.1125, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3220 + }, + { + "epoch": 21.25, + "grad_norm": 5.0273332595825195, + "learning_rate": 8.103234533509988e-05, + "loss": 0.1182, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3230 + }, + { + "epoch": 21.31578947368421, + "grad_norm": 10.303738594055176, + "learning_rate": 8.064646769505319e-05, + "loss": 0.12, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3240 + }, + { + "epoch": 21.38157894736842, + "grad_norm": 3.5069081783294678, + "learning_rate": 8.026354946953153e-05, + "loss": 0.1156, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3250 + }, + { + "epoch": 21.44736842105263, + "grad_norm": 8.985977172851562, + "learning_rate": 7.988361361968288e-05, + "loss": 0.1474, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3260 + }, + { + "epoch": 21.513157894736842, + "grad_norm": 7.387016296386719, + "learning_rate": 7.95066829278213e-05, + "loss": 0.1302, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3270 + }, + { + "epoch": 21.57894736842105, + "grad_norm": 4.838381767272949, + "learning_rate": 7.913277999606084e-05, + "loss": 0.121, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3280 + }, + { + "epoch": 21.644736842105264, + "grad_norm": 6.857619762420654, + "learning_rate": 7.876192724496012e-05, + "loss": 0.123, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3290 + }, + { + "epoch": 21.710526315789473, + "grad_norm": 5.610379219055176, + "learning_rate": 7.839414691217811e-05, + "loss": 0.1352, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3300 + }, + { + "epoch": 21.776315789473685, + "grad_norm": 5.731847286224365, + "learning_rate": 7.802946105114052e-05, + "loss": 0.1547, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3310 + }, + { + "epoch": 21.842105263157894, + "grad_norm": 6.4344377517700195, + "learning_rate": 7.766789152971747e-05, + "loss": 0.1416, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3320 + }, + { + "epoch": 21.907894736842106, + "grad_norm": 6.7426934242248535, + "learning_rate": 7.73094600289122e-05, + "loss": 0.1323, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3330 + }, + { + "epoch": 21.973684210526315, + "grad_norm": 7.127729415893555, + "learning_rate": 7.695418804156094e-05, + "loss": 0.1355, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3340 + }, + { + "epoch": 22.039473684210527, + "grad_norm": 3.8695497512817383, + "learning_rate": 7.66020968710443e-05, + "loss": 0.1197, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3350 + }, + { + "epoch": 22.105263157894736, + "grad_norm": 4.851317882537842, + "learning_rate": 7.625320763000964e-05, + "loss": 0.1108, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3360 + }, + { + "epoch": 22.17105263157895, + "grad_norm": 7.691250324249268, + "learning_rate": 7.590754123910515e-05, + "loss": 0.1168, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3370 + }, + { + "epoch": 22.236842105263158, + "grad_norm": 5.645074844360352, + "learning_rate": 7.556511842572543e-05, + "loss": 0.1116, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3380 + }, + { + "epoch": 22.30263157894737, + "grad_norm": 5.763618469238281, + "learning_rate": 7.522595972276851e-05, + "loss": 0.109, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3390 + }, + { + "epoch": 22.36842105263158, + "grad_norm": 8.13197135925293, + "learning_rate": 7.489008546740474e-05, + "loss": 0.1044, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3400 + }, + { + "epoch": 22.43421052631579, + "grad_norm": 7.799911022186279, + "learning_rate": 7.455751579985714e-05, + "loss": 0.1409, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3410 + }, + { + "epoch": 22.5, + "grad_norm": 6.310256004333496, + "learning_rate": 7.42282706621939e-05, + "loss": 0.1297, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3420 + }, + { + "epoch": 22.56578947368421, + "grad_norm": 6.451694965362549, + "learning_rate": 7.390236979713248e-05, + "loss": 0.1086, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3430 + }, + { + "epoch": 22.63157894736842, + "grad_norm": 8.012211799621582, + "learning_rate": 7.357983274685569e-05, + "loss": 0.1312, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3440 + }, + { + "epoch": 22.69736842105263, + "grad_norm": 7.39089298248291, + "learning_rate": 7.326067885184007e-05, + "loss": 0.1127, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3450 + }, + { + "epoch": 22.763157894736842, + "grad_norm": 7.845798969268799, + "learning_rate": 7.294492724969598e-05, + "loss": 0.1139, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3460 + }, + { + "epoch": 22.82894736842105, + "grad_norm": 5.291744709014893, + "learning_rate": 7.263259687402016e-05, + "loss": 0.1191, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3470 + }, + { + "epoch": 22.894736842105264, + "grad_norm": 6.590104103088379, + "learning_rate": 7.232370645326036e-05, + "loss": 0.1185, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3480 + }, + { + "epoch": 22.960526315789473, + "grad_norm": 5.286604404449463, + "learning_rate": 7.201827450959225e-05, + "loss": 0.1171, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3490 + }, + { + "epoch": 23.026315789473685, + "grad_norm": 3.8253142833709717, + "learning_rate": 7.171631935780896e-05, + "loss": 0.0955, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3500 + }, + { + "epoch": 23.092105263157894, + "grad_norm": 6.925575256347656, + "learning_rate": 7.141785910422259e-05, + "loss": 0.0919, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3510 + }, + { + "epoch": 23.157894736842106, + "grad_norm": 9.26873779296875, + "learning_rate": 7.11229116455787e-05, + "loss": 0.1031, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3520 + }, + { + "epoch": 23.223684210526315, + "grad_norm": 6.181906223297119, + "learning_rate": 7.083149466798312e-05, + "loss": 0.1151, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3530 + }, + { + "epoch": 23.289473684210527, + "grad_norm": 6.093035697937012, + "learning_rate": 7.054362564584128e-05, + "loss": 0.1033, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3540 + }, + { + "epoch": 23.355263157894736, + "grad_norm": 8.938755989074707, + "learning_rate": 7.025932184081061e-05, + "loss": 0.1059, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3550 + }, + { + "epoch": 23.42105263157895, + "grad_norm": 6.073050022125244, + "learning_rate": 6.997860030076529e-05, + "loss": 0.1256, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3560 + }, + { + "epoch": 23.486842105263158, + "grad_norm": 5.09712553024292, + "learning_rate": 6.97014778587741e-05, + "loss": 0.1054, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3570 + }, + { + "epoch": 23.55263157894737, + "grad_norm": 6.039034843444824, + "learning_rate": 6.9427971132091e-05, + "loss": 0.0938, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3580 + }, + { + "epoch": 23.61842105263158, + "grad_norm": 5.665463924407959, + "learning_rate": 6.91580965211587e-05, + "loss": 0.1012, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3590 + }, + { + "epoch": 23.68421052631579, + "grad_norm": 4.0321807861328125, + "learning_rate": 6.889187020862527e-05, + "loss": 0.1135, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3600 + }, + { + "epoch": 23.75, + "grad_norm": 8.287370681762695, + "learning_rate": 6.862930815837372e-05, + "loss": 0.1013, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3610 + }, + { + "epoch": 23.81578947368421, + "grad_norm": 5.64683723449707, + "learning_rate": 6.837042611456477e-05, + "loss": 0.1048, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3620 + }, + { + "epoch": 23.88157894736842, + "grad_norm": 3.9935617446899414, + "learning_rate": 6.81152396006928e-05, + "loss": 0.116, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3630 + }, + { + "epoch": 23.94736842105263, + "grad_norm": 4.570790767669678, + "learning_rate": 6.786376391865493e-05, + "loss": 0.1142, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3640 + }, + { + "epoch": 24.013157894736842, + "grad_norm": 3.90972900390625, + "learning_rate": 6.761601414783363e-05, + "loss": 0.1112, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3650 + }, + { + "epoch": 24.07894736842105, + "grad_norm": 4.516672611236572, + "learning_rate": 6.737200514419225e-05, + "loss": 0.0727, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3660 + }, + { + "epoch": 24.144736842105264, + "grad_norm": 4.735109329223633, + "learning_rate": 6.713175153938444e-05, + "loss": 0.0726, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3670 + }, + { + "epoch": 24.210526315789473, + "grad_norm": 5.059301376342773, + "learning_rate": 6.689526773987667e-05, + "loss": 0.0818, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3680 + }, + { + "epoch": 24.276315789473685, + "grad_norm": 4.411863803863525, + "learning_rate": 6.666256792608434e-05, + "loss": 0.0821, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3690 + }, + { + "epoch": 24.342105263157894, + "grad_norm": 5.8183465003967285, + "learning_rate": 6.643366605152156e-05, + "loss": 0.0943, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3700 + }, + { + "epoch": 24.407894736842106, + "grad_norm": 5.473435878753662, + "learning_rate": 6.620857584196439e-05, + "loss": 0.0864, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3710 + }, + { + "epoch": 24.473684210526315, + "grad_norm": 5.164342880249023, + "learning_rate": 6.598731079462784e-05, + "loss": 0.0847, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3720 + }, + { + "epoch": 24.539473684210527, + "grad_norm": 4.145204067230225, + "learning_rate": 6.576988417735645e-05, + "loss": 0.0845, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3730 + }, + { + "epoch": 24.605263157894736, + "grad_norm": 8.296338081359863, + "learning_rate": 6.555630902782875e-05, + "loss": 0.0978, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3740 + }, + { + "epoch": 24.67105263157895, + "grad_norm": 7.629764556884766, + "learning_rate": 6.534659815277551e-05, + "loss": 0.0995, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3750 + }, + { + "epoch": 24.736842105263158, + "grad_norm": 3.3637771606445312, + "learning_rate": 6.514076412721174e-05, + "loss": 0.1165, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3760 + }, + { + "epoch": 24.80263157894737, + "grad_norm": 3.8581345081329346, + "learning_rate": 6.493881929368267e-05, + "loss": 0.1068, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3770 + }, + { + "epoch": 24.86842105263158, + "grad_norm": 4.501940727233887, + "learning_rate": 6.474077576152366e-05, + "loss": 0.0821, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3780 + }, + { + "epoch": 24.93421052631579, + "grad_norm": 7.215863227844238, + "learning_rate": 6.454664540613403e-05, + "loss": 0.1116, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3790 + }, + { + "epoch": 25.0, + "grad_norm": 7.456452369689941, + "learning_rate": 6.43564398682651e-05, + "loss": 0.1105, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3800 + }, + { + "epoch": 25.06578947368421, + "grad_norm": 4.260319232940674, + "learning_rate": 6.417017055332196e-05, + "loss": 0.0692, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3810 + }, + { + "epoch": 25.13157894736842, + "grad_norm": 6.243800163269043, + "learning_rate": 6.39878486306798e-05, + "loss": 0.0809, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3820 + }, + { + "epoch": 25.19736842105263, + "grad_norm": 5.938414096832275, + "learning_rate": 6.380948503301394e-05, + "loss": 0.0672, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3830 + }, + { + "epoch": 25.263157894736842, + "grad_norm": 4.197356700897217, + "learning_rate": 6.363509045564438e-05, + "loss": 0.0704, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3840 + }, + { + "epoch": 25.32894736842105, + "grad_norm": 6.645064353942871, + "learning_rate": 6.346467535589447e-05, + "loss": 0.0728, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3850 + }, + { + "epoch": 25.394736842105264, + "grad_norm": 3.9544174671173096, + "learning_rate": 6.329824995246385e-05, + "loss": 0.0876, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3860 + }, + { + "epoch": 25.460526315789473, + "grad_norm": 7.730473041534424, + "learning_rate": 6.313582422481561e-05, + "loss": 0.0874, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3870 + }, + { + "epoch": 25.526315789473685, + "grad_norm": 4.0409770011901855, + "learning_rate": 6.297740791257803e-05, + "loss": 0.0916, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3880 + }, + { + "epoch": 25.592105263157894, + "grad_norm": 6.474958419799805, + "learning_rate": 6.282301051496045e-05, + "loss": 0.1027, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3890 + }, + { + "epoch": 25.657894736842106, + "grad_norm": 5.480382442474365, + "learning_rate": 6.267264129018372e-05, + "loss": 0.106, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3900 + }, + { + "epoch": 25.723684210526315, + "grad_norm": 5.7164788246154785, + "learning_rate": 6.252630925492498e-05, + "loss": 0.0769, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3910 + }, + { + "epoch": 25.789473684210527, + "grad_norm": 4.286190509796143, + "learning_rate": 6.23840231837771e-05, + "loss": 0.0876, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3920 + }, + { + "epoch": 25.855263157894736, + "grad_norm": 6.741421699523926, + "learning_rate": 6.224579160872236e-05, + "loss": 0.0915, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3930 + }, + { + "epoch": 25.92105263157895, + "grad_norm": 4.233421325683594, + "learning_rate": 6.211162281862105e-05, + "loss": 0.0813, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3940 + }, + { + "epoch": 25.986842105263158, + "grad_norm": 7.668642044067383, + "learning_rate": 6.19815248587142e-05, + "loss": 0.089, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3950 + }, + { + "epoch": 26.05263157894737, + "grad_norm": 3.4035732746124268, + "learning_rate": 6.185550553014139e-05, + "loss": 0.0747, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3960 + }, + { + "epoch": 26.11842105263158, + "grad_norm": 3.7069666385650635, + "learning_rate": 6.173357238947281e-05, + "loss": 0.0695, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3970 + }, + { + "epoch": 26.18421052631579, + "grad_norm": 5.699673652648926, + "learning_rate": 6.161573274825621e-05, + "loss": 0.0588, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3980 + }, + { + "epoch": 26.25, + "grad_norm": 6.198535442352295, + "learning_rate": 6.150199367257843e-05, + "loss": 0.0832, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 3990 + }, + { + "epoch": 26.31578947368421, + "grad_norm": 6.22232723236084, + "learning_rate": 6.139236198264172e-05, + "loss": 0.0764, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4000 + }, + { + "epoch": 26.38157894736842, + "grad_norm": 9.198460578918457, + "learning_rate": 6.128684425235482e-05, + "loss": 0.0902, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4010 + }, + { + "epoch": 26.44736842105263, + "grad_norm": 8.103082656860352, + "learning_rate": 6.118544680893863e-05, + "loss": 0.0855, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4020 + }, + { + "epoch": 26.513157894736842, + "grad_norm": 5.254090785980225, + "learning_rate": 6.1088175732547e-05, + "loss": 0.0798, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4030 + }, + { + "epoch": 26.57894736842105, + "grad_norm": 6.608073711395264, + "learning_rate": 6.099503685590196e-05, + "loss": 0.0877, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4040 + }, + { + "epoch": 26.644736842105264, + "grad_norm": 6.865917682647705, + "learning_rate": 6.0906035763944055e-05, + "loss": 0.0783, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4050 + }, + { + "epoch": 26.710526315789473, + "grad_norm": 5.2130446434021, + "learning_rate": 6.082117779349749e-05, + "loss": 0.0878, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4060 + }, + { + "epoch": 26.776315789473685, + "grad_norm": 6.448189735412598, + "learning_rate": 6.074046803295003e-05, + "loss": 0.0879, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4070 + }, + { + "epoch": 26.842105263157894, + "grad_norm": 4.945561408996582, + "learning_rate": 6.066391132194791e-05, + "loss": 0.0867, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4080 + }, + { + "epoch": 26.907894736842106, + "grad_norm": 5.888365268707275, + "learning_rate": 6.0591512251105665e-05, + "loss": 0.0892, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4090 + }, + { + "epoch": 26.973684210526315, + "grad_norm": 7.48224401473999, + "learning_rate": 6.0523275161730853e-05, + "loss": 0.0911, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4100 + }, + { + "epoch": 27.039473684210527, + "grad_norm": 3.253098726272583, + "learning_rate": 6.0459204145563686e-05, + "loss": 0.0678, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4110 + }, + { + "epoch": 27.105263157894736, + "grad_norm": 3.074223756790161, + "learning_rate": 6.039930304453173e-05, + "loss": 0.068, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4120 + }, + { + "epoch": 27.17105263157895, + "grad_norm": 5.4365458488464355, + "learning_rate": 6.0343575450519485e-05, + "loss": 0.0682, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4130 + }, + { + "epoch": 27.236842105263158, + "grad_norm": 4.085286617279053, + "learning_rate": 6.029202470515306e-05, + "loss": 0.0799, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4140 + }, + { + "epoch": 27.30263157894737, + "grad_norm": 5.46896505355835, + "learning_rate": 6.024465389959973e-05, + "loss": 0.0722, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4150 + }, + { + "epoch": 27.36842105263158, + "grad_norm": 5.356459617614746, + "learning_rate": 6.020146587438263e-05, + "loss": 0.0644, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4160 + }, + { + "epoch": 27.43421052631579, + "grad_norm": 5.528810501098633, + "learning_rate": 6.0162463219210396e-05, + "loss": 0.0653, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4170 + }, + { + "epoch": 27.5, + "grad_norm": 4.0345892906188965, + "learning_rate": 6.0127648272821894e-05, + "loss": 0.0615, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4180 + }, + { + "epoch": 27.56578947368421, + "grad_norm": 4.616658687591553, + "learning_rate": 6.009702312284599e-05, + "loss": 0.0757, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4190 + }, + { + "epoch": 27.63157894736842, + "grad_norm": 7.638866901397705, + "learning_rate": 6.0070589605676334e-05, + "loss": 0.0741, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4200 + }, + { + "epoch": 27.69736842105263, + "grad_norm": 3.7885446548461914, + "learning_rate": 6.004834930636126e-05, + "loss": 0.0736, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4210 + }, + { + "epoch": 27.763157894736842, + "grad_norm": 4.767909526824951, + "learning_rate": 6.0030303558508773e-05, + "loss": 0.0761, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4220 + }, + { + "epoch": 27.82894736842105, + "grad_norm": 4.913625717163086, + "learning_rate": 6.001645344420652e-05, + "loss": 0.0767, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4230 + }, + { + "epoch": 27.894736842105264, + "grad_norm": 3.778637170791626, + "learning_rate": 6.0006799793956955e-05, + "loss": 0.0888, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4240 + }, + { + "epoch": 27.960526315789473, + "grad_norm": 4.147508144378662, + "learning_rate": 6.0001343186627523e-05, + "loss": 0.0804, + "memory/device_mem_reserved(gib)": 66.85, + "memory/max_mem_active(gib)": 58.44, + "memory/max_mem_allocated(gib)": 58.44, + "step": 4250 + } + ], + "logging_steps": 10, + "max_steps": 4257, + "num_input_tokens_seen": 0, + "num_train_epochs": 29, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.1919564813923123e+18, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}