{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7507360157016683,
  "eval_steps": 500,
  "global_step": 765,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009813542688910697,
      "grad_norm": 511850.5,
      "learning_rate": 3.921568627450981e-07,
      "loss": 3.358,
      "step": 10
    },
    {
      "epoch": 0.019627085377821395,
      "grad_norm": 852402.5625,
      "learning_rate": 7.843137254901962e-07,
      "loss": 4.0004,
      "step": 20
    },
    {
      "epoch": 0.029440628066732092,
      "grad_norm": 41.74958801269531,
      "learning_rate": 1.1764705882352942e-06,
      "loss": 3.1047,
      "step": 30
    },
    {
      "epoch": 0.03925417075564279,
      "grad_norm": 29.402185440063477,
      "learning_rate": 1.5686274509803923e-06,
      "loss": 2.4535,
      "step": 40
    },
    {
      "epoch": 0.04906771344455348,
      "grad_norm": 42.18159866333008,
      "learning_rate": 1.96078431372549e-06,
      "loss": 3.8615,
      "step": 50
    },
    {
      "epoch": 0.058881256133464184,
      "grad_norm": 15.556477546691895,
      "learning_rate": 2.3529411764705885e-06,
      "loss": 2.6701,
      "step": 60
    },
    {
      "epoch": 0.06869479882237488,
      "grad_norm": 1009053.9375,
      "learning_rate": 2.7450980392156867e-06,
      "loss": 2.5866,
      "step": 70
    },
    {
      "epoch": 0.07850834151128558,
      "grad_norm": 3.3355813026428223,
      "learning_rate": 3.1372549019607846e-06,
      "loss": 2.1546,
      "step": 80
    },
    {
      "epoch": 0.08832188420019627,
      "grad_norm": 6364043.5,
      "learning_rate": 3.529411764705883e-06,
      "loss": 1.8961,
      "step": 90
    },
    {
      "epoch": 0.09813542688910697,
      "grad_norm": 10.409449577331543,
      "learning_rate": 3.92156862745098e-06,
      "loss": 3.8516,
      "step": 100
    },
    {
      "epoch": 0.10794896957801767,
      "grad_norm": 3035116.25,
      "learning_rate": 4.313725490196079e-06,
      "loss": 2.656,
      "step": 110
    },
    {
      "epoch": 0.11776251226692837,
      "grad_norm": 1568043.375,
      "learning_rate": 4.705882352941177e-06,
      "loss": 2.5056,
      "step": 120
    },
    {
      "epoch": 0.12757605495583907,
      "grad_norm": 2.928816795349121,
      "learning_rate": 5.098039215686274e-06,
      "loss": 1.4711,
      "step": 130
    },
    {
      "epoch": 0.13738959764474976,
      "grad_norm": 2664501.25,
      "learning_rate": 5.4901960784313735e-06,
      "loss": 2.4145,
      "step": 140
    },
    {
      "epoch": 0.14720314033366044,
      "grad_norm": 5433292.0,
      "learning_rate": 5.882352941176471e-06,
      "loss": 2.5401,
      "step": 150
    },
    {
      "epoch": 0.15701668302257116,
      "grad_norm": 2.9648523330688477,
      "learning_rate": 6.274509803921569e-06,
      "loss": 1.1487,
      "step": 160
    },
    {
      "epoch": 0.16683022571148184,
      "grad_norm": 3384641.25,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.5521,
      "step": 170
    },
    {
      "epoch": 0.17664376840039253,
      "grad_norm": 2175554.5,
      "learning_rate": 7.058823529411766e-06,
      "loss": 2.2009,
      "step": 180
    },
    {
      "epoch": 0.18645731108930325,
      "grad_norm": 4.447290897369385,
      "learning_rate": 7.450980392156863e-06,
      "loss": 1.6716,
      "step": 190
    },
    {
      "epoch": 0.19627085377821393,
      "grad_norm": 2.5263493061065674,
      "learning_rate": 7.84313725490196e-06,
      "loss": 3.2454,
      "step": 200
    },
    {
      "epoch": 0.20608439646712462,
      "grad_norm": 772904.75,
      "learning_rate": 8.23529411764706e-06,
      "loss": 1.622,
      "step": 210
    },
    {
      "epoch": 0.21589793915603533,
      "grad_norm": 3.8818535804748535,
      "learning_rate": 8.627450980392157e-06,
      "loss": 2.1061,
      "step": 220
    },
    {
      "epoch": 0.22571148184494602,
      "grad_norm": 4.158895015716553,
      "learning_rate": 9.019607843137256e-06,
      "loss": 1.7532,
      "step": 230
    },
    {
      "epoch": 0.23552502453385674,
      "grad_norm": 4.066924571990967,
      "learning_rate": 9.411764705882354e-06,
      "loss": 1.6545,
      "step": 240
    },
    {
      "epoch": 0.24533856722276742,
      "grad_norm": 1732917.625,
      "learning_rate": 9.803921568627451e-06,
      "loss": 2.3048,
      "step": 250
    },
    {
      "epoch": 0.25515210991167814,
      "grad_norm": 221879.953125,
      "learning_rate": 1e-05,
      "loss": 1.3068,
      "step": 260
    },
    {
      "epoch": 0.2649656526005888,
      "grad_norm": 859696.125,
      "learning_rate": 1e-05,
      "loss": 1.7565,
      "step": 270
    },
    {
      "epoch": 0.2747791952894995,
      "grad_norm": 3.6436195373535156,
      "learning_rate": 1e-05,
      "loss": 2.1635,
      "step": 280
    },
    {
      "epoch": 0.2845927379784102,
      "grad_norm": 5.748630046844482,
      "learning_rate": 1e-05,
      "loss": 1.678,
      "step": 290
    },
    {
      "epoch": 0.2944062806673209,
      "grad_norm": 493996.46875,
      "learning_rate": 1e-05,
      "loss": 2.1511,
      "step": 300
    },
    {
      "epoch": 0.3042198233562316,
      "grad_norm": 9.792704582214355,
      "learning_rate": 1e-05,
      "loss": 2.6151,
      "step": 310
    },
    {
      "epoch": 0.3140333660451423,
      "grad_norm": 1645629.125,
      "learning_rate": 1e-05,
      "loss": 1.4831,
      "step": 320
    },
    {
      "epoch": 0.323846908734053,
      "grad_norm": 4.4405131340026855,
      "learning_rate": 1e-05,
      "loss": 2.7356,
      "step": 330
    },
    {
      "epoch": 0.3336604514229637,
      "grad_norm": 8.055213928222656,
      "learning_rate": 1e-05,
      "loss": 1.9915,
      "step": 340
    },
    {
      "epoch": 0.3434739941118744,
      "grad_norm": 604711.25,
      "learning_rate": 1e-05,
      "loss": 1.7425,
      "step": 350
    },
    {
      "epoch": 0.35328753680078506,
      "grad_norm": 8.440975189208984,
      "learning_rate": 1e-05,
      "loss": 1.5921,
      "step": 360
    },
    {
      "epoch": 0.3631010794896958,
      "grad_norm": 1266034.375,
      "learning_rate": 1e-05,
      "loss": 1.0832,
      "step": 370
    },
    {
      "epoch": 0.3729146221786065,
      "grad_norm": 3.2711005210876465,
      "learning_rate": 1e-05,
      "loss": 1.0002,
      "step": 380
    },
    {
      "epoch": 0.38272816486751715,
      "grad_norm": 1738446.375,
      "learning_rate": 1e-05,
      "loss": 1.7899,
      "step": 390
    },
    {
      "epoch": 0.39254170755642787,
      "grad_norm": 10.065378189086914,
      "learning_rate": 1e-05,
      "loss": 1.942,
      "step": 400
    },
    {
      "epoch": 0.4023552502453386,
      "grad_norm": 839358.625,
      "learning_rate": 1e-05,
      "loss": 2.2337,
      "step": 410
    },
    {
      "epoch": 0.41216879293424924,
      "grad_norm": 820795.5,
      "learning_rate": 1e-05,
      "loss": 2.4434,
      "step": 420
    },
    {
      "epoch": 0.42198233562315995,
      "grad_norm": 760894.875,
      "learning_rate": 1e-05,
      "loss": 1.332,
      "step": 430
    },
    {
      "epoch": 0.43179587831207067,
      "grad_norm": 465132.90625,
      "learning_rate": 1e-05,
      "loss": 2.1238,
      "step": 440
    },
    {
      "epoch": 0.44160942100098133,
      "grad_norm": 151798.609375,
      "learning_rate": 1e-05,
      "loss": 2.0838,
      "step": 450
    },
    {
      "epoch": 0.45142296368989204,
      "grad_norm": 3.6023194789886475,
      "learning_rate": 1e-05,
      "loss": 1.5318,
      "step": 460
    },
    {
      "epoch": 0.46123650637880276,
      "grad_norm": 3.711779832839966,
      "learning_rate": 1e-05,
      "loss": 1.8415,
      "step": 470
    },
    {
      "epoch": 0.47105004906771347,
      "grad_norm": 3.6837337017059326,
      "learning_rate": 1e-05,
      "loss": 2.2333,
      "step": 480
    },
    {
      "epoch": 0.48086359175662413,
      "grad_norm": 2.7638938426971436,
      "learning_rate": 1e-05,
      "loss": 1.612,
      "step": 490
    },
    {
      "epoch": 0.49067713444553485,
      "grad_norm": 2.2806527614593506,
      "learning_rate": 1e-05,
      "loss": 2.4336,
      "step": 500
    },
    {
      "epoch": 0.5004906771344455,
      "grad_norm": 2.6325523853302,
      "learning_rate": 1e-05,
      "loss": 2.3051,
      "step": 510
    },
    {
      "epoch": 0.5103042198233563,
      "grad_norm": 4.162623882293701,
      "learning_rate": 1e-05,
      "loss": 2.5193,
      "step": 520
    },
    {
      "epoch": 0.5201177625122669,
      "grad_norm": 3.865851879119873,
      "learning_rate": 1e-05,
      "loss": 2.1113,
      "step": 530
    },
    {
      "epoch": 0.5299313052011776,
      "grad_norm": 3.6652672290802,
      "learning_rate": 1e-05,
      "loss": 2.605,
      "step": 540
    },
    {
      "epoch": 0.5397448478900884,
      "grad_norm": 1123418.0,
      "learning_rate": 1e-05,
      "loss": 2.367,
      "step": 550
    },
    {
      "epoch": 0.549558390578999,
      "grad_norm": 3.206057071685791,
      "learning_rate": 1e-05,
      "loss": 0.9706,
      "step": 560
    },
    {
      "epoch": 0.5593719332679097,
      "grad_norm": 3.8300833702087402,
      "learning_rate": 1e-05,
      "loss": 1.6688,
      "step": 570
    },
    {
      "epoch": 0.5691854759568205,
      "grad_norm": 3.4160726070404053,
      "learning_rate": 1e-05,
      "loss": 1.8959,
      "step": 580
    },
    {
      "epoch": 0.5789990186457311,
      "grad_norm": 6.991641044616699,
      "learning_rate": 1e-05,
      "loss": 2.8449,
      "step": 590
    },
    {
      "epoch": 0.5888125613346418,
      "grad_norm": 3.89111065864563,
      "learning_rate": 1e-05,
      "loss": 2.8364,
      "step": 600
    },
    {
      "epoch": 0.5986261040235525,
      "grad_norm": 12.52274227142334,
      "learning_rate": 1e-05,
      "loss": 2.3841,
      "step": 610
    },
    {
      "epoch": 0.6084396467124632,
      "grad_norm": 1124655.25,
      "learning_rate": 1e-05,
      "loss": 2.8931,
      "step": 620
    },
    {
      "epoch": 0.6182531894013739,
      "grad_norm": 2132181.75,
      "learning_rate": 1e-05,
      "loss": 1.8265,
      "step": 630
    },
    {
      "epoch": 0.6280667320902846,
      "grad_norm": 3.21681547164917,
      "learning_rate": 1e-05,
      "loss": 0.8137,
      "step": 640
    },
    {
      "epoch": 0.6378802747791953,
      "grad_norm": 1385230.375,
      "learning_rate": 1e-05,
      "loss": 1.2742,
      "step": 650
    },
    {
      "epoch": 0.647693817468106,
      "grad_norm": 10.80539321899414,
      "learning_rate": 1e-05,
      "loss": 3.0502,
      "step": 660
    },
    {
      "epoch": 0.6575073601570167,
      "grad_norm": 1592570.0,
      "learning_rate": 1e-05,
      "loss": 1.9121,
      "step": 670
    },
    {
      "epoch": 0.6673209028459274,
      "grad_norm": 985591.5625,
      "learning_rate": 1e-05,
      "loss": 1.8159,
      "step": 680
    },
    {
      "epoch": 0.677134445534838,
      "grad_norm": 1119573.375,
      "learning_rate": 1e-05,
      "loss": 1.9695,
      "step": 690
    },
    {
      "epoch": 0.6869479882237488,
      "grad_norm": 3.928929090499878,
      "learning_rate": 1e-05,
      "loss": 2.1545,
      "step": 700
    },
    {
      "epoch": 0.6967615309126595,
      "grad_norm": 998297.4375,
      "learning_rate": 1e-05,
      "loss": 1.2963,
      "step": 710
    },
    {
      "epoch": 0.7065750736015701,
      "grad_norm": 3.8201591968536377,
      "learning_rate": 1e-05,
      "loss": 0.9735,
      "step": 720
    },
    {
      "epoch": 0.7163886162904809,
      "grad_norm": 3.7799386978149414,
      "learning_rate": 1e-05,
      "loss": 1.5274,
      "step": 730
    },
    {
      "epoch": 0.7262021589793916,
      "grad_norm": 3.718870162963867,
      "learning_rate": 1e-05,
      "loss": 2.9676,
      "step": 740
    },
    {
      "epoch": 0.7360157016683022,
      "grad_norm": 4.023947715759277,
      "learning_rate": 1e-05,
      "loss": 1.3345,
      "step": 750
    },
    {
      "epoch": 0.745829244357213,
      "grad_norm": 14.283628463745117,
      "learning_rate": 1e-05,
      "loss": 2.7141,
      "step": 760
    }
  ],
  "logging_steps": 10,
  "max_steps": 1019,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 255,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}