{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3161794977356937, "eval_steps": 500, "global_step": 24, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013174145738987238, "grad_norm": 0.5102696418762207, "learning_rate": 1.25e-07, "logits/chosen": 10.088521957397461, "logits/rejected": 10.263787269592285, "logps/chosen": -0.9118157029151917, "logps/rejected": -0.9621729850769043, "loss": 1.3897, "rewards/accuracies": 0.5234375, "rewards/chosen": -1.8236314058303833, "rewards/margins": 0.10071463882923126, "rewards/rejected": -1.9243459701538086, "step": 1 }, { "epoch": 0.026348291477974475, "grad_norm": 0.9815747141838074, "learning_rate": 2.5e-07, "logits/chosen": 10.592972755432129, "logits/rejected": 10.720216751098633, "logps/chosen": -0.945902943611145, "logps/rejected": -1.0317902565002441, "loss": 1.3077, "rewards/accuracies": 0.609375, "rewards/chosen": -1.89180588722229, "rewards/margins": 0.1717745065689087, "rewards/rejected": -2.0635805130004883, "step": 2 }, { "epoch": 0.03952243721696171, "grad_norm": 0.9049758315086365, "learning_rate": 3.75e-07, "logits/chosen": 10.041976928710938, "logits/rejected": 10.399316787719727, "logps/chosen": -1.0869810581207275, "logps/rejected": -1.1895216703414917, "loss": 1.346, "rewards/accuracies": 0.578125, "rewards/chosen": -2.173962116241455, "rewards/margins": 0.20508113503456116, "rewards/rejected": -2.3790433406829834, "step": 3 }, { "epoch": 0.05269658295594895, "grad_norm": 1.8911848068237305, "learning_rate": 5e-07, "logits/chosen": 10.243470191955566, "logits/rejected": 10.443375587463379, "logps/chosen": -0.966098427772522, "logps/rejected": -1.0040662288665771, "loss": 1.4032, "rewards/accuracies": 0.546875, "rewards/chosen": -1.932196855545044, "rewards/margins": 0.07593552023172379, "rewards/rejected": -2.0081324577331543, "step": 4 }, { "epoch": 0.06587072869493618, "grad_norm": 0.6135074496269226, "learning_rate": 6.249999999999999e-07, "logits/chosen": 10.439040184020996, "logits/rejected": 10.739177703857422, "logps/chosen": -0.9262609481811523, "logps/rejected": -0.9657196998596191, "loss": 1.3727, "rewards/accuracies": 0.5546875, "rewards/chosen": -1.8525218963623047, "rewards/margins": 0.07891744375228882, "rewards/rejected": -1.9314393997192383, "step": 5 }, { "epoch": 0.07904487443392343, "grad_norm": 0.5990542769432068, "learning_rate": 7.5e-07, "logits/chosen": 10.910269737243652, "logits/rejected": 11.204473495483398, "logps/chosen": -0.9439595341682434, "logps/rejected": -1.0420396327972412, "loss": 1.3491, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8879190683364868, "rewards/margins": 0.196160227060318, "rewards/rejected": -2.0840792655944824, "step": 6 }, { "epoch": 0.09221902017291066, "grad_norm": 1.3676807880401611, "learning_rate": 8.75e-07, "logits/chosen": 9.873465538024902, "logits/rejected": 10.022269248962402, "logps/chosen": -0.8941428661346436, "logps/rejected": -1.0010743141174316, "loss": 1.3507, "rewards/accuracies": 0.625, "rewards/chosen": -1.788285732269287, "rewards/margins": 0.21386288106441498, "rewards/rejected": -2.0021486282348633, "step": 7 }, { "epoch": 0.1053931659118979, "grad_norm": 1.7690002918243408, "learning_rate": 1e-06, "logits/chosen": 10.597719192504883, "logits/rejected": 10.780376434326172, "logps/chosen": -0.9080270528793335, "logps/rejected": -0.9909782409667969, "loss": 1.3305, "rewards/accuracies": 0.640625, "rewards/chosen": -1.816054105758667, "rewards/margins": 0.16590236127376556, "rewards/rejected": -1.9819564819335938, "step": 8 }, { "epoch": 0.11856731165088513, "grad_norm": 1.056247353553772, "learning_rate": 9.994504457428556e-07, "logits/chosen": 10.446786880493164, "logits/rejected": 10.839168548583984, "logps/chosen": -1.1091859340667725, "logps/rejected": -1.0694739818572998, "loss": 1.5127, "rewards/accuracies": 0.5390625, "rewards/chosen": -2.218371868133545, "rewards/margins": -0.07942387461662292, "rewards/rejected": -2.1389479637145996, "step": 9 }, { "epoch": 0.13174145738987236, "grad_norm": 2.076240062713623, "learning_rate": 9.97802991010949e-07, "logits/chosen": 10.343971252441406, "logits/rejected": 10.492179870605469, "logps/chosen": -0.9705042839050293, "logps/rejected": -0.9916192889213562, "loss": 1.4611, "rewards/accuracies": 0.53125, "rewards/chosen": -1.9410085678100586, "rewards/margins": 0.04223020374774933, "rewards/rejected": -1.9832385778427124, "step": 10 }, { "epoch": 0.14491560312885962, "grad_norm": 1.13358736038208, "learning_rate": 9.950612572673255e-07, "logits/chosen": 10.49313735961914, "logits/rejected": 10.680143356323242, "logps/chosen": -1.1081148386001587, "logps/rejected": -1.223841667175293, "loss": 1.3449, "rewards/accuracies": 0.59375, "rewards/chosen": -2.2162296772003174, "rewards/margins": 0.23145350813865662, "rewards/rejected": -2.447683334350586, "step": 11 }, { "epoch": 0.15808974886784685, "grad_norm": 1.1638388633728027, "learning_rate": 9.912312714377879e-07, "logits/chosen": 10.328557014465332, "logits/rejected": 10.365793228149414, "logps/chosen": -0.9249637722969055, "logps/rejected": -0.9842618703842163, "loss": 1.351, "rewards/accuracies": 0.6015625, "rewards/chosen": -1.849927544593811, "rewards/margins": 0.11859625577926636, "rewards/rejected": -1.9685237407684326, "step": 12 }, { "epoch": 0.17126389460683408, "grad_norm": 0.9941473007202148, "learning_rate": 9.863214526624063e-07, "logits/chosen": 9.909621238708496, "logits/rejected": 10.248769760131836, "logps/chosen": -0.9913480877876282, "logps/rejected": -1.1752512454986572, "loss": 1.2767, "rewards/accuracies": 0.5703125, "rewards/chosen": -1.9826961755752563, "rewards/margins": 0.36780619621276855, "rewards/rejected": -2.3505024909973145, "step": 13 }, { "epoch": 0.1844380403458213, "grad_norm": 1.3600269556045532, "learning_rate": 9.8034259378842e-07, "logits/chosen": 10.472145080566406, "logits/rejected": 10.987956047058105, "logps/chosen": -0.9751205444335938, "logps/rejected": -1.0532664060592651, "loss": 1.3626, "rewards/accuracies": 0.578125, "rewards/chosen": -1.9502410888671875, "rewards/margins": 0.15629185736179352, "rewards/rejected": -2.1065328121185303, "step": 14 }, { "epoch": 0.19761218608480857, "grad_norm": 0.3477364182472229, "learning_rate": 9.73307837645217e-07, "logits/chosen": 10.209980010986328, "logits/rejected": 10.457592964172363, "logps/chosen": -0.9716652035713196, "logps/rejected": -1.0775285959243774, "loss": 1.3132, "rewards/accuracies": 0.5859375, "rewards/chosen": -1.9433304071426392, "rewards/margins": 0.2117268592119217, "rewards/rejected": -2.155057191848755, "step": 15 }, { "epoch": 0.2107863318237958, "grad_norm": 0.975040853023529, "learning_rate": 9.652326481535433e-07, "logits/chosen": 10.770889282226562, "logits/rejected": 11.057292938232422, "logps/chosen": -0.9405269026756287, "logps/rejected": -0.9816387891769409, "loss": 1.4142, "rewards/accuracies": 0.5078125, "rewards/chosen": -1.8810538053512573, "rewards/margins": 0.08222392201423645, "rewards/rejected": -1.9632775783538818, "step": 16 }, { "epoch": 0.22396047756278303, "grad_norm": 0.47477808594703674, "learning_rate": 9.561347763324483e-07, "logits/chosen": 10.384443283081055, "logits/rejected": 10.546278953552246, "logps/chosen": -0.9655594229698181, "logps/rejected": -0.9963297247886658, "loss": 1.4058, "rewards/accuracies": 0.578125, "rewards/chosen": -1.9311188459396362, "rewards/margins": 0.061540693044662476, "rewards/rejected": -1.9926594495773315, "step": 17 }, { "epoch": 0.23713462330177026, "grad_norm": 0.9369856119155884, "learning_rate": 9.460342212786932e-07, "logits/chosen": 10.428518295288086, "logits/rejected": 10.742942810058594, "logps/chosen": -1.0061042308807373, "logps/rejected": -0.9558196067810059, "loss": 1.5279, "rewards/accuracies": 0.5390625, "rewards/chosen": -2.0122084617614746, "rewards/margins": -0.10056903213262558, "rewards/rejected": -1.9116392135620117, "step": 18 }, { "epoch": 0.2503087690407575, "grad_norm": 0.6867318153381348, "learning_rate": 9.349531862043951e-07, "logits/chosen": 10.536978721618652, "logits/rejected": 10.496305465698242, "logps/chosen": -1.0390187501907349, "logps/rejected": -1.1175179481506348, "loss": 1.3199, "rewards/accuracies": 0.6953125, "rewards/chosen": -2.0780375003814697, "rewards/margins": 0.1569983810186386, "rewards/rejected": -2.2350358963012695, "step": 19 }, { "epoch": 0.2634829147797447, "grad_norm": 1.6277413368225098, "learning_rate": 9.229160296295487e-07, "logits/chosen": 10.487991333007812, "logits/rejected": 10.849261283874512, "logps/chosen": -1.005416989326477, "logps/rejected": -1.0715974569320679, "loss": 1.3772, "rewards/accuracies": 0.6328125, "rewards/chosen": -2.010833978652954, "rewards/margins": 0.1323607861995697, "rewards/rejected": -2.1431949138641357, "step": 20 }, { "epoch": 0.276657060518732, "grad_norm": 1.1200292110443115, "learning_rate": 9.099492118367122e-07, "logits/chosen": 10.419047355651855, "logits/rejected": 10.756099700927734, "logps/chosen": -0.9289014935493469, "logps/rejected": -1.020794153213501, "loss": 1.3128, "rewards/accuracies": 0.625, "rewards/chosen": -1.8578029870986938, "rewards/margins": 0.18378500640392303, "rewards/rejected": -2.041588306427002, "step": 21 }, { "epoch": 0.28983120625771924, "grad_norm": 1.3312275409698486, "learning_rate": 8.960812367055646e-07, "logits/chosen": 10.375129699707031, "logits/rejected": 10.722561836242676, "logps/chosen": -1.0812478065490723, "logps/rejected": -1.104426383972168, "loss": 1.438, "rewards/accuracies": 0.5703125, "rewards/chosen": -2.1624956130981445, "rewards/margins": 0.04635699465870857, "rewards/rejected": -2.208852767944336, "step": 22 }, { "epoch": 0.3030053519967065, "grad_norm": 0.36184069514274597, "learning_rate": 8.813425890551909e-07, "logits/chosen": 10.423131942749023, "logits/rejected": 10.69787311553955, "logps/chosen": -1.0428340435028076, "logps/rejected": -1.0485754013061523, "loss": 1.4543, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0856680870056152, "rewards/margins": 0.01148274727165699, "rewards/rejected": -2.0971508026123047, "step": 23 }, { "epoch": 0.3161794977356937, "grad_norm": 0.5770995020866394, "learning_rate": 8.657656676318345e-07, "logits/chosen": 10.316368103027344, "logits/rejected": 10.479074478149414, "logps/chosen": -0.9681622982025146, "logps/rejected": -1.050377607345581, "loss": 1.3743, "rewards/accuracies": 0.515625, "rewards/chosen": -1.9363245964050293, "rewards/margins": 0.16443049907684326, "rewards/rejected": -2.100755214691162, "step": 24 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }