{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.957345971563981, "eval_steps": 100, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 1063.9925388773709, "learning_rate": 3.125e-08, "logits/chosen": 123.11854553222656, "logits/rejected": 97.00198364257812, "logps/chosen": -425.18585205078125, "logps/rejected": -424.1869201660156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.19, "grad_norm": 1254.7036040526557, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 117.43434143066406, "logits/rejected": 136.35675048828125, "logps/chosen": -442.97802734375, "logps/rejected": -524.6129760742188, "loss": 1.564, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 0.16415566205978394, "rewards/margins": 0.36335471272468567, "rewards/rejected": -0.19919908046722412, "step": 10 }, { "epoch": 0.38, "grad_norm": 909.9038063820053, "learning_rate": 4.989935734988097e-07, "logits/chosen": 126.66890716552734, "logits/rejected": 134.35414123535156, "logps/chosen": -426.7857360839844, "logps/rejected": -491.2925720214844, "loss": 1.3477, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2084747850894928, "rewards/margins": 1.3043320178985596, "rewards/rejected": -1.51280677318573, "step": 20 }, { "epoch": 0.57, "grad_norm": 821.3627225843074, "learning_rate": 4.877641290737883e-07, "logits/chosen": 125.84306335449219, "logits/rejected": 129.29446411132812, "logps/chosen": -467.2300720214844, "logps/rejected": -528.94189453125, "loss": 1.4491, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.843996524810791, "rewards/margins": 3.846839189529419, "rewards/rejected": -8.690834999084473, "step": 30 }, { "epoch": 0.76, "grad_norm": 865.3817040985649, "learning_rate": 4.646121984004665e-07, "logits/chosen": 127.130859375, "logits/rejected": 122.1098861694336, "logps/chosen": -485.9337463378906, "logps/rejected": -506.68548583984375, "loss": 1.4982, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.1009392738342285, "rewards/margins": 3.6673903465270996, "rewards/rejected": -8.768329620361328, "step": 40 }, { "epoch": 0.95, "grad_norm": 907.7995009069369, "learning_rate": 4.3069871595684787e-07, "logits/chosen": 133.64224243164062, "logits/rejected": 133.93919372558594, "logps/chosen": -493.3519592285156, "logps/rejected": -511.6605529785156, "loss": 1.3238, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.211705684661865, "rewards/margins": 3.953176975250244, "rewards/rejected": -8.164883613586426, "step": 50 }, { "epoch": 1.14, "grad_norm": 320.5550768420106, "learning_rate": 3.877242453630256e-07, "logits/chosen": 133.02821350097656, "logits/rejected": 136.251953125, "logps/chosen": -460.30291748046875, "logps/rejected": -494.0633239746094, "loss": 0.5962, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.4149787425994873, "rewards/margins": 10.490567207336426, "rewards/rejected": -11.905545234680176, "step": 60 }, { "epoch": 1.33, "grad_norm": 427.3414833642942, "learning_rate": 3.378437060203357e-07, "logits/chosen": 129.8929901123047, "logits/rejected": 130.46600341796875, "logps/chosen": -432.332275390625, "logps/rejected": -534.7671508789062, "loss": 0.1911, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.3673985004425049, "rewards/margins": 11.68455696105957, "rewards/rejected": -13.051956176757812, "step": 70 }, { "epoch": 1.52, "grad_norm": 381.35819492011535, "learning_rate": 2.8355831645441387e-07, "logits/chosen": 135.67372131347656, "logits/rejected": 136.30862426757812, "logps/chosen": -487.7591247558594, "logps/rejected": -561.80712890625, "loss": 0.2207, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5364077091217041, "rewards/margins": 14.564852714538574, "rewards/rejected": -14.02844524383545, "step": 80 }, { "epoch": 1.71, "grad_norm": 263.6170847100913, "learning_rate": 2.2759017277414164e-07, "logits/chosen": 122.95021057128906, "logits/rejected": 125.04380798339844, "logps/chosen": -465.0882873535156, "logps/rejected": -521.892578125, "loss": 0.2059, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1682957410812378, "rewards/margins": 13.839956283569336, "rewards/rejected": -15.00825309753418, "step": 90 }, { "epoch": 1.9, "grad_norm": 498.9305336886761, "learning_rate": 1.7274575140626315e-07, "logits/chosen": 137.44198608398438, "logits/rejected": 127.8071060180664, "logps/chosen": -482.68829345703125, "logps/rejected": -564.5560913085938, "loss": 0.2569, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.7192827463150024, "rewards/margins": 14.991134643554688, "rewards/rejected": -16.710416793823242, "step": 100 }, { "epoch": 1.9, "eval_logits/chosen": 105.28992462158203, "eval_logits/rejected": 99.2330093383789, "eval_logps/chosen": -470.2108459472656, "eval_logps/rejected": -482.4095153808594, "eval_loss": 1.0775203704833984, "eval_rewards/accuracies": 0.7395833134651184, "eval_rewards/chosen": -6.722555160522461, "eval_rewards/margins": 5.621420383453369, "eval_rewards/rejected": -12.343975067138672, "eval_runtime": 52.5735, "eval_samples_per_second": 14.266, "eval_steps_per_second": 0.457, "step": 100 }, { "epoch": 2.09, "grad_norm": 312.21164489149646, "learning_rate": 1.2177518064852348e-07, "logits/chosen": 116.9559555053711, "logits/rejected": 130.40074157714844, "logps/chosen": -501.41314697265625, "logps/rejected": -597.8336181640625, "loss": 0.1917, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.5299580097198486, "rewards/margins": 14.8624906539917, "rewards/rejected": -17.3924503326416, "step": 110 }, { "epoch": 2.27, "grad_norm": 88.0209774984605, "learning_rate": 7.723433775328384e-08, "logits/chosen": 128.97409057617188, "logits/rejected": 129.96273803710938, "logps/chosen": -482.223876953125, "logps/rejected": -545.4796752929688, "loss": 0.054, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9787194728851318, "rewards/margins": 14.918545722961426, "rewards/rejected": -16.897266387939453, "step": 120 }, { "epoch": 2.46, "grad_norm": 195.2044766984358, "learning_rate": 4.1356686569674335e-08, "logits/chosen": 134.0525665283203, "logits/rejected": 139.18789672851562, "logps/chosen": -496.6250915527344, "logps/rejected": -565.2105712890625, "loss": 0.0788, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1465753316879272, "rewards/margins": 14.6506986618042, "rewards/rejected": -15.797274589538574, "step": 130 }, { "epoch": 2.65, "grad_norm": 133.44761939021552, "learning_rate": 1.5941282340065697e-08, "logits/chosen": 119.77888488769531, "logits/rejected": 119.9384536743164, "logps/chosen": -450.76904296875, "logps/rejected": -552.9923095703125, "loss": 0.0613, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1010565757751465, "rewards/margins": 16.25819206237793, "rewards/rejected": -18.359249114990234, "step": 140 }, { "epoch": 2.84, "grad_norm": 48.801935911090936, "learning_rate": 2.2625595580163247e-09, "logits/chosen": 127.36897277832031, "logits/rejected": 140.77224731445312, "logps/chosen": -477.6751403808594, "logps/rejected": -549.9277954101562, "loss": 0.0601, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1382229328155518, "rewards/margins": 15.574376106262207, "rewards/rejected": -16.712596893310547, "step": 150 }, { "epoch": 2.96, "step": 156, "total_flos": 0.0, "train_loss": 0.5786063394103295, "train_runtime": 1791.9811, "train_samples_per_second": 11.3, "train_steps_per_second": 0.087 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }