{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15808974886784685, "eval_steps": 500, "global_step": 12, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013174145738987238, "grad_norm": 0.4815484285354614, "learning_rate": 6.25e-08, "logits/chosen": 10.088521957397461, "logits/rejected": 10.263787269592285, "logps/chosen": -163.12940979003906, "logps/ref_chosen": -163.12940979003906, "logps/ref_rejected": -171.48428344726562, "logps/rejected": -171.48428344726562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.026348291477974475, "grad_norm": 0.627070426940918, "learning_rate": 1.25e-07, "logits/chosen": 10.592972755432129, "logits/rejected": 10.720216751098633, "logps/chosen": -155.91574096679688, "logps/ref_chosen": -155.91574096679688, "logps/ref_rejected": -161.34078979492188, "logps/rejected": -161.34078979492188, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.03952243721696171, "grad_norm": 0.4148138165473938, "learning_rate": 1.875e-07, "logits/chosen": 10.043272972106934, "logits/rejected": 10.398024559020996, "logps/chosen": -158.3568115234375, "logps/ref_chosen": -157.65640258789062, "logps/ref_rejected": -168.5882110595703, "logps/rejected": -168.91085815429688, "loss": 0.6951, "rewards/accuracies": 0.3828125, "rewards/chosen": -0.007004000246524811, "rewards/margins": -0.0037774655502289534, "rewards/rejected": -0.003226534929126501, "step": 3 }, { "epoch": 0.05269658295594895, "grad_norm": 0.7029770612716675, "learning_rate": 2.5e-07, "logits/chosen": 10.250253677368164, "logits/rejected": 10.45008659362793, "logps/chosen": -164.01119995117188, "logps/ref_chosen": -162.89878845214844, "logps/ref_rejected": -168.30462646484375, "logps/rejected": -169.1818389892578, "loss": 0.6944, "rewards/accuracies": 0.453125, "rewards/chosen": -0.01112416572868824, "rewards/margins": -0.0023521997500211, "rewards/rejected": -0.008771965280175209, "step": 4 }, { "epoch": 0.06587072869493618, "grad_norm": 0.4063253104686737, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 10.442557334899902, "logits/rejected": 10.740192413330078, "logps/chosen": -156.1859130859375, "logps/ref_chosen": -156.03257751464844, "logps/ref_rejected": -165.37911987304688, "logps/rejected": -165.6518096923828, "loss": 0.6926, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.001533512957394123, "rewards/margins": 0.001193464733660221, "rewards/rejected": -0.002726977691054344, "step": 5 }, { "epoch": 0.07904487443392343, "grad_norm": 0.4845049977302551, "learning_rate": 3.75e-07, "logits/chosen": 10.906261444091797, "logits/rejected": 11.201122283935547, "logps/chosen": -162.45692443847656, "logps/ref_chosen": -161.98570251464844, "logps/ref_rejected": -169.72560119628906, "logps/rejected": -170.18275451660156, "loss": 0.6932, "rewards/accuracies": 0.515625, "rewards/chosen": -0.004712029360234737, "rewards/margins": -0.00014030117017682642, "rewards/rejected": -0.004571728408336639, "step": 6 }, { "epoch": 0.09221902017291066, "grad_norm": 0.8172655701637268, "learning_rate": 4.375e-07, "logits/chosen": 9.883949279785156, "logits/rejected": 10.030972480773926, "logps/chosen": -157.43295288085938, "logps/ref_chosen": -157.26968383789062, "logps/ref_rejected": -167.37155151367188, "logps/rejected": -167.53939819335938, "loss": 0.6931, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0016327811172232032, "rewards/margins": 4.5756096369586885e-05, "rewards/rejected": -0.0016785369953140616, "step": 7 }, { "epoch": 0.1053931659118979, "grad_norm": 0.588524341583252, "learning_rate": 5e-07, "logits/chosen": 10.633930206298828, "logits/rejected": 10.81590747833252, "logps/chosen": -162.8237762451172, "logps/ref_chosen": -162.2948455810547, "logps/ref_rejected": -172.98866271972656, "logps/rejected": -173.56680297851562, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.005289244465529919, "rewards/margins": 0.0004922347725369036, "rewards/rejected": -0.005781479645520449, "step": 8 }, { "epoch": 0.11856731165088513, "grad_norm": 0.46077635884284973, "learning_rate": 4.997252228714278e-07, "logits/chosen": 10.326555252075195, "logits/rejected": 10.736672401428223, "logps/chosen": -164.5288543701172, "logps/ref_chosen": -163.37091064453125, "logps/ref_rejected": -173.1500701904297, "logps/rejected": -174.08392333984375, "loss": 0.6943, "rewards/accuracies": 0.4375, "rewards/chosen": -0.011579334735870361, "rewards/margins": -0.0022407739888876677, "rewards/rejected": -0.00933856051415205, "step": 9 }, { "epoch": 0.13174145738987236, "grad_norm": 0.673312783241272, "learning_rate": 4.989014955054745e-07, "logits/chosen": 10.325155258178711, "logits/rejected": 10.473593711853027, "logps/chosen": -157.8944091796875, "logps/ref_chosen": -156.87838745117188, "logps/ref_rejected": -165.17373657226562, "logps/rejected": -166.20751953125, "loss": 0.6931, "rewards/accuracies": 0.546875, "rewards/chosen": -0.010160216130316257, "rewards/margins": 0.00017760891932994127, "rewards/rejected": -0.010337824933230877, "step": 10 }, { "epoch": 0.14491560312885962, "grad_norm": 0.6500194668769836, "learning_rate": 4.975306286336627e-07, "logits/chosen": 10.476134300231934, "logits/rejected": 10.66375732421875, "logps/chosen": -161.99935913085938, "logps/ref_chosen": -160.73855590820312, "logps/ref_rejected": -173.1862030029297, "logps/rejected": -174.4076385498047, "loss": 0.6934, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0126079972833395, "rewards/margins": -0.00039388981531374156, "rewards/rejected": -0.012214107438921928, "step": 11 }, { "epoch": 0.15808974886784685, "grad_norm": 0.5539909601211548, "learning_rate": 4.956156357188939e-07, "logits/chosen": 10.318845748901367, "logits/rejected": 10.355680465698242, "logps/chosen": -167.43121337890625, "logps/ref_chosen": -165.21177673339844, "logps/ref_rejected": -170.47381591796875, "logps/rejected": -172.76483154296875, "loss": 0.6928, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.022194450721144676, "rewards/margins": 0.0007156741339713335, "rewards/rejected": -0.02291012369096279, "step": 12 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }