diff --git "a/MedReason-Med-REFL-LoraAdapter/trainer_state.json" "b/MedReason-Med-REFL-LoraAdapter/trainer_state.json" new file mode 100644--- /dev/null +++ "b/MedReason-Med-REFL-LoraAdapter/trainer_state.json" @@ -0,0 +1,4220 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997605937275557, + "eval_steps": 500, + "global_step": 261, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0038305003591094086, + "grad_norm": 1.0148344039916992, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -0.8676647543907166, + "logits/rejected": -0.8620176911354065, + "logps/chosen": -931.5811157226562, + "logps/rejected": -954.55126953125, + "loss": 0.6931, + "num_input_tokens_seen": 362496, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.007661000718218817, + "grad_norm": 1.0636553764343262, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -0.8708184361457825, + "logits/rejected": -0.8565943241119385, + "logps/chosen": -926.818359375, + "logps/rejected": -922.3740234375, + "loss": 0.6931, + "num_input_tokens_seen": 718144, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.011491501077328227, + "grad_norm": 1.0695427656173706, + "learning_rate": 3e-06, + "logits/chosen": -0.863418698310852, + "logits/rejected": -0.8498145341873169, + "logps/chosen": -947.353271484375, + "logps/rejected": -936.1754150390625, + "loss": 0.6962, + "num_input_tokens_seen": 1075392, + "rewards/accuracies": 0.5234375, + "rewards/chosen": 0.034177109599113464, + "rewards/margins": -0.001909733284264803, + "rewards/rejected": 0.03608684986829758, + "step": 3 + }, + { + "epoch": 0.015322001436437634, + "grad_norm": 1.0633010864257812, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -0.8631210923194885, + "logits/rejected": -0.8552025556564331, + "logps/chosen": -923.91845703125, + "logps/rejected": -927.66748046875, + "loss": 0.6938, + "num_input_tokens_seen": 1430080, + "rewards/accuracies": 0.5390625, + "rewards/chosen": 0.0004860623739659786, + "rewards/margins": 0.0031057593878358603, + "rewards/rejected": -0.0026196949183940887, + "step": 4 + }, + { + "epoch": 0.019152501795547044, + "grad_norm": 0.9670782685279846, + "learning_rate": 5e-06, + "logits/chosen": -0.8592735528945923, + "logits/rejected": -0.8517563939094543, + "logps/chosen": -976.3739624023438, + "logps/rejected": -981.5032958984375, + "loss": 0.6831, + "num_input_tokens_seen": 1786304, + "rewards/accuracies": 0.5859375, + "rewards/chosen": 0.013646865263581276, + "rewards/margins": 0.02478773519396782, + "rewards/rejected": -0.011140871793031693, + "step": 5 + }, + { + "epoch": 0.022983002154656453, + "grad_norm": 1.0966469049453735, + "learning_rate": 6e-06, + "logits/chosen": -0.8702839612960815, + "logits/rejected": -0.8596780300140381, + "logps/chosen": -912.2633666992188, + "logps/rejected": -912.9552001953125, + "loss": 0.7014, + "num_input_tokens_seen": 2132544, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.006884288042783737, + "rewards/margins": -0.011900162324309349, + "rewards/rejected": 0.005015874281525612, + "step": 6 + }, + { + "epoch": 0.02681350251376586, + "grad_norm": 1.0480536222457886, + "learning_rate": 7e-06, + "logits/chosen": -0.8704050183296204, + "logits/rejected": -0.8629523515701294, + "logps/chosen": -924.78125, + "logps/rejected": -920.9426879882812, + "loss": 0.6961, + "num_input_tokens_seen": 2488832, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.009704709984362125, + "rewards/margins": -0.000337100587785244, + "rewards/rejected": -0.009367610327899456, + "step": 7 + }, + { + "epoch": 0.03064400287287527, + "grad_norm": 1.1411088705062866, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": -0.8697146773338318, + "logits/rejected": -0.8652409911155701, + "logps/chosen": -954.1392211914062, + "logps/rejected": -954.830322265625, + "loss": 0.6955, + "num_input_tokens_seen": 2852544, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.006894397083669901, + "rewards/margins": -0.0002739670453593135, + "rewards/rejected": -0.00662043085321784, + "step": 8 + }, + { + "epoch": 0.03447450323198468, + "grad_norm": 1.0232956409454346, + "learning_rate": 9e-06, + "logits/chosen": -0.8681120276451111, + "logits/rejected": -0.8613412380218506, + "logps/chosen": -995.03857421875, + "logps/rejected": -979.0482788085938, + "loss": 0.6871, + "num_input_tokens_seen": 3220608, + "rewards/accuracies": 0.546875, + "rewards/chosen": 0.005396081134676933, + "rewards/margins": 0.01677885092794895, + "rewards/rejected": -0.011382771655917168, + "step": 9 + }, + { + "epoch": 0.03830500359109409, + "grad_norm": 1.0501480102539062, + "learning_rate": 1e-05, + "logits/chosen": -0.8518915176391602, + "logits/rejected": -0.847193717956543, + "logps/chosen": -982.511474609375, + "logps/rejected": -954.72021484375, + "loss": 0.6906, + "num_input_tokens_seen": 3592128, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.01743159256875515, + "rewards/margins": 0.009594391100108624, + "rewards/rejected": -0.027025986462831497, + "step": 10 + }, + { + "epoch": 0.0421355039502035, + "grad_norm": 1.0379517078399658, + "learning_rate": 9.999608360361114e-06, + "logits/chosen": -0.8713157176971436, + "logits/rejected": -0.8774940967559814, + "logps/chosen": -986.1469116210938, + "logps/rejected": -973.7518310546875, + "loss": 0.6908, + "num_input_tokens_seen": 3971392, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.012122892774641514, + "rewards/margins": 0.009276675060391426, + "rewards/rejected": -0.021399570629000664, + "step": 11 + }, + { + "epoch": 0.045966004309312906, + "grad_norm": 1.145377278327942, + "learning_rate": 9.998433502797097e-06, + "logits/chosen": -0.8562194108963013, + "logits/rejected": -0.8570854663848877, + "logps/chosen": -974.7759399414062, + "logps/rejected": -966.2169189453125, + "loss": 0.6946, + "num_input_tokens_seen": 4327552, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.015546893700957298, + "rewards/margins": 0.0011881589889526367, + "rewards/rejected": -0.016735052689909935, + "step": 12 + }, + { + "epoch": 0.049796504668422316, + "grad_norm": 1.0811679363250732, + "learning_rate": 9.996475611356265e-06, + "logits/chosen": -0.8627577424049377, + "logits/rejected": -0.8670358657836914, + "logps/chosen": -983.03662109375, + "logps/rejected": -946.1083984375, + "loss": 0.6772, + "num_input_tokens_seen": 4701568, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.014124488458037376, + "rewards/margins": 0.038048554211854935, + "rewards/rejected": -0.05217304825782776, + "step": 13 + }, + { + "epoch": 0.05362700502753172, + "grad_norm": 1.1433998346328735, + "learning_rate": 9.993734992753777e-06, + "logits/chosen": -0.8575330972671509, + "logits/rejected": -0.8639339804649353, + "logps/chosen": -986.741943359375, + "logps/rejected": -945.912353515625, + "loss": 0.6716, + "num_input_tokens_seen": 5056448, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.001568555599078536, + "rewards/margins": 0.04844195768237114, + "rewards/rejected": -0.05001051723957062, + "step": 14 + }, + { + "epoch": 0.05745750538664113, + "grad_norm": 1.1800273656845093, + "learning_rate": 9.990212076323587e-06, + "logits/chosen": -0.8705130815505981, + "logits/rejected": -0.8687374591827393, + "logps/chosen": -935.5665283203125, + "logps/rejected": -934.8147583007812, + "loss": 0.6834, + "num_input_tokens_seen": 5405056, + "rewards/accuracies": 0.5546875, + "rewards/chosen": -0.013772297650575638, + "rewards/margins": 0.024125196039676666, + "rewards/rejected": -0.037897489964962006, + "step": 15 + }, + { + "epoch": 0.06128800574575054, + "grad_norm": 1.068210244178772, + "learning_rate": 9.98590741395118e-06, + "logits/chosen": -0.8617660999298096, + "logits/rejected": -0.8488986492156982, + "logps/chosen": -904.996826171875, + "logps/rejected": -926.2788696289062, + "loss": 0.673, + "num_input_tokens_seen": 5758464, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.009490394964814186, + "rewards/margins": 0.04549403488636017, + "rewards/rejected": -0.054984427988529205, + "step": 16 + }, + { + "epoch": 0.06511850610485995, + "grad_norm": 1.105748176574707, + "learning_rate": 9.980821679987125e-06, + "logits/chosen": -0.8635365962982178, + "logits/rejected": -0.8617705702781677, + "logps/chosen": -961.701171875, + "logps/rejected": -941.8101806640625, + "loss": 0.6778, + "num_input_tokens_seen": 6116480, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.026325415819883347, + "rewards/margins": 0.03649773821234703, + "rewards/rejected": -0.06282316148281097, + "step": 17 + }, + { + "epoch": 0.06894900646396936, + "grad_norm": 1.0891010761260986, + "learning_rate": 9.974955671141425e-06, + "logits/chosen": -0.8440446853637695, + "logits/rejected": -0.827153205871582, + "logps/chosen": -995.2553100585938, + "logps/rejected": -1013.8685302734375, + "loss": 0.6699, + "num_input_tokens_seen": 6482624, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.0219649076461792, + "rewards/margins": 0.05287675932049751, + "rewards/rejected": -0.07484166324138641, + "step": 18 + }, + { + "epoch": 0.07277950682307877, + "grad_norm": 1.1034226417541504, + "learning_rate": 9.968310306358715e-06, + "logits/chosen": -0.8631645441055298, + "logits/rejected": -0.8484251499176025, + "logps/chosen": -948.0541381835938, + "logps/rejected": -954.09912109375, + "loss": 0.668, + "num_input_tokens_seen": 6835712, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008054876700043678, + "rewards/margins": 0.05839255452156067, + "rewards/rejected": -0.0664474219083786, + "step": 19 + }, + { + "epoch": 0.07661000718218818, + "grad_norm": 1.119596004486084, + "learning_rate": 9.960886626674302e-06, + "logits/chosen": -0.8531184196472168, + "logits/rejected": -0.8500455021858215, + "logps/chosen": -943.4425659179688, + "logps/rejected": -931.519287109375, + "loss": 0.664, + "num_input_tokens_seen": 7183040, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.008525203913450241, + "rewards/margins": 0.06722362339496613, + "rewards/rejected": -0.07574883103370667, + "step": 20 + }, + { + "epoch": 0.08044050754129758, + "grad_norm": 1.105559229850769, + "learning_rate": 9.952685795051078e-06, + "logits/chosen": -0.853840708732605, + "logits/rejected": -0.855196475982666, + "logps/chosen": -991.0256958007812, + "logps/rejected": -998.6373291015625, + "loss": 0.6632, + "num_input_tokens_seen": 7552000, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.021702932193875313, + "rewards/margins": 0.06813166290521622, + "rewards/rejected": -0.08983460068702698, + "step": 21 + }, + { + "epoch": 0.084271007900407, + "grad_norm": 1.0911303758621216, + "learning_rate": 9.943709096197334e-06, + "logits/chosen": -0.8640886545181274, + "logits/rejected": -0.8526146411895752, + "logps/chosen": -972.9132080078125, + "logps/rejected": -980.2176513671875, + "loss": 0.6619, + "num_input_tokens_seen": 7918784, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.028134111315011978, + "rewards/margins": 0.07227017730474472, + "rewards/rejected": -0.100404292345047, + "step": 22 + }, + { + "epoch": 0.0881015082595164, + "grad_norm": 1.0360777378082275, + "learning_rate": 9.933957936365515e-06, + "logits/chosen": -0.8492975831031799, + "logits/rejected": -0.8457398414611816, + "logps/chosen": -956.4877319335938, + "logps/rejected": -930.35498046875, + "loss": 0.6536, + "num_input_tokens_seen": 8275392, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.023028088733553886, + "rewards/margins": 0.09184832870960236, + "rewards/rejected": -0.1148764118552208, + "step": 23 + }, + { + "epoch": 0.09193200861862581, + "grad_norm": 1.139214277267456, + "learning_rate": 9.9234338431319e-06, + "logits/chosen": -0.8631800413131714, + "logits/rejected": -0.8530561923980713, + "logps/chosen": -971.2386474609375, + "logps/rejected": -971.96435546875, + "loss": 0.6412, + "num_input_tokens_seen": 8630976, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.013565229251980782, + "rewards/margins": 0.12116232514381409, + "rewards/rejected": -0.1347275674343109, + "step": 24 + }, + { + "epoch": 0.09576250897773522, + "grad_norm": 1.1299138069152832, + "learning_rate": 9.912138465157325e-06, + "logits/chosen": -0.8681377172470093, + "logits/rejected": -0.8573090434074402, + "logps/chosen": -976.7225341796875, + "logps/rejected": -978.9688720703125, + "loss": 0.6345, + "num_input_tokens_seen": 8994496, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.025455141440033913, + "rewards/margins": 0.13739538192749023, + "rewards/rejected": -0.1628505289554596, + "step": 25 + }, + { + "epoch": 0.09959300933684463, + "grad_norm": 1.0964128971099854, + "learning_rate": 9.900073571928887e-06, + "logits/chosen": -0.8694045543670654, + "logits/rejected": -0.8569152355194092, + "logps/chosen": -956.187255859375, + "logps/rejected": -972.0398559570312, + "loss": 0.6366, + "num_input_tokens_seen": 9363648, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.05354519188404083, + "rewards/margins": 0.13473626971244812, + "rewards/rejected": -0.18828144669532776, + "step": 26 + }, + { + "epoch": 0.10342350969595403, + "grad_norm": 1.0967702865600586, + "learning_rate": 9.887241053482756e-06, + "logits/chosen": -0.868346631526947, + "logits/rejected": -0.8507559299468994, + "logps/chosen": -931.1055297851562, + "logps/rejected": -925.87255859375, + "loss": 0.6203, + "num_input_tokens_seen": 9716544, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.03256046772003174, + "rewards/margins": 0.17498177289962769, + "rewards/rejected": -0.20754224061965942, + "step": 27 + }, + { + "epoch": 0.10725401005506344, + "grad_norm": 1.0988025665283203, + "learning_rate": 9.87364292010809e-06, + "logits/chosen": -0.8444482088088989, + "logits/rejected": -0.8354026079177856, + "logps/chosen": -947.176513671875, + "logps/rejected": -939.490234375, + "loss": 0.5958, + "num_input_tokens_seen": 10062592, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.008495497517287731, + "rewards/margins": 0.22998949885368347, + "rewards/rejected": -0.23848500847816467, + "step": 28 + }, + { + "epoch": 0.11108451041417285, + "grad_norm": 1.0441173315048218, + "learning_rate": 9.859281302032107e-06, + "logits/chosen": -0.8624484539031982, + "logits/rejected": -0.8477456569671631, + "logps/chosen": -936.7383422851562, + "logps/rejected": -945.6826171875, + "loss": 0.6147, + "num_input_tokens_seen": 10416320, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.03869447857141495, + "rewards/margins": 0.19933387637138367, + "rewards/rejected": -0.2380283772945404, + "step": 29 + }, + { + "epoch": 0.11491501077328226, + "grad_norm": 1.1245601177215576, + "learning_rate": 9.844158449086372e-06, + "logits/chosen": -0.883080244064331, + "logits/rejected": -0.87412428855896, + "logps/chosen": -972.410888671875, + "logps/rejected": -973.052490234375, + "loss": 0.6035, + "num_input_tokens_seen": 10785856, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.09114108234643936, + "rewards/margins": 0.21878743171691895, + "rewards/rejected": -0.3099285364151001, + "step": 30 + }, + { + "epoch": 0.11874551113239167, + "grad_norm": 1.174370527267456, + "learning_rate": 9.828276730354353e-06, + "logits/chosen": -0.896267831325531, + "logits/rejected": -0.8861621618270874, + "logps/chosen": -959.3692626953125, + "logps/rejected": -963.2615966796875, + "loss": 0.5897, + "num_input_tokens_seen": 11150656, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.09654560685157776, + "rewards/margins": 0.2596088647842407, + "rewards/rejected": -0.3561544418334961, + "step": 31 + }, + { + "epoch": 0.12257601149150107, + "grad_norm": 1.0689289569854736, + "learning_rate": 9.811638633800287e-06, + "logits/chosen": -0.8521184921264648, + "logits/rejected": -0.8504342436790466, + "logps/chosen": -941.5089721679688, + "logps/rejected": -940.1748657226562, + "loss": 0.564, + "num_input_tokens_seen": 11493568, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.037901975214481354, + "rewards/margins": 0.33260565996170044, + "rewards/rejected": -0.3705076277256012, + "step": 32 + }, + { + "epoch": 0.1264065118506105, + "grad_norm": 1.1155372858047485, + "learning_rate": 9.794246765879421e-06, + "logits/chosen": -0.8701536655426025, + "logits/rejected": -0.8652772903442383, + "logps/chosen": -992.7672119140625, + "logps/rejected": -992.2999267578125, + "loss": 0.5837, + "num_input_tokens_seen": 11861824, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09247064590454102, + "rewards/margins": 0.3021788001060486, + "rewards/rejected": -0.394649475812912, + "step": 33 + }, + { + "epoch": 0.1302370122097199, + "grad_norm": 1.0900242328643799, + "learning_rate": 9.776103851129706e-06, + "logits/chosen": -0.8530943393707275, + "logits/rejected": -0.8423566818237305, + "logps/chosen": -979.34765625, + "logps/rejected": -996.4287719726562, + "loss": 0.5788, + "num_input_tokens_seen": 12229824, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.1235356330871582, + "rewards/margins": 0.30674082040786743, + "rewards/rejected": -0.43027642369270325, + "step": 34 + }, + { + "epoch": 0.13406751256882932, + "grad_norm": 1.142975091934204, + "learning_rate": 9.757212731744973e-06, + "logits/chosen": -0.8924177885055542, + "logits/rejected": -0.8867870569229126, + "logps/chosen": -957.2198486328125, + "logps/rejected": -982.8868408203125, + "loss": 0.5352, + "num_input_tokens_seen": 12610880, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.12410111725330353, + "rewards/margins": 0.42041316628456116, + "rewards/rejected": -0.5445142984390259, + "step": 35 + }, + { + "epoch": 0.1378980129279387, + "grad_norm": 1.125343680381775, + "learning_rate": 9.737576367129694e-06, + "logits/chosen": -0.8580018281936646, + "logits/rejected": -0.8490207195281982, + "logps/chosen": -903.6456298828125, + "logps/rejected": -910.9893188476562, + "loss": 0.5433, + "num_input_tokens_seen": 12958592, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.07187622040510178, + "rewards/margins": 0.40486499667167664, + "rewards/rejected": -0.4767411947250366, + "step": 36 + }, + { + "epoch": 0.1417285132870481, + "grad_norm": 1.1175780296325684, + "learning_rate": 9.717197833435367e-06, + "logits/chosen": -0.8762209415435791, + "logits/rejected": -0.8664510250091553, + "logps/chosen": -903.818115234375, + "logps/rejected": -899.0189819335938, + "loss": 0.554, + "num_input_tokens_seen": 13310336, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.12072329968214035, + "rewards/margins": 0.405148446559906, + "rewards/rejected": -0.525871753692627, + "step": 37 + }, + { + "epoch": 0.14555901364615753, + "grad_norm": 1.1025341749191284, + "learning_rate": 9.696080323078621e-06, + "logits/chosen": -0.8720874786376953, + "logits/rejected": -0.8588823080062866, + "logps/chosen": -978.2528076171875, + "logps/rejected": -981.0466918945312, + "loss": 0.5275, + "num_input_tokens_seen": 13681920, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.15624140202999115, + "rewards/margins": 0.46234357357025146, + "rewards/rejected": -0.6185849905014038, + "step": 38 + }, + { + "epoch": 0.14938951400526693, + "grad_norm": 1.0933482646942139, + "learning_rate": 9.67422714424111e-06, + "logits/chosen": -0.8733354210853577, + "logits/rejected": -0.8568714261054993, + "logps/chosen": -929.06591796875, + "logps/rejected": -927.2193603515625, + "loss": 0.5439, + "num_input_tokens_seen": 14033792, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.18334656953811646, + "rewards/margins": 0.44002479314804077, + "rewards/rejected": -0.6233713626861572, + "step": 39 + }, + { + "epoch": 0.15322001436437635, + "grad_norm": 1.0646389722824097, + "learning_rate": 9.651641720351262e-06, + "logits/chosen": -0.8848574161529541, + "logits/rejected": -0.8833534717559814, + "logps/chosen": -939.94921875, + "logps/rejected": -930.15185546875, + "loss": 0.5058, + "num_input_tokens_seen": 14392064, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.09075319766998291, + "rewards/margins": 0.5285364389419556, + "rewards/rejected": -0.6192896366119385, + "step": 40 + }, + { + "epoch": 0.15705051472348575, + "grad_norm": 1.0706998109817505, + "learning_rate": 9.628327589547977e-06, + "logits/chosen": -0.870700478553772, + "logits/rejected": -0.8600829839706421, + "logps/chosen": -942.9664916992188, + "logps/rejected": -940.08251953125, + "loss": 0.5153, + "num_input_tokens_seen": 14750144, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.19169776141643524, + "rewards/margins": 0.4832345247268677, + "rewards/rejected": -0.6749322414398193, + "step": 41 + }, + { + "epoch": 0.16088101508259517, + "grad_norm": 1.0716564655303955, + "learning_rate": 9.604288404126362e-06, + "logits/chosen": -0.8692469596862793, + "logits/rejected": -0.8594658374786377, + "logps/chosen": -980.7017822265625, + "logps/rejected": -972.3961181640625, + "loss": 0.5011, + "num_input_tokens_seen": 15115648, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.24686011672019958, + "rewards/margins": 0.5668089389801025, + "rewards/rejected": -0.8136690855026245, + "step": 42 + }, + { + "epoch": 0.16471151544170456, + "grad_norm": 1.1626282930374146, + "learning_rate": 9.579527929965581e-06, + "logits/chosen": -0.8824278116226196, + "logits/rejected": -0.8659209609031677, + "logps/chosen": -970.1652221679688, + "logps/rejected": -969.03955078125, + "loss": 0.5145, + "num_input_tokens_seen": 15473984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24589437246322632, + "rewards/margins": 0.5467432141304016, + "rewards/rejected": -0.7926375865936279, + "step": 43 + }, + { + "epoch": 0.168542015800814, + "grad_norm": 1.0826659202575684, + "learning_rate": 9.554050045938893e-06, + "logits/chosen": -0.8809198141098022, + "logits/rejected": -0.8635910749435425, + "logps/chosen": -955.4505615234375, + "logps/rejected": -982.0885009765625, + "loss": 0.4498, + "num_input_tokens_seen": 15818304, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.23824483156204224, + "rewards/margins": 0.7484439611434937, + "rewards/rejected": -0.9866887331008911, + "step": 44 + }, + { + "epoch": 0.17237251615992338, + "grad_norm": 1.0296498537063599, + "learning_rate": 9.52785874330602e-06, + "logits/chosen": -0.8923338651657104, + "logits/rejected": -0.8831720352172852, + "logps/chosen": -949.2203979492188, + "logps/rejected": -926.826904296875, + "loss": 0.4762, + "num_input_tokens_seen": 16173312, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.20413096249103546, + "rewards/margins": 0.6942947506904602, + "rewards/rejected": -0.8984256982803345, + "step": 45 + }, + { + "epoch": 0.1762030165190328, + "grad_norm": 0.9988199472427368, + "learning_rate": 9.500958125087882e-06, + "logits/chosen": -0.8683270215988159, + "logits/rejected": -0.856480598449707, + "logps/chosen": -952.26318359375, + "logps/rejected": -959.3363647460938, + "loss": 0.4698, + "num_input_tokens_seen": 16542400, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.24105213582515717, + "rewards/margins": 0.7012782692909241, + "rewards/rejected": -0.9423303604125977, + "step": 46 + }, + { + "epoch": 0.1800335168781422, + "grad_norm": 1.0127381086349487, + "learning_rate": 9.473352405423845e-06, + "logits/chosen": -0.8729474544525146, + "logits/rejected": -0.8667709827423096, + "logps/chosen": -959.4134521484375, + "logps/rejected": -973.1416015625, + "loss": 0.4451, + "num_input_tokens_seen": 16906368, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.29501286149024963, + "rewards/margins": 0.7987682223320007, + "rewards/rejected": -1.0937812328338623, + "step": 47 + }, + { + "epoch": 0.18386401723725163, + "grad_norm": 0.9952019453048706, + "learning_rate": 9.445045908911536e-06, + "logits/chosen": -0.8597334027290344, + "logits/rejected": -0.8594260215759277, + "logps/chosen": -944.3863525390625, + "logps/rejected": -949.9017333984375, + "loss": 0.454, + "num_input_tokens_seen": 17244160, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.23967772722244263, + "rewards/margins": 0.7824161648750305, + "rewards/rejected": -1.0220938920974731, + "step": 48 + }, + { + "epoch": 0.18769451759636102, + "grad_norm": 1.0403015613555908, + "learning_rate": 9.416043069929389e-06, + "logits/chosen": -0.8966540098190308, + "logits/rejected": -0.894058108329773, + "logps/chosen": -945.79443359375, + "logps/rejected": -945.3613891601562, + "loss": 0.4516, + "num_input_tokens_seen": 17605632, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.35652709007263184, + "rewards/margins": 0.8266175985336304, + "rewards/rejected": -1.1831448078155518, + "step": 49 + }, + { + "epoch": 0.19152501795547044, + "grad_norm": 0.9946941137313843, + "learning_rate": 9.386348431941953e-06, + "logits/chosen": -0.86366868019104, + "logits/rejected": -0.8597589135169983, + "logps/chosen": -958.8348388671875, + "logps/rejected": -946.8438720703125, + "loss": 0.4555, + "num_input_tokens_seen": 17966784, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -0.3083305358886719, + "rewards/margins": 0.7388800382614136, + "rewards/rejected": -1.047210454940796, + "step": 50 + }, + { + "epoch": 0.19535551831457984, + "grad_norm": 0.9000129699707031, + "learning_rate": 9.355966646788152e-06, + "logits/chosen": -0.8675721883773804, + "logits/rejected": -0.8705277442932129, + "logps/chosen": -952.7518310546875, + "logps/rejected": -957.5247192382812, + "loss": 0.3739, + "num_input_tokens_seen": 18328768, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.16377931833267212, + "rewards/margins": 1.1108189821243286, + "rewards/rejected": -1.2745983600616455, + "step": 51 + }, + { + "epoch": 0.19918601867368926, + "grad_norm": 1.07081937789917, + "learning_rate": 9.324902473952529e-06, + "logits/chosen": -0.8768286108970642, + "logits/rejected": -0.8687412142753601, + "logps/chosen": -944.7354125976562, + "logps/rejected": -943.99609375, + "loss": 0.4577, + "num_input_tokens_seen": 18682304, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.40257060527801514, + "rewards/margins": 0.8100301027297974, + "rewards/rejected": -1.2126007080078125, + "step": 52 + }, + { + "epoch": 0.20301651903279866, + "grad_norm": 0.9810666441917419, + "learning_rate": 9.293160779819658e-06, + "logits/chosen": -0.8752217292785645, + "logits/rejected": -0.8711000680923462, + "logps/chosen": -948.7554321289062, + "logps/rejected": -963.16796875, + "loss": 0.3821, + "num_input_tokens_seen": 19037760, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.23420241475105286, + "rewards/margins": 1.0984153747558594, + "rewards/rejected": -1.3326178789138794, + "step": 53 + }, + { + "epoch": 0.20684701939190805, + "grad_norm": 0.9477998614311218, + "learning_rate": 9.260746536911792e-06, + "logits/chosen": -0.8622844219207764, + "logits/rejected": -0.8555351495742798, + "logps/chosen": -960.6400756835938, + "logps/rejected": -948.1239624023438, + "loss": 0.3763, + "num_input_tokens_seen": 19394688, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -0.295390784740448, + "rewards/margins": 1.1338173151016235, + "rewards/rejected": -1.4292080402374268, + "step": 54 + }, + { + "epoch": 0.21067751975101748, + "grad_norm": 0.9970362186431885, + "learning_rate": 9.227664823109884e-06, + "logits/chosen": -0.882621169090271, + "logits/rejected": -0.8761383295059204, + "logps/chosen": -980.5223388671875, + "logps/rejected": -993.8486328125, + "loss": 0.3643, + "num_input_tokens_seen": 19760704, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.2676926255226135, + "rewards/margins": 1.283174991607666, + "rewards/rejected": -1.5508675575256348, + "step": 55 + }, + { + "epoch": 0.21450802011012687, + "grad_norm": 0.922550618648529, + "learning_rate": 9.193920820858113e-06, + "logits/chosen": -0.8822978138923645, + "logits/rejected": -0.8816237449645996, + "logps/chosen": -956.872802734375, + "logps/rejected": -934.1729736328125, + "loss": 0.363, + "num_input_tokens_seen": 20121856, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.22871704399585724, + "rewards/margins": 1.22221040725708, + "rewards/rejected": -1.450927495956421, + "step": 56 + }, + { + "epoch": 0.2183385204692363, + "grad_norm": 0.873478889465332, + "learning_rate": 9.159519816352021e-06, + "logits/chosen": -0.8788328170776367, + "logits/rejected": -0.8787699937820435, + "logps/chosen": -966.6612548828125, + "logps/rejected": -973.8304443359375, + "loss": 0.3473, + "num_input_tokens_seen": 20482304, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.36199629306793213, + "rewards/margins": 1.2343295812606812, + "rewards/rejected": -1.5963258743286133, + "step": 57 + }, + { + "epoch": 0.2221690208283457, + "grad_norm": 0.9038686156272888, + "learning_rate": 9.124467198710401e-06, + "logits/chosen": -0.8885728120803833, + "logits/rejected": -0.8782378435134888, + "logps/chosen": -936.5159301757812, + "logps/rejected": -942.8802490234375, + "loss": 0.3425, + "num_input_tokens_seen": 20822656, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.32135745882987976, + "rewards/margins": 1.3397715091705322, + "rewards/rejected": -1.6611288785934448, + "step": 58 + }, + { + "epoch": 0.22599952118745512, + "grad_norm": 0.8958147168159485, + "learning_rate": 9.08876845913106e-06, + "logits/chosen": -0.8872295618057251, + "logits/rejected": -0.8896946907043457, + "logps/chosen": -939.883544921875, + "logps/rejected": -929.0052490234375, + "loss": 0.3257, + "num_input_tokens_seen": 21187392, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.10529069602489471, + "rewards/margins": 1.4546587467193604, + "rewards/rejected": -1.5599493980407715, + "step": 59 + }, + { + "epoch": 0.2298300215465645, + "grad_norm": 0.8841367959976196, + "learning_rate": 9.052429190030589e-06, + "logits/chosen": -0.8831209540367126, + "logits/rejected": -0.8708245754241943, + "logps/chosen": -969.869140625, + "logps/rejected": -1000.88232421875, + "loss": 0.3257, + "num_input_tokens_seen": 21545600, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.31539836525917053, + "rewards/margins": 1.379377841949463, + "rewards/rejected": -1.6947762966156006, + "step": 60 + }, + { + "epoch": 0.23366052190567393, + "grad_norm": 0.9442328214645386, + "learning_rate": 9.015455084168279e-06, + "logits/chosen": -0.8948913812637329, + "logits/rejected": -0.8908047080039978, + "logps/chosen": -901.967041015625, + "logps/rejected": -917.2904052734375, + "loss": 0.3375, + "num_input_tokens_seen": 21887936, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.22983768582344055, + "rewards/margins": 1.5137555599212646, + "rewards/rejected": -1.7435933351516724, + "step": 61 + }, + { + "epoch": 0.23749102226478333, + "grad_norm": 0.909550130367279, + "learning_rate": 8.977851933754317e-06, + "logits/chosen": -0.8684841394424438, + "logits/rejected": -0.8621213436126709, + "logps/chosen": -964.7745361328125, + "logps/rejected": -954.098388671875, + "loss": 0.3398, + "num_input_tokens_seen": 22233024, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.22858697175979614, + "rewards/margins": 1.3412563800811768, + "rewards/rejected": -1.5698432922363281, + "step": 62 + }, + { + "epoch": 0.24132152262389275, + "grad_norm": 0.9149515628814697, + "learning_rate": 8.939625629542401e-06, + "logits/chosen": -0.883716881275177, + "logits/rejected": -0.8854869604110718, + "logps/chosen": -932.4530029296875, + "logps/rejected": -939.7131958007812, + "loss": 0.3274, + "num_input_tokens_seen": 22585472, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.22678017616271973, + "rewards/margins": 1.4715971946716309, + "rewards/rejected": -1.6983773708343506, + "step": 63 + }, + { + "epoch": 0.24515202298300215, + "grad_norm": 0.8684330582618713, + "learning_rate": 8.900782159906927e-06, + "logits/chosen": -0.9030859470367432, + "logits/rejected": -0.8920726776123047, + "logps/chosen": -938.24072265625, + "logps/rejected": -963.484130859375, + "loss": 0.301, + "num_input_tokens_seen": 22946880, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.20122338831424713, + "rewards/margins": 1.6123220920562744, + "rewards/rejected": -1.8135454654693604, + "step": 64 + }, + { + "epoch": 0.24898252334211157, + "grad_norm": 0.8821057081222534, + "learning_rate": 8.861327609904859e-06, + "logits/chosen": -0.8871606588363647, + "logits/rejected": -0.881830096244812, + "logps/chosen": -966.441650390625, + "logps/rejected": -990.2425537109375, + "loss": 0.3119, + "num_input_tokens_seen": 23304320, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.20573949813842773, + "rewards/margins": 1.7018569707870483, + "rewards/rejected": -1.907596468925476, + "step": 65 + }, + { + "epoch": 0.252813023701221, + "grad_norm": 0.993715226650238, + "learning_rate": 8.821268160322482e-06, + "logits/chosen": -0.8987492322921753, + "logits/rejected": -0.8888131976127625, + "logps/chosen": -956.7464599609375, + "logps/rejected": -969.938232421875, + "loss": 0.329, + "num_input_tokens_seen": 23665536, + "rewards/accuracies": 0.8515625, + "rewards/chosen": -0.2799607515335083, + "rewards/margins": 1.5986993312835693, + "rewards/rejected": -1.8786600828170776, + "step": 66 + }, + { + "epoch": 0.25664352406033036, + "grad_norm": 0.789756715297699, + "learning_rate": 8.780610086707149e-06, + "logits/chosen": -0.8871505260467529, + "logits/rejected": -0.8831641674041748, + "logps/chosen": -981.848876953125, + "logps/rejected": -1000.328369140625, + "loss": 0.2801, + "num_input_tokens_seen": 24021376, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.4163787066936493, + "rewards/margins": 1.6759014129638672, + "rewards/rejected": -2.092280149459839, + "step": 67 + }, + { + "epoch": 0.2604740244194398, + "grad_norm": 0.8739683628082275, + "learning_rate": 8.739359758384162e-06, + "logits/chosen": -0.9010568857192993, + "logits/rejected": -0.8976689577102661, + "logps/chosen": -961.4142456054688, + "logps/rejected": -970.7120361328125, + "loss": 0.2903, + "num_input_tokens_seen": 24385216, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.23521028459072113, + "rewards/margins": 1.7630295753479004, + "rewards/rejected": -1.9982399940490723, + "step": 68 + }, + { + "epoch": 0.2643045247785492, + "grad_norm": 0.9129281640052795, + "learning_rate": 8.697523637458997e-06, + "logits/chosen": -0.89457768201828, + "logits/rejected": -0.8855119943618774, + "logps/chosen": -952.9556884765625, + "logps/rejected": -966.17822265625, + "loss": 0.2866, + "num_input_tokens_seen": 24750080, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.20718005299568176, + "rewards/margins": 1.6356449127197266, + "rewards/rejected": -1.842824935913086, + "step": 69 + }, + { + "epoch": 0.26813502513765863, + "grad_norm": 0.8331277966499329, + "learning_rate": 8.655108277804975e-06, + "logits/chosen": -0.9106764197349548, + "logits/rejected": -0.9041292667388916, + "logps/chosen": -967.6072998046875, + "logps/rejected": -999.0532836914062, + "loss": 0.2925, + "num_input_tokens_seen": 25116480, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43630316853523254, + "rewards/margins": 1.729076862335205, + "rewards/rejected": -2.1653800010681152, + "step": 70 + }, + { + "epoch": 0.271965525496768, + "grad_norm": 0.7939078211784363, + "learning_rate": 8.612120324036548e-06, + "logits/chosen": -0.9071799516677856, + "logits/rejected": -0.9028173089027405, + "logps/chosen": -941.1134033203125, + "logps/rejected": -947.9176635742188, + "loss": 0.2486, + "num_input_tokens_seen": 25465536, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.18985743820667267, + "rewards/margins": 2.005892515182495, + "rewards/rejected": -2.1957499980926514, + "step": 71 + }, + { + "epoch": 0.2757960258558774, + "grad_norm": 0.8236827254295349, + "learning_rate": 8.568566510468392e-06, + "logits/chosen": -0.912564754486084, + "logits/rejected": -0.9037194848060608, + "logps/chosen": -972.3777465820312, + "logps/rejected": -1004.67626953125, + "loss": 0.261, + "num_input_tokens_seen": 25833344, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.4295309782028198, + "rewards/margins": 1.9907405376434326, + "rewards/rejected": -2.420271396636963, + "step": 72 + }, + { + "epoch": 0.27962652621498685, + "grad_norm": 0.9908759593963623, + "learning_rate": 8.524453660060434e-06, + "logits/chosen": -0.8977659940719604, + "logits/rejected": -0.8889322876930237, + "logps/chosen": -921.490234375, + "logps/rejected": -936.3394775390625, + "loss": 0.3015, + "num_input_tokens_seen": 26183872, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.4154922366142273, + "rewards/margins": 1.6831226348876953, + "rewards/rejected": -2.0986149311065674, + "step": 73 + }, + { + "epoch": 0.2834570265740962, + "grad_norm": 0.9763004779815674, + "learning_rate": 8.479788683348996e-06, + "logits/chosen": -0.8884252309799194, + "logits/rejected": -0.8845940828323364, + "logps/chosen": -947.107177734375, + "logps/rejected": -929.6920166015625, + "loss": 0.3031, + "num_input_tokens_seen": 26539904, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -0.46277469396591187, + "rewards/margins": 1.6962052583694458, + "rewards/rejected": -2.158979892730713, + "step": 74 + }, + { + "epoch": 0.28728752693320564, + "grad_norm": 0.8439339995384216, + "learning_rate": 8.434578577364218e-06, + "logits/chosen": -0.8974050283432007, + "logits/rejected": -0.8876934051513672, + "logps/chosen": -959.245849609375, + "logps/rejected": -975.6104736328125, + "loss": 0.2726, + "num_input_tokens_seen": 26902464, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.39336973428726196, + "rewards/margins": 1.9701586961746216, + "rewards/rejected": -2.3635284900665283, + "step": 75 + }, + { + "epoch": 0.29111802729231506, + "grad_norm": 0.7955582737922668, + "learning_rate": 8.388830424533935e-06, + "logits/chosen": -0.9060702323913574, + "logits/rejected": -0.8961368799209595, + "logps/chosen": -953.9136962890625, + "logps/rejected": -988.19677734375, + "loss": 0.2354, + "num_input_tokens_seen": 27269504, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.3611718416213989, + "rewards/margins": 2.007566452026367, + "rewards/rejected": -2.3687381744384766, + "step": 76 + }, + { + "epoch": 0.2949485276514245, + "grad_norm": 1.0123270750045776, + "learning_rate": 8.342551391574165e-06, + "logits/chosen": -0.9041194915771484, + "logits/rejected": -0.9068773984909058, + "logps/chosen": -974.324951171875, + "logps/rejected": -958.915771484375, + "loss": 0.2989, + "num_input_tokens_seen": 27629248, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.48377448320388794, + "rewards/margins": 1.8969142436981201, + "rewards/rejected": -2.3806886672973633, + "step": 77 + }, + { + "epoch": 0.29877902801053385, + "grad_norm": 0.8166556358337402, + "learning_rate": 8.295748728366414e-06, + "logits/chosen": -0.8858418464660645, + "logits/rejected": -0.8801019191741943, + "logps/chosen": -973.2070922851562, + "logps/rejected": -1012.683837890625, + "loss": 0.2474, + "num_input_tokens_seen": 27988096, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.47751736640930176, + "rewards/margins": 2.0694668292999268, + "rewards/rejected": -2.5469841957092285, + "step": 78 + }, + { + "epoch": 0.3026095283696433, + "grad_norm": 0.840967059135437, + "learning_rate": 8.248429766821925e-06, + "logits/chosen": -0.890509843826294, + "logits/rejected": -0.8831250667572021, + "logps/chosen": -992.5006103515625, + "logps/rejected": -1011.10107421875, + "loss": 0.2654, + "num_input_tokens_seen": 28346432, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.5605780482292175, + "rewards/margins": 2.078644037246704, + "rewards/rejected": -2.6392221450805664, + "step": 79 + }, + { + "epoch": 0.3064400287287527, + "grad_norm": 0.9755147695541382, + "learning_rate": 8.200601919733106e-06, + "logits/chosen": -0.912348747253418, + "logits/rejected": -0.9080063700675964, + "logps/chosen": -990.100341796875, + "logps/rejected": -1010.1781005859375, + "loss": 0.2653, + "num_input_tokens_seen": 28715200, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -0.5899574756622314, + "rewards/margins": 2.0038092136383057, + "rewards/rejected": -2.593766689300537, + "step": 80 + }, + { + "epoch": 0.3102705290878621, + "grad_norm": 0.8211416006088257, + "learning_rate": 8.15227267961226e-06, + "logits/chosen": -0.8992519378662109, + "logits/rejected": -0.8915354013442993, + "logps/chosen": -976.0731811523438, + "logps/rejected": -986.6966552734375, + "loss": 0.2446, + "num_input_tokens_seen": 29068800, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3442724645137787, + "rewards/margins": 2.3275351524353027, + "rewards/rejected": -2.671807289123535, + "step": 81 + }, + { + "epoch": 0.3141010294469715, + "grad_norm": 0.9516357779502869, + "learning_rate": 8.10344961751785e-06, + "logits/chosen": -0.8960527777671814, + "logits/rejected": -0.8952938318252563, + "logps/chosen": -947.9008178710938, + "logps/rejected": -949.0771484375, + "loss": 0.2791, + "num_input_tokens_seen": 29416000, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.465572327375412, + "rewards/margins": 2.01568603515625, + "rewards/rejected": -2.4812583923339844, + "step": 82 + }, + { + "epoch": 0.3179315298060809, + "grad_norm": 0.8539770841598511, + "learning_rate": 8.054140381868435e-06, + "logits/chosen": -0.9193344712257385, + "logits/rejected": -0.9072273969650269, + "logps/chosen": -960.5968017578125, + "logps/rejected": -984.9967651367188, + "loss": 0.2383, + "num_input_tokens_seen": 29768704, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.43722477555274963, + "rewards/margins": 2.2228188514709473, + "rewards/rejected": -2.660043478012085, + "step": 83 + }, + { + "epoch": 0.32176203016519034, + "grad_norm": 0.9070563316345215, + "learning_rate": 8.004352697244516e-06, + "logits/chosen": -0.9024090766906738, + "logits/rejected": -0.8883140683174133, + "logps/chosen": -997.8526000976562, + "logps/rejected": -1006.8358154296875, + "loss": 0.2578, + "num_input_tokens_seen": 30143360, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.5442839860916138, + "rewards/margins": 2.057681083679199, + "rewards/rejected": -2.6019649505615234, + "step": 84 + }, + { + "epoch": 0.32559253052429976, + "grad_norm": 0.7460620999336243, + "learning_rate": 7.954094363178421e-06, + "logits/chosen": -0.8904761672019958, + "logits/rejected": -0.8793785572052002, + "logps/chosen": -949.7542724609375, + "logps/rejected": -983.7249755859375, + "loss": 0.2276, + "num_input_tokens_seen": 30496832, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.3411102592945099, + "rewards/margins": 2.239603042602539, + "rewards/rejected": -2.5807135105133057, + "step": 85 + }, + { + "epoch": 0.32942303088340913, + "grad_norm": 0.8186484575271606, + "learning_rate": 7.903373252932474e-06, + "logits/chosen": -0.9134713411331177, + "logits/rejected": -0.9035958647727966, + "logps/chosen": -943.67236328125, + "logps/rejected": -964.198486328125, + "loss": 0.2369, + "num_input_tokens_seen": 30854400, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.5375100374221802, + "rewards/margins": 2.2460575103759766, + "rewards/rejected": -2.783567428588867, + "step": 86 + }, + { + "epoch": 0.33325353124251855, + "grad_norm": 0.8342300057411194, + "learning_rate": 7.852197312265592e-06, + "logits/chosen": -0.9076871871948242, + "logits/rejected": -0.9010123014450073, + "logps/chosen": -935.4744873046875, + "logps/rejected": -934.6981811523438, + "loss": 0.261, + "num_input_tokens_seen": 31212288, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.28344666957855225, + "rewards/margins": 2.122594118118286, + "rewards/rejected": -2.406040668487549, + "step": 87 + }, + { + "epoch": 0.337084031601628, + "grad_norm": 0.8632445931434631, + "learning_rate": 7.800574558188548e-06, + "logits/chosen": -0.9118667840957642, + "logits/rejected": -0.9074774980545044, + "logps/chosen": -969.2461547851562, + "logps/rejected": -975.0919189453125, + "loss": 0.2214, + "num_input_tokens_seen": 31576000, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5344555377960205, + "rewards/margins": 2.2522950172424316, + "rewards/rejected": -2.7867507934570312, + "step": 88 + }, + { + "epoch": 0.34091453196073734, + "grad_norm": 0.9685871601104736, + "learning_rate": 7.748513077708044e-06, + "logits/chosen": -0.8978217840194702, + "logits/rejected": -0.8894325494766235, + "logps/chosen": -940.5440063476562, + "logps/rejected": -953.5572509765625, + "loss": 0.2836, + "num_input_tokens_seen": 31930560, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.5140777826309204, + "rewards/margins": 1.949610948562622, + "rewards/rejected": -2.463688611984253, + "step": 89 + }, + { + "epoch": 0.34474503231984677, + "grad_norm": 0.8063095808029175, + "learning_rate": 7.69602102655985e-06, + "logits/chosen": -0.9282764792442322, + "logits/rejected": -0.921352207660675, + "logps/chosen": -960.401123046875, + "logps/rejected": -984.61572265625, + "loss": 0.2046, + "num_input_tokens_seen": 32295488, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.327387273311615, + "rewards/margins": 2.6362967491149902, + "rewards/rejected": -2.96368408203125, + "step": 90 + }, + { + "epoch": 0.3485755326789562, + "grad_norm": 0.6952674388885498, + "learning_rate": 7.643106627931148e-06, + "logits/chosen": -0.9313417673110962, + "logits/rejected": -0.9226356744766235, + "logps/chosen": -1011.1554565429688, + "logps/rejected": -1027.318115234375, + "loss": 0.1799, + "num_input_tokens_seen": 32673984, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.3193371891975403, + "rewards/margins": 2.551300525665283, + "rewards/rejected": -2.870637893676758, + "step": 91 + }, + { + "epoch": 0.3524060330380656, + "grad_norm": 0.8423375487327576, + "learning_rate": 7.5897781711723215e-06, + "logits/chosen": -0.9159414768218994, + "logits/rejected": -0.9044461250305176, + "logps/chosen": -947.4771728515625, + "logps/rejected": -958.0704345703125, + "loss": 0.2464, + "num_input_tokens_seen": 33035968, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.43906497955322266, + "rewards/margins": 2.325441360473633, + "rewards/rejected": -2.7645063400268555, + "step": 92 + }, + { + "epoch": 0.356236533397175, + "grad_norm": 0.9066694378852844, + "learning_rate": 7.536044010498396e-06, + "logits/chosen": -0.9149327278137207, + "logits/rejected": -0.9138184785842896, + "logps/chosen": -921.2164306640625, + "logps/rejected": -933.0731201171875, + "loss": 0.2468, + "num_input_tokens_seen": 33392192, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.3270125687122345, + "rewards/margins": 2.203007698059082, + "rewards/rejected": -2.530020236968994, + "step": 93 + }, + { + "epoch": 0.3600670337562844, + "grad_norm": 0.7547585368156433, + "learning_rate": 7.48191256368028e-06, + "logits/chosen": -0.9391610622406006, + "logits/rejected": -0.9235018491744995, + "logps/chosen": -976.5980834960938, + "logps/rejected": -1010.8218994140625, + "loss": 0.1873, + "num_input_tokens_seen": 33758976, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.17495784163475037, + "rewards/margins": 2.878791332244873, + "rewards/rejected": -3.0537490844726562, + "step": 94 + }, + { + "epoch": 0.36389753411539383, + "grad_norm": 0.8308711051940918, + "learning_rate": 7.427392310726088e-06, + "logits/chosen": -0.9251983165740967, + "logits/rejected": -0.9083241820335388, + "logps/chosen": -964.7904052734375, + "logps/rejected": -975.2269287109375, + "loss": 0.2136, + "num_input_tokens_seen": 34123840, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.25840020179748535, + "rewards/margins": 2.6237287521362305, + "rewards/rejected": -2.882129192352295, + "step": 95 + }, + { + "epoch": 0.36772803447450325, + "grad_norm": 0.67497318983078, + "learning_rate": 7.372491792552694e-06, + "logits/chosen": -0.931308388710022, + "logits/rejected": -0.9206041097640991, + "logps/chosen": -968.970947265625, + "logps/rejected": -982.2496948242188, + "loss": 0.1643, + "num_input_tokens_seen": 34479552, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.1697201132774353, + "rewards/margins": 2.6050310134887695, + "rewards/rejected": -2.7747509479522705, + "step": 96 + }, + { + "epoch": 0.3715585348336126, + "grad_norm": 0.7515125274658203, + "learning_rate": 7.31721960964774e-06, + "logits/chosen": -0.9041364789009094, + "logits/rejected": -0.9017446637153625, + "logps/chosen": -938.43798828125, + "logps/rejected": -966.27490234375, + "loss": 0.2047, + "num_input_tokens_seen": 34835840, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.01113000325858593, + "rewards/margins": 2.7240090370178223, + "rewards/rejected": -2.7351388931274414, + "step": 97 + }, + { + "epoch": 0.37538903519272204, + "grad_norm": 0.954608142375946, + "learning_rate": 7.261584420722328e-06, + "logits/chosen": -0.9252983331680298, + "logits/rejected": -0.9254056215286255, + "logps/chosen": -967.1123046875, + "logps/rejected": -992.2210083007812, + "loss": 0.2532, + "num_input_tokens_seen": 35210176, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.17205007374286652, + "rewards/margins": 2.385958671569824, + "rewards/rejected": -2.5580086708068848, + "step": 98 + }, + { + "epoch": 0.37921953555183147, + "grad_norm": 0.7498520612716675, + "learning_rate": 7.20559494135458e-06, + "logits/chosen": -0.914477527141571, + "logits/rejected": -0.9178078174591064, + "logps/chosen": -956.7589721679688, + "logps/rejected": -974.8713989257812, + "loss": 0.199, + "num_input_tokens_seen": 35568576, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.25146859884262085, + "rewards/margins": 2.7367429733276367, + "rewards/rejected": -2.9882116317749023, + "step": 99 + }, + { + "epoch": 0.3830500359109409, + "grad_norm": 0.8344777822494507, + "learning_rate": 7.149259942624287e-06, + "logits/chosen": -0.9274442195892334, + "logits/rejected": -0.9153659343719482, + "logps/chosen": -955.1148681640625, + "logps/rejected": -978.0455932617188, + "loss": 0.2089, + "num_input_tokens_seen": 35929024, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.20801052451133728, + "rewards/margins": 2.4391160011291504, + "rewards/rejected": -2.6471261978149414, + "step": 100 + }, + { + "epoch": 0.38688053627005026, + "grad_norm": 0.8431086540222168, + "learning_rate": 7.092588249738871e-06, + "logits/chosen": -0.9213912487030029, + "logits/rejected": -0.9219788908958435, + "logps/chosen": -952.00927734375, + "logps/rejected": -941.57763671875, + "loss": 0.2212, + "num_input_tokens_seen": 36292032, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08584348857402802, + "rewards/margins": 2.4320878982543945, + "rewards/rejected": -2.5179312229156494, + "step": 101 + }, + { + "epoch": 0.3907110366291597, + "grad_norm": 0.7968887686729431, + "learning_rate": 7.03558874065087e-06, + "logits/chosen": -0.9020681977272034, + "logits/rejected": -0.8992164731025696, + "logps/chosen": -919.3433837890625, + "logps/rejected": -936.555908203125, + "loss": 0.2015, + "num_input_tokens_seen": 36647104, + "rewards/accuracies": 0.8984375, + "rewards/chosen": -0.07779061794281006, + "rewards/margins": 2.4831807613372803, + "rewards/rejected": -2.56097149848938, + "step": 102 + }, + { + "epoch": 0.3945415369882691, + "grad_norm": 0.7636644840240479, + "learning_rate": 6.978270344667143e-06, + "logits/chosen": -0.9130829572677612, + "logits/rejected": -0.904511570930481, + "logps/chosen": -946.6600341796875, + "logps/rejected": -955.886962890625, + "loss": 0.198, + "num_input_tokens_seen": 36996736, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.026550982147455215, + "rewards/margins": 2.4634013175964355, + "rewards/rejected": -2.489952325820923, + "step": 103 + }, + { + "epoch": 0.3983720373473785, + "grad_norm": 1.0013426542282104, + "learning_rate": 6.920642041050055e-06, + "logits/chosen": -0.9012176990509033, + "logits/rejected": -0.9022072553634644, + "logps/chosen": -970.3377685546875, + "logps/rejected": -963.3656616210938, + "loss": 0.2626, + "num_input_tokens_seen": 37351296, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.2126973420381546, + "rewards/margins": 2.224216938018799, + "rewards/rejected": -2.4369139671325684, + "step": 104 + }, + { + "epoch": 0.4022025377064879, + "grad_norm": 0.6189612746238708, + "learning_rate": 6.862712857610812e-06, + "logits/chosen": -0.9104306697845459, + "logits/rejected": -0.9062457084655762, + "logps/chosen": -979.9945068359375, + "logps/rejected": -979.806884765625, + "loss": 0.1446, + "num_input_tokens_seen": 37710464, + "rewards/accuracies": 0.9609375, + "rewards/chosen": 0.14229023456573486, + "rewards/margins": 2.9538631439208984, + "rewards/rejected": -2.811572790145874, + "step": 105 + }, + { + "epoch": 0.4060330380655973, + "grad_norm": 0.7884417772293091, + "learning_rate": 6.804491869295207e-06, + "logits/chosen": -0.9367163181304932, + "logits/rejected": -0.9306931495666504, + "logps/chosen": -975.1888427734375, + "logps/rejected": -973.6009521484375, + "loss": 0.2115, + "num_input_tokens_seen": 38075136, + "rewards/accuracies": 0.9453125, + "rewards/chosen": 0.028138499706983566, + "rewards/margins": 2.4732718467712402, + "rewards/rejected": -2.4451332092285156, + "step": 106 + }, + { + "epoch": 0.40986353842470674, + "grad_norm": 0.7455801367759705, + "learning_rate": 6.745988196761976e-06, + "logits/chosen": -0.8959982991218567, + "logits/rejected": -0.8889661431312561, + "logps/chosen": -983.70458984375, + "logps/rejected": -999.4580688476562, + "loss": 0.1883, + "num_input_tokens_seen": 38438464, + "rewards/accuracies": 0.9453125, + "rewards/chosen": 0.2271251380443573, + "rewards/margins": 2.7836532592773438, + "rewards/rejected": -2.556527853012085, + "step": 107 + }, + { + "epoch": 0.4136940387838161, + "grad_norm": 0.6705349683761597, + "learning_rate": 6.687211004953992e-06, + "logits/chosen": -0.9123795032501221, + "logits/rejected": -0.8993309736251831, + "logps/chosen": -931.0626220703125, + "logps/rejected": -943.3898315429688, + "loss": 0.1569, + "num_input_tokens_seen": 38786112, + "rewards/accuracies": 0.953125, + "rewards/chosen": 0.08423255383968353, + "rewards/margins": 2.947176456451416, + "rewards/rejected": -2.8629438877105713, + "step": 108 + }, + { + "epoch": 0.41752453914292553, + "grad_norm": 0.7670567631721497, + "learning_rate": 6.628169501662527e-06, + "logits/chosen": -0.9253091216087341, + "logits/rejected": -0.9139726161956787, + "logps/chosen": -931.079833984375, + "logps/rejected": -947.3938598632812, + "loss": 0.1728, + "num_input_tokens_seen": 39137920, + "rewards/accuracies": 0.953125, + "rewards/chosen": 0.0590210035443306, + "rewards/margins": 2.6670312881469727, + "rewards/rejected": -2.6080102920532227, + "step": 109 + }, + { + "epoch": 0.42135503950203496, + "grad_norm": 0.9639368057250977, + "learning_rate": 6.568872936084789e-06, + "logits/chosen": -0.9225589036941528, + "logits/rejected": -0.9140769243240356, + "logps/chosen": -943.505126953125, + "logps/rejected": -975.529296875, + "loss": 0.2576, + "num_input_tokens_seen": 39496896, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.23697441816329956, + "rewards/margins": 2.4227981567382812, + "rewards/rejected": -2.6597723960876465, + "step": 110 + }, + { + "epoch": 0.4251855398611444, + "grad_norm": 0.8447521924972534, + "learning_rate": 6.509330597374993e-06, + "logits/chosen": -0.9280617833137512, + "logits/rejected": -0.9181127548217773, + "logps/chosen": -984.5870361328125, + "logps/rejected": -1005.9556274414062, + "loss": 0.2094, + "num_input_tokens_seen": 39862784, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.1390344500541687, + "rewards/margins": 2.583678722381592, + "rewards/rejected": -2.7227132320404053, + "step": 111 + }, + { + "epoch": 0.42901604022025375, + "grad_norm": 0.8629226088523865, + "learning_rate": 6.44955181318915e-06, + "logits/chosen": -0.9241629242897034, + "logits/rejected": -0.9225752353668213, + "logps/chosen": -970.8624267578125, + "logps/rejected": -994.7688598632812, + "loss": 0.1943, + "num_input_tokens_seen": 40229504, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.060454513877630234, + "rewards/margins": 2.8952488899230957, + "rewards/rejected": -2.9557032585144043, + "step": 112 + }, + { + "epoch": 0.43284654057936317, + "grad_norm": 0.8970760703086853, + "learning_rate": 6.389545948223841e-06, + "logits/chosen": -0.9069831967353821, + "logits/rejected": -0.8961917161941528, + "logps/chosen": -929.178466796875, + "logps/rejected": -957.38232421875, + "loss": 0.2281, + "num_input_tokens_seen": 40578304, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.10809502750635147, + "rewards/margins": 2.5905418395996094, + "rewards/rejected": -2.698636770248413, + "step": 113 + }, + { + "epoch": 0.4366770409384726, + "grad_norm": 0.7248643040657043, + "learning_rate": 6.329322402749181e-06, + "logits/chosen": -0.9125691652297974, + "logits/rejected": -0.9055846929550171, + "logps/chosen": -947.4840698242188, + "logps/rejected": -994.1029663085938, + "loss": 0.1579, + "num_input_tokens_seen": 40926400, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.054590873420238495, + "rewards/margins": 3.0778942108154297, + "rewards/rejected": -3.1324849128723145, + "step": 114 + }, + { + "epoch": 0.440507541297582, + "grad_norm": 0.8313496708869934, + "learning_rate": 6.2688906111362115e-06, + "logits/chosen": -0.9156756401062012, + "logits/rejected": -0.9181293845176697, + "logps/chosen": -962.3348388671875, + "logps/rejected": -958.6428833007812, + "loss": 0.1995, + "num_input_tokens_seen": 41283008, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.13335499167442322, + "rewards/margins": 2.7144775390625, + "rewards/rejected": -2.847832441329956, + "step": 115 + }, + { + "epoch": 0.4443380416566914, + "grad_norm": 0.8500897884368896, + "learning_rate": 6.208260040378946e-06, + "logits/chosen": -0.9062702655792236, + "logits/rejected": -0.894444465637207, + "logps/chosen": -897.670166015625, + "logps/rejected": -909.5458984375, + "loss": 0.226, + "num_input_tokens_seen": 41619520, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.0693136677145958, + "rewards/margins": 2.6547067165374756, + "rewards/rejected": -2.585392951965332, + "step": 116 + }, + { + "epoch": 0.4481685420158008, + "grad_norm": 0.7028183937072754, + "learning_rate": 6.147440188611324e-06, + "logits/chosen": -0.9204760193824768, + "logits/rejected": -0.9124895334243774, + "logps/chosen": -991.82080078125, + "logps/rejected": -1025.6387939453125, + "loss": 0.1644, + "num_input_tokens_seen": 41986368, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.06996024399995804, + "rewards/margins": 2.8688559532165527, + "rewards/rejected": -2.9388160705566406, + "step": 117 + }, + { + "epoch": 0.45199904237491023, + "grad_norm": 0.706648588180542, + "learning_rate": 6.0864405836192575e-06, + "logits/chosen": -0.9304739832878113, + "logits/rejected": -0.9166991114616394, + "logps/chosen": -915.133544921875, + "logps/rejected": -950.4501953125, + "loss": 0.1757, + "num_input_tokens_seen": 42347008, + "rewards/accuracies": 0.921875, + "rewards/chosen": 0.03329768031835556, + "rewards/margins": 3.0157203674316406, + "rewards/rejected": -2.9824228286743164, + "step": 118 + }, + { + "epoch": 0.45582954273401965, + "grad_norm": 0.7446486949920654, + "learning_rate": 6.025270781348055e-06, + "logits/chosen": -0.9153317213058472, + "logits/rejected": -0.9103206992149353, + "logps/chosen": -965.8341064453125, + "logps/rejected": -968.9500732421875, + "loss": 0.1706, + "num_input_tokens_seen": 42698240, + "rewards/accuracies": 0.9453125, + "rewards/chosen": 0.03542988374829292, + "rewards/margins": 2.953965187072754, + "rewards/rejected": -2.9185352325439453, + "step": 119 + }, + { + "epoch": 0.459660043093129, + "grad_norm": 0.8363955616950989, + "learning_rate": 5.963940364405425e-06, + "logits/chosen": -0.9062429666519165, + "logits/rejected": -0.9142025113105774, + "logps/chosen": -958.3355712890625, + "logps/rejected": -946.6017456054688, + "loss": 0.2021, + "num_input_tokens_seen": 43050688, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.010046720504760742, + "rewards/margins": 2.874662399291992, + "rewards/rejected": -2.8646156787872314, + "step": 120 + }, + { + "epoch": 0.46349054345223845, + "grad_norm": 0.7020468711853027, + "learning_rate": 5.902458940560304e-06, + "logits/chosen": -0.9269927144050598, + "logits/rejected": -0.9181845188140869, + "logps/chosen": -986.21875, + "logps/rejected": -993.67041015625, + "loss": 0.1728, + "num_input_tokens_seen": 43415744, + "rewards/accuracies": 0.953125, + "rewards/chosen": 0.013758718967437744, + "rewards/margins": 2.8712427616119385, + "rewards/rejected": -2.8574843406677246, + "step": 121 + }, + { + "epoch": 0.46732104381134787, + "grad_norm": 0.7419387102127075, + "learning_rate": 5.8408361412377475e-06, + "logits/chosen": -0.9247527718544006, + "logits/rejected": -0.9219599962234497, + "logps/chosen": -936.7920532226562, + "logps/rejected": -948.786376953125, + "loss": 0.1615, + "num_input_tokens_seen": 43769984, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24296002089977264, + "rewards/margins": 2.9300107955932617, + "rewards/rejected": -2.6870508193969727, + "step": 122 + }, + { + "epoch": 0.4711515441704573, + "grad_norm": 0.6466975212097168, + "learning_rate": 5.779081620010104e-06, + "logits/chosen": -0.9124172925949097, + "logits/rejected": -0.9061915874481201, + "logps/chosen": -971.8199462890625, + "logps/rejected": -985.4337768554688, + "loss": 0.1474, + "num_input_tokens_seen": 44125376, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.05713702738285065, + "rewards/margins": 2.898320198059082, + "rewards/rejected": -2.8411829471588135, + "step": 123 + }, + { + "epoch": 0.47498204452956666, + "grad_norm": 0.8490724563598633, + "learning_rate": 5.717205051084731e-06, + "logits/chosen": -0.9195170998573303, + "logits/rejected": -0.9108103513717651, + "logps/chosen": -990.1031494140625, + "logps/rejected": -1022.4898681640625, + "loss": 0.2012, + "num_input_tokens_seen": 44496896, + "rewards/accuracies": 0.921875, + "rewards/chosen": 0.06758461147546768, + "rewards/margins": 2.6426219940185547, + "rewards/rejected": -2.5750372409820557, + "step": 124 + }, + { + "epoch": 0.4788125448886761, + "grad_norm": 0.7902799248695374, + "learning_rate": 5.655216127788472e-06, + "logits/chosen": -0.9141355752944946, + "logits/rejected": -0.906214714050293, + "logps/chosen": -994.0625, + "logps/rejected": -1009.5243530273438, + "loss": 0.1836, + "num_input_tokens_seen": 44858496, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1333620846271515, + "rewards/margins": 2.6629581451416016, + "rewards/rejected": -2.5295960903167725, + "step": 125 + }, + { + "epoch": 0.4826430452477855, + "grad_norm": 0.7545987963676453, + "learning_rate": 5.593124561049141e-06, + "logits/chosen": -0.9265604019165039, + "logits/rejected": -0.9306275248527527, + "logps/chosen": -950.220947265625, + "logps/rejected": -954.4283447265625, + "loss": 0.1678, + "num_input_tokens_seen": 45215680, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09714318811893463, + "rewards/margins": 3.0466086864471436, + "rewards/rejected": -2.949465751647949, + "step": 126 + }, + { + "epoch": 0.4864735456068949, + "grad_norm": 0.7740658521652222, + "learning_rate": 5.530940077874248e-06, + "logits/chosen": -0.9124447703361511, + "logits/rejected": -0.9034743905067444, + "logps/chosen": -938.5323486328125, + "logps/rejected": -952.2512817382812, + "loss": 0.1832, + "num_input_tokens_seen": 45575872, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0883568748831749, + "rewards/margins": 2.4720749855041504, + "rewards/rejected": -2.560431957244873, + "step": 127 + }, + { + "epoch": 0.4903040459660043, + "grad_norm": 0.7650671005249023, + "learning_rate": 5.468672419827208e-06, + "logits/chosen": -0.9438241124153137, + "logits/rejected": -0.933975875377655, + "logps/chosen": -962.1099853515625, + "logps/rejected": -1007.1533203125, + "loss": 0.1594, + "num_input_tokens_seen": 45951488, + "rewards/accuracies": 0.9453125, + "rewards/chosen": 0.20753604173660278, + "rewards/margins": 3.042029857635498, + "rewards/rejected": -2.83449387550354, + "step": 128 + }, + { + "epoch": 0.4941345463251137, + "grad_norm": 0.8886730670928955, + "learning_rate": 5.406331341501264e-06, + "logits/chosen": -0.9003716707229614, + "logits/rejected": -0.8953530192375183, + "logps/chosen": -955.8033447265625, + "logps/rejected": -982.9979248046875, + "loss": 0.2112, + "num_input_tokens_seen": 46313472, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.08642053604125977, + "rewards/margins": 2.638611316680908, + "rewards/rejected": -2.5521907806396484, + "step": 129 + }, + { + "epoch": 0.49796504668422314, + "grad_norm": 0.7639846205711365, + "learning_rate": 5.34392660899138e-06, + "logits/chosen": -0.9218605756759644, + "logits/rejected": -0.9198346734046936, + "logps/chosen": -937.4310913085938, + "logps/rejected": -954.9653930664062, + "loss": 0.1785, + "num_input_tokens_seen": 46683136, + "rewards/accuracies": 0.9453125, + "rewards/chosen": 0.08254611492156982, + "rewards/margins": 2.7471113204956055, + "rewards/rejected": -2.664564847946167, + "step": 130 + }, + { + "epoch": 0.5017955470433325, + "grad_norm": 0.8248947262763977, + "learning_rate": 5.281467998364314e-06, + "logits/chosen": -0.9116605520248413, + "logits/rejected": -0.9014899134635925, + "logps/chosen": -946.0270385742188, + "logps/rejected": -985.1790161132812, + "loss": 0.1789, + "num_input_tokens_seen": 47033152, + "rewards/accuracies": 0.9140625, + "rewards/chosen": 0.23138511180877686, + "rewards/margins": 3.0144951343536377, + "rewards/rejected": -2.7831101417541504, + "step": 131 + }, + { + "epoch": 0.505626047402442, + "grad_norm": 0.6977185010910034, + "learning_rate": 5.218965294127155e-06, + "logits/chosen": -0.9298666715621948, + "logits/rejected": -0.9158017039299011, + "logps/chosen": -976.8021240234375, + "logps/rejected": -1009.7666015625, + "loss": 0.1489, + "num_input_tokens_seen": 47407552, + "rewards/accuracies": 0.9609375, + "rewards/chosen": 0.0529421865940094, + "rewards/margins": 3.1810245513916016, + "rewards/rejected": -3.128082513809204, + "step": 132 + }, + { + "epoch": 0.5094565477615514, + "grad_norm": 0.7374473214149475, + "learning_rate": 5.156428287694508e-06, + "logits/chosen": -0.9254990220069885, + "logits/rejected": -0.9135416150093079, + "logps/chosen": -996.0518798828125, + "logps/rejected": -1023.51416015625, + "loss": 0.1521, + "num_input_tokens_seen": 47777088, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.0355679988861084, + "rewards/margins": 3.0022096633911133, + "rewards/rejected": -3.037777900695801, + "step": 133 + }, + { + "epoch": 0.5132870481206607, + "grad_norm": 0.8357559442520142, + "learning_rate": 5.093866775854618e-06, + "logits/chosen": -0.9214972257614136, + "logits/rejected": -0.916042685508728, + "logps/chosen": -976.1259155273438, + "logps/rejected": -985.6715087890625, + "loss": 0.1878, + "num_input_tokens_seen": 48146176, + "rewards/accuracies": 0.921875, + "rewards/chosen": 0.21140217781066895, + "rewards/margins": 2.903090000152588, + "rewards/rejected": -2.691687822341919, + "step": 134 + }, + { + "epoch": 0.5171175484797702, + "grad_norm": 0.697043240070343, + "learning_rate": 5.03129055923465e-06, + "logits/chosen": -0.938490629196167, + "logits/rejected": -0.9244428873062134, + "logps/chosen": -949.1923217773438, + "logps/rejected": -982.4913330078125, + "loss": 0.1465, + "num_input_tokens_seen": 48511488, + "rewards/accuracies": 0.953125, + "rewards/chosen": 0.17881152033805847, + "rewards/margins": 3.344749927520752, + "rewards/rejected": -3.165938377380371, + "step": 135 + }, + { + "epoch": 0.5209480488388796, + "grad_norm": 0.8031744956970215, + "learning_rate": 4.968709440765352e-06, + "logits/chosen": -0.9183306694030762, + "logits/rejected": -0.9239646196365356, + "logps/chosen": -943.1031494140625, + "logps/rejected": -952.557373046875, + "loss": 0.1633, + "num_input_tokens_seen": 48862336, + "rewards/accuracies": 0.9296875, + "rewards/chosen": 0.12101169675588608, + "rewards/margins": 2.9173638820648193, + "rewards/rejected": -2.7963523864746094, + "step": 136 + }, + { + "epoch": 0.5247785491979889, + "grad_norm": 0.8991208076477051, + "learning_rate": 4.906133224145384e-06, + "logits/chosen": -0.9242952466011047, + "logits/rejected": -0.9066295623779297, + "logps/chosen": -958.8857421875, + "logps/rejected": -1014.4093017578125, + "loss": 0.182, + "num_input_tokens_seen": 49226048, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20996643602848053, + "rewards/margins": 2.8446974754333496, + "rewards/rejected": -2.6347310543060303, + "step": 137 + }, + { + "epoch": 0.5286090495570984, + "grad_norm": 1.0389693975448608, + "learning_rate": 4.843571712305493e-06, + "logits/chosen": -0.9104933738708496, + "logits/rejected": -0.9128438234329224, + "logps/chosen": -947.2581176757812, + "logps/rejected": -968.7093505859375, + "loss": 0.2354, + "num_input_tokens_seen": 49581888, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.028731685131788254, + "rewards/margins": 2.8915884494781494, + "rewards/rejected": -2.9203200340270996, + "step": 138 + }, + { + "epoch": 0.5324395499162078, + "grad_norm": 0.6432647109031677, + "learning_rate": 4.781034705872846e-06, + "logits/chosen": -0.9122514128684998, + "logits/rejected": -0.9045748114585876, + "logps/chosen": -953.7161865234375, + "logps/rejected": -988.1721801757812, + "loss": 0.1248, + "num_input_tokens_seen": 49936512, + "rewards/accuracies": 0.9765625, + "rewards/chosen": 0.061478182673454285, + "rewards/margins": 3.1349568367004395, + "rewards/rejected": -3.0734782218933105, + "step": 139 + }, + { + "epoch": 0.5362700502753173, + "grad_norm": 0.7527614235877991, + "learning_rate": 4.7185320016356865e-06, + "logits/chosen": -0.9259985685348511, + "logits/rejected": -0.9194520711898804, + "logps/chosen": -962.0812377929688, + "logps/rejected": -992.304443359375, + "loss": 0.1743, + "num_input_tokens_seen": 50309824, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.038489580154418945, + "rewards/margins": 3.002593517303467, + "rewards/rejected": -3.041083335876465, + "step": 140 + }, + { + "epoch": 0.5401005506344266, + "grad_norm": 0.6872031092643738, + "learning_rate": 4.656073391008622e-06, + "logits/chosen": -0.9056570529937744, + "logits/rejected": -0.8970775604248047, + "logps/chosen": -913.651123046875, + "logps/rejected": -951.6573486328125, + "loss": 0.16, + "num_input_tokens_seen": 50659712, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.023862987756729126, + "rewards/margins": 2.8871231079101562, + "rewards/rejected": -2.9109859466552734, + "step": 141 + }, + { + "epoch": 0.543931050993536, + "grad_norm": 0.7276185750961304, + "learning_rate": 4.593668658498737e-06, + "logits/chosen": -0.9364982843399048, + "logits/rejected": -0.9288170337677002, + "logps/chosen": -961.48193359375, + "logps/rejected": -980.2227172851562, + "loss": 0.1646, + "num_input_tokens_seen": 51020992, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.14927178621292114, + "rewards/margins": 2.987109899520874, + "rewards/rejected": -2.8378381729125977, + "step": 142 + }, + { + "epoch": 0.5477615513526455, + "grad_norm": 0.6749744415283203, + "learning_rate": 4.531327580172794e-06, + "logits/chosen": -0.9263750314712524, + "logits/rejected": -0.9184825420379639, + "logps/chosen": -985.8505249023438, + "logps/rejected": -997.365478515625, + "loss": 0.134, + "num_input_tokens_seen": 51378624, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.08198380470275879, + "rewards/margins": 3.251645088195801, + "rewards/rejected": -3.3336286544799805, + "step": 143 + }, + { + "epoch": 0.5515920517117548, + "grad_norm": 0.8101935982704163, + "learning_rate": 4.469059922125753e-06, + "logits/chosen": -0.9402338266372681, + "logits/rejected": -0.9275364875793457, + "logps/chosen": -1014.8330688476562, + "logps/rejected": -1064.818603515625, + "loss": 0.1787, + "num_input_tokens_seen": 51760384, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2844201922416687, + "rewards/margins": 2.994309186935425, + "rewards/rejected": -3.2787294387817383, + "step": 144 + }, + { + "epoch": 0.5554225520708642, + "grad_norm": 0.7569603323936462, + "learning_rate": 4.4068754389508616e-06, + "logits/chosen": -0.916680097579956, + "logits/rejected": -0.9122562408447266, + "logps/chosen": -967.0972900390625, + "logps/rejected": -988.894775390625, + "loss": 0.1561, + "num_input_tokens_seen": 52114304, + "rewards/accuracies": 0.9453125, + "rewards/chosen": 0.028125673532485962, + "rewards/margins": 3.3518166542053223, + "rewards/rejected": -3.323690891265869, + "step": 145 + }, + { + "epoch": 0.5592530524299737, + "grad_norm": 0.7716813087463379, + "learning_rate": 4.34478387221153e-06, + "logits/chosen": -0.9058219790458679, + "logits/rejected": -0.9038896560668945, + "logps/chosen": -958.7652587890625, + "logps/rejected": -981.8695068359375, + "loss": 0.1754, + "num_input_tokens_seen": 52486784, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.23985768854618073, + "rewards/margins": 2.9047439098358154, + "rewards/rejected": -3.144601345062256, + "step": 146 + }, + { + "epoch": 0.5630835527890831, + "grad_norm": 0.6906071901321411, + "learning_rate": 4.282794948915271e-06, + "logits/chosen": -0.9132944345474243, + "logits/rejected": -0.906856894493103, + "logps/chosen": -925.8829345703125, + "logps/rejected": -970.9793701171875, + "loss": 0.1371, + "num_input_tokens_seen": 52831680, + "rewards/accuracies": 0.9609375, + "rewards/chosen": 0.0626637414097786, + "rewards/margins": 3.341050148010254, + "rewards/rejected": -3.2783865928649902, + "step": 147 + }, + { + "epoch": 0.5669140531481924, + "grad_norm": 0.6533832550048828, + "learning_rate": 4.220918379989898e-06, + "logits/chosen": -0.9118704795837402, + "logits/rejected": -0.9107242822647095, + "logps/chosen": -944.8489990234375, + "logps/rejected": -943.7695922851562, + "loss": 0.1264, + "num_input_tokens_seen": 53175680, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -0.09482257068157196, + "rewards/margins": 3.1861157417297363, + "rewards/rejected": -3.280938148498535, + "step": 148 + }, + { + "epoch": 0.5707445535073019, + "grad_norm": 0.7558099031448364, + "learning_rate": 4.159163858762255e-06, + "logits/chosen": -0.903454601764679, + "logits/rejected": -0.9002053737640381, + "logps/chosen": -957.793701171875, + "logps/rejected": -963.5908203125, + "loss": 0.1533, + "num_input_tokens_seen": 53537024, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.1851228028535843, + "rewards/margins": 3.139002799987793, + "rewards/rejected": -3.3241257667541504, + "step": 149 + }, + { + "epoch": 0.5745750538664113, + "grad_norm": 0.6508485674858093, + "learning_rate": 4.097541059439698e-06, + "logits/chosen": -0.9359325766563416, + "logits/rejected": -0.9345180988311768, + "logps/chosen": -947.7728881835938, + "logps/rejected": -962.4177856445312, + "loss": 0.1247, + "num_input_tokens_seen": 53894528, + "rewards/accuracies": 0.984375, + "rewards/chosen": -0.23104152083396912, + "rewards/margins": 3.349843978881836, + "rewards/rejected": -3.580885410308838, + "step": 150 + }, + { + "epoch": 0.5784055542255208, + "grad_norm": 0.7486430406570435, + "learning_rate": 4.036059635594578e-06, + "logits/chosen": -0.9248436689376831, + "logits/rejected": -0.9138121604919434, + "logps/chosen": -971.0343017578125, + "logps/rejected": -982.6134643554688, + "loss": 0.1545, + "num_input_tokens_seen": 54262848, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.06414251029491425, + "rewards/margins": 2.8752317428588867, + "rewards/rejected": -2.9393744468688965, + "step": 151 + }, + { + "epoch": 0.5822360545846301, + "grad_norm": 0.6359651684761047, + "learning_rate": 3.974729218651946e-06, + "logits/chosen": -0.9428115487098694, + "logits/rejected": -0.940241277217865, + "logps/chosen": -961.1180419921875, + "logps/rejected": -996.2943725585938, + "loss": 0.1238, + "num_input_tokens_seen": 54635776, + "rewards/accuracies": 0.984375, + "rewards/chosen": -0.3338150978088379, + "rewards/margins": 3.267073154449463, + "rewards/rejected": -3.6008880138397217, + "step": 152 + }, + { + "epoch": 0.5860665549437395, + "grad_norm": 0.7911621928215027, + "learning_rate": 3.913559416380743e-06, + "logits/chosen": -0.9230860471725464, + "logits/rejected": -0.9275256395339966, + "logps/chosen": -942.3780517578125, + "logps/rejected": -964.422119140625, + "loss": 0.1598, + "num_input_tokens_seen": 54999744, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.24053587019443512, + "rewards/margins": 3.191891670227051, + "rewards/rejected": -3.4324276447296143, + "step": 153 + }, + { + "epoch": 0.589897055302849, + "grad_norm": 0.5902103185653687, + "learning_rate": 3.852559811388676e-06, + "logits/chosen": -0.9106675386428833, + "logits/rejected": -0.9067913889884949, + "logps/chosen": -951.162353515625, + "logps/rejected": -964.935791015625, + "loss": 0.1063, + "num_input_tokens_seen": 55337280, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.1262761652469635, + "rewards/margins": 3.579685688018799, + "rewards/rejected": -3.7059621810913086, + "step": 154 + }, + { + "epoch": 0.5937275556619583, + "grad_norm": 0.9833846688270569, + "learning_rate": 3.791739959621054e-06, + "logits/chosen": -0.9223001599311829, + "logits/rejected": -0.9142757654190063, + "logps/chosen": -959.0340576171875, + "logps/rejected": -990.4814453125, + "loss": 0.2155, + "num_input_tokens_seen": 55702336, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.5217866897583008, + "rewards/margins": 2.574551820755005, + "rewards/rejected": -3.0963382720947266, + "step": 155 + }, + { + "epoch": 0.5975580560210677, + "grad_norm": 0.7217708826065063, + "learning_rate": 3.7311093888637906e-06, + "logits/chosen": -0.9206514954566956, + "logits/rejected": -0.9145509004592896, + "logps/chosen": -968.0018310546875, + "logps/rejected": -975.0421142578125, + "loss": 0.1484, + "num_input_tokens_seen": 56070336, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.11130332946777344, + "rewards/margins": 3.2543931007385254, + "rewards/rejected": -3.3656961917877197, + "step": 156 + }, + { + "epoch": 0.6013885563801772, + "grad_norm": 0.7246530652046204, + "learning_rate": 3.670677597250819e-06, + "logits/chosen": -0.9260054230690002, + "logits/rejected": -0.9178897738456726, + "logps/chosen": -941.1248779296875, + "logps/rejected": -968.9746704101562, + "loss": 0.1514, + "num_input_tokens_seen": 56428800, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.2762157917022705, + "rewards/margins": 3.010899066925049, + "rewards/rejected": -3.2871148586273193, + "step": 157 + }, + { + "epoch": 0.6052190567392866, + "grad_norm": 0.6936725974082947, + "learning_rate": 3.6104540517761594e-06, + "logits/chosen": -0.9163696765899658, + "logits/rejected": -0.914963960647583, + "logps/chosen": -964.3089599609375, + "logps/rejected": -959.0465698242188, + "loss": 0.1324, + "num_input_tokens_seen": 56783040, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.045130133628845215, + "rewards/margins": 3.460543632507324, + "rewards/rejected": -3.50567364692688, + "step": 158 + }, + { + "epoch": 0.609049557098396, + "grad_norm": 0.5236934423446655, + "learning_rate": 3.55044818681085e-06, + "logits/chosen": -0.9246783256530762, + "logits/rejected": -0.9130945205688477, + "logps/chosen": -977.2896728515625, + "logps/rejected": -1020.466064453125, + "loss": 0.0982, + "num_input_tokens_seen": 57145920, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -0.07516241073608398, + "rewards/margins": 3.4908297061920166, + "rewards/rejected": -3.5659918785095215, + "step": 159 + }, + { + "epoch": 0.6128800574575054, + "grad_norm": 0.689087986946106, + "learning_rate": 3.4906694026250075e-06, + "logits/chosen": -0.9287959337234497, + "logits/rejected": -0.9262123107910156, + "logps/chosen": -957.884521484375, + "logps/rejected": -977.1112060546875, + "loss": 0.1404, + "num_input_tokens_seen": 57502528, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.38688862323760986, + "rewards/margins": 3.173988103866577, + "rewards/rejected": -3.5608768463134766, + "step": 160 + }, + { + "epoch": 0.6167105578166148, + "grad_norm": 0.726698100566864, + "learning_rate": 3.431127063915213e-06, + "logits/chosen": -0.9286818504333496, + "logits/rejected": -0.9241329431533813, + "logps/chosen": -960.54833984375, + "logps/rejected": -974.0302734375, + "loss": 0.143, + "num_input_tokens_seen": 57864896, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.151766836643219, + "rewards/margins": 3.2966322898864746, + "rewards/rejected": -3.448399066925049, + "step": 161 + }, + { + "epoch": 0.6205410581757242, + "grad_norm": 0.6334934830665588, + "learning_rate": 3.371830498337475e-06, + "logits/chosen": -0.911125898361206, + "logits/rejected": -0.8991184234619141, + "logps/chosen": -925.171630859375, + "logps/rejected": -978.1533203125, + "loss": 0.1236, + "num_input_tokens_seen": 58218496, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.19827139377593994, + "rewards/margins": 3.3811941146850586, + "rewards/rejected": -3.579465389251709, + "step": 162 + }, + { + "epoch": 0.6243715585348336, + "grad_norm": 0.6609664559364319, + "learning_rate": 3.3127889950460094e-06, + "logits/chosen": -0.9121673703193665, + "logits/rejected": -0.9033511877059937, + "logps/chosen": -918.6185913085938, + "logps/rejected": -975.3275146484375, + "loss": 0.1342, + "num_input_tokens_seen": 58561152, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.2318304032087326, + "rewards/margins": 3.1678988933563232, + "rewards/rejected": -3.3997297286987305, + "step": 163 + }, + { + "epoch": 0.628202058893943, + "grad_norm": 0.9944313168525696, + "learning_rate": 3.254011803238026e-06, + "logits/chosen": -0.9261605739593506, + "logits/rejected": -0.9191899299621582, + "logps/chosen": -988.6527709960938, + "logps/rejected": -1030.06494140625, + "loss": 0.1939, + "num_input_tokens_seen": 58925504, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.6074918508529663, + "rewards/margins": 3.271310806274414, + "rewards/rejected": -3.87880277633667, + "step": 164 + }, + { + "epoch": 0.6320325592530525, + "grad_norm": 0.7976089715957642, + "learning_rate": 3.195508130704795e-06, + "logits/chosen": -0.9168756008148193, + "logits/rejected": -0.9133727550506592, + "logps/chosen": -940.9190063476562, + "logps/rejected": -952.8231811523438, + "loss": 0.1649, + "num_input_tokens_seen": 59269120, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.39120060205459595, + "rewards/margins": 3.0984408855438232, + "rewards/rejected": -3.4896414279937744, + "step": 165 + }, + { + "epoch": 0.6358630596121618, + "grad_norm": 0.7843651175498962, + "learning_rate": 3.1372871423891894e-06, + "logits/chosen": -0.9184448719024658, + "logits/rejected": -0.9136737585067749, + "logps/chosen": -948.08056640625, + "logps/rejected": -970.911376953125, + "loss": 0.1534, + "num_input_tokens_seen": 59620928, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.2569704055786133, + "rewards/margins": 3.2640576362609863, + "rewards/rejected": -3.5210278034210205, + "step": 166 + }, + { + "epoch": 0.6396935599712712, + "grad_norm": 0.6255090832710266, + "learning_rate": 3.079357958949946e-06, + "logits/chosen": -0.933240532875061, + "logits/rejected": -0.9224045872688293, + "logps/chosen": -943.7069091796875, + "logps/rejected": -979.8607177734375, + "loss": 0.1166, + "num_input_tokens_seen": 59974656, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.02661116048693657, + "rewards/margins": 3.640368700027466, + "rewards/rejected": -3.666980028152466, + "step": 167 + }, + { + "epoch": 0.6435240603303807, + "grad_norm": 0.783669650554657, + "learning_rate": 3.021729655332858e-06, + "logits/chosen": -0.9272843599319458, + "logits/rejected": -0.9127662181854248, + "logps/chosen": -963.0645751953125, + "logps/rejected": -990.942138671875, + "loss": 0.1577, + "num_input_tokens_seen": 60329216, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.14982371032238007, + "rewards/margins": 3.0728282928466797, + "rewards/rejected": -3.222651958465576, + "step": 168 + }, + { + "epoch": 0.64735456068949, + "grad_norm": 0.5235418081283569, + "learning_rate": 2.9644112593491315e-06, + "logits/chosen": -0.9054789543151855, + "logits/rejected": -0.9013749361038208, + "logps/chosen": -932.501708984375, + "logps/rejected": -954.3444213867188, + "loss": 0.1012, + "num_input_tokens_seen": 60686464, + "rewards/accuracies": 0.984375, + "rewards/chosen": -0.05191610008478165, + "rewards/margins": 3.5466318130493164, + "rewards/rejected": -3.5985476970672607, + "step": 169 + }, + { + "epoch": 0.6511850610485995, + "grad_norm": 0.7909523844718933, + "learning_rate": 2.90741175026113e-06, + "logits/chosen": -0.9309460520744324, + "logits/rejected": -0.9214715361595154, + "logps/chosen": -954.997314453125, + "logps/rejected": -989.3560791015625, + "loss": 0.1475, + "num_input_tokens_seen": 61046144, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.11950027942657471, + "rewards/margins": 3.6973204612731934, + "rewards/rejected": -3.8168208599090576, + "step": 170 + }, + { + "epoch": 0.6550155614077089, + "grad_norm": 0.773652195930481, + "learning_rate": 2.850740057375716e-06, + "logits/chosen": -0.933504581451416, + "logits/rejected": -0.9210010170936584, + "logps/chosen": -926.88916015625, + "logps/rejected": -958.1067504882812, + "loss": 0.148, + "num_input_tokens_seen": 61395776, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.4070283770561218, + "rewards/margins": 3.2778236865997314, + "rewards/rejected": -3.684852123260498, + "step": 171 + }, + { + "epoch": 0.6588460617668183, + "grad_norm": 0.797848641872406, + "learning_rate": 2.7944050586454215e-06, + "logits/chosen": -0.9204280376434326, + "logits/rejected": -0.9115635752677917, + "logps/chosen": -903.8150634765625, + "logps/rejected": -940.1217041015625, + "loss": 0.1509, + "num_input_tokens_seen": 61739072, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.38842105865478516, + "rewards/margins": 3.475029468536377, + "rewards/rejected": -3.863450527191162, + "step": 172 + }, + { + "epoch": 0.6626765621259277, + "grad_norm": 0.7227478623390198, + "learning_rate": 2.7384155792776724e-06, + "logits/chosen": -0.9213179349899292, + "logits/rejected": -0.9115352630615234, + "logps/chosen": -959.1319580078125, + "logps/rejected": -990.2802734375, + "loss": 0.1492, + "num_input_tokens_seen": 62091840, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.45713087916374207, + "rewards/margins": 3.632582664489746, + "rewards/rejected": -4.0897135734558105, + "step": 173 + }, + { + "epoch": 0.6665070624850371, + "grad_norm": 0.7665003538131714, + "learning_rate": 2.682780390352262e-06, + "logits/chosen": -0.929406464099884, + "logits/rejected": -0.9264893531799316, + "logps/chosen": -937.1808471679688, + "logps/rejected": -971.6337890625, + "loss": 0.1385, + "num_input_tokens_seen": 62452416, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.38978707790374756, + "rewards/margins": 3.6679680347442627, + "rewards/rejected": -4.057755470275879, + "step": 174 + }, + { + "epoch": 0.6703375628441465, + "grad_norm": 0.7943733334541321, + "learning_rate": 2.627508207447308e-06, + "logits/chosen": -0.9343462586402893, + "logits/rejected": -0.9233741164207458, + "logps/chosen": -979.9729614257812, + "logps/rejected": -1021.626220703125, + "loss": 0.1579, + "num_input_tokens_seen": 62826432, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.41942232847213745, + "rewards/margins": 3.447638988494873, + "rewards/rejected": -3.867061138153076, + "step": 175 + }, + { + "epoch": 0.674168063203256, + "grad_norm": 0.8248593211174011, + "learning_rate": 2.5726076892739127e-06, + "logits/chosen": -0.9268508553504944, + "logits/rejected": -0.9227650165557861, + "logps/chosen": -1003.8179321289062, + "logps/rejected": -1027.9615478515625, + "loss": 0.1482, + "num_input_tokens_seen": 63188736, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.36489757895469666, + "rewards/margins": 3.3734445571899414, + "rewards/rejected": -3.73834228515625, + "step": 176 + }, + { + "epoch": 0.6779985635623653, + "grad_norm": 0.7207663655281067, + "learning_rate": 2.5180874363197217e-06, + "logits/chosen": -0.9383317828178406, + "logits/rejected": -0.9231407642364502, + "logps/chosen": -990.7264404296875, + "logps/rejected": -1012.448486328125, + "loss": 0.131, + "num_input_tokens_seen": 63559040, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.3337286114692688, + "rewards/margins": 3.171330451965332, + "rewards/rejected": -3.505059242248535, + "step": 177 + }, + { + "epoch": 0.6818290639214747, + "grad_norm": 0.6918401122093201, + "learning_rate": 2.463955989501607e-06, + "logits/chosen": -0.9346806406974792, + "logits/rejected": -0.9255118370056152, + "logps/chosen": -968.970947265625, + "logps/rejected": -995.2408447265625, + "loss": 0.1255, + "num_input_tokens_seen": 63918080, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.3664875030517578, + "rewards/margins": 3.5894227027893066, + "rewards/rejected": -3.9559102058410645, + "step": 178 + }, + { + "epoch": 0.6856595642805842, + "grad_norm": 0.6757586598396301, + "learning_rate": 2.41022182882768e-06, + "logits/chosen": -0.917214035987854, + "logits/rejected": -0.9152848124504089, + "logps/chosen": -940.9396362304688, + "logps/rejected": -953.3655395507812, + "loss": 0.1311, + "num_input_tokens_seen": 64264384, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.32321658730506897, + "rewards/margins": 3.2651116847991943, + "rewards/rejected": -3.5883283615112305, + "step": 179 + }, + { + "epoch": 0.6894900646396935, + "grad_norm": 0.7588072419166565, + "learning_rate": 2.356893372068855e-06, + "logits/chosen": -0.9298040866851807, + "logits/rejected": -0.9151283502578735, + "logps/chosen": -941.1553955078125, + "logps/rejected": -963.6946411132812, + "loss": 0.148, + "num_input_tokens_seen": 64627520, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.32789045572280884, + "rewards/margins": 3.368997097015381, + "rewards/rejected": -3.696887254714966, + "step": 180 + }, + { + "epoch": 0.693320564998803, + "grad_norm": 0.7013005614280701, + "learning_rate": 2.3039789734401524e-06, + "logits/chosen": -0.9495141506195068, + "logits/rejected": -0.9404861927032471, + "logps/chosen": -907.8338623046875, + "logps/rejected": -958.0008544921875, + "loss": 0.1238, + "num_input_tokens_seen": 64985472, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -0.2692577838897705, + "rewards/margins": 3.5591087341308594, + "rewards/rejected": -3.828366756439209, + "step": 181 + }, + { + "epoch": 0.6971510653579124, + "grad_norm": 0.7725825905799866, + "learning_rate": 2.251486922291957e-06, + "logits/chosen": -0.9297600984573364, + "logits/rejected": -0.9223314523696899, + "logps/chosen": -897.1873779296875, + "logps/rejected": -942.2521362304688, + "loss": 0.162, + "num_input_tokens_seen": 65333056, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.5130435228347778, + "rewards/margins": 3.2171437740325928, + "rewards/rejected": -3.73018741607666, + "step": 182 + }, + { + "epoch": 0.7009815657170217, + "grad_norm": 0.8550043702125549, + "learning_rate": 2.1994254418114524e-06, + "logits/chosen": -0.9317770600318909, + "logits/rejected": -0.9222955703735352, + "logps/chosen": -988.07373046875, + "logps/rejected": -1032.86279296875, + "loss": 0.1528, + "num_input_tokens_seen": 65701376, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.44387656450271606, + "rewards/margins": 3.2253012657165527, + "rewards/rejected": -3.669177770614624, + "step": 183 + }, + { + "epoch": 0.7048120660761312, + "grad_norm": 0.6962841153144836, + "learning_rate": 2.147802687734409e-06, + "logits/chosen": -0.9174663424491882, + "logits/rejected": -0.9097387790679932, + "logps/chosen": -961.351318359375, + "logps/rejected": -999.2906494140625, + "loss": 0.1427, + "num_input_tokens_seen": 66057152, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.4570930600166321, + "rewards/margins": 3.385087490081787, + "rewards/rejected": -3.8421807289123535, + "step": 184 + }, + { + "epoch": 0.7086425664352406, + "grad_norm": 0.7893382906913757, + "learning_rate": 2.0966267470675273e-06, + "logits/chosen": -0.9248800277709961, + "logits/rejected": -0.9228296279907227, + "logps/chosen": -958.9442138671875, + "logps/rejected": -972.08984375, + "loss": 0.1538, + "num_input_tokens_seen": 66413248, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.31346142292022705, + "rewards/margins": 3.536005973815918, + "rewards/rejected": -3.8494672775268555, + "step": 185 + }, + { + "epoch": 0.71247306679435, + "grad_norm": 0.8005087971687317, + "learning_rate": 2.0459056368215786e-06, + "logits/chosen": -0.9207695722579956, + "logits/rejected": -0.9085906744003296, + "logps/chosen": -939.89208984375, + "logps/rejected": -962.2757568359375, + "loss": 0.1646, + "num_input_tokens_seen": 66777600, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.39592188596725464, + "rewards/margins": 3.1147937774658203, + "rewards/rejected": -3.5107157230377197, + "step": 186 + }, + { + "epoch": 0.7163035671534594, + "grad_norm": 0.9660282135009766, + "learning_rate": 1.9956473027554846e-06, + "logits/chosen": -0.9134042263031006, + "logits/rejected": -0.9137810468673706, + "logps/chosen": -931.7164916992188, + "logps/rejected": -959.245361328125, + "loss": 0.1682, + "num_input_tokens_seen": 67126784, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3818260133266449, + "rewards/margins": 3.1711347103118896, + "rewards/rejected": -3.5529606342315674, + "step": 187 + }, + { + "epoch": 0.7201340675125688, + "grad_norm": 0.6785956621170044, + "learning_rate": 1.9458596181315643e-06, + "logits/chosen": -0.920072078704834, + "logits/rejected": -0.918395459651947, + "logps/chosen": -968.3331909179688, + "logps/rejected": -975.0594482421875, + "loss": 0.1301, + "num_input_tokens_seen": 67486848, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.5244669914245605, + "rewards/margins": 3.3171777725219727, + "rewards/rejected": -3.8416450023651123, + "step": 188 + }, + { + "epoch": 0.7239645678716783, + "grad_norm": 0.6946900486946106, + "learning_rate": 1.8965503824821496e-06, + "logits/chosen": -0.9121235609054565, + "logits/rejected": -0.9028680324554443, + "logps/chosen": -942.3410034179688, + "logps/rejected": -981.06640625, + "loss": 0.1263, + "num_input_tokens_seen": 67830400, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.6077229976654053, + "rewards/margins": 3.4614524841308594, + "rewards/rejected": -4.0691752433776855, + "step": 189 + }, + { + "epoch": 0.7277950682307877, + "grad_norm": 0.8973605036735535, + "learning_rate": 1.84772732038774e-06, + "logits/chosen": -0.9083918333053589, + "logits/rejected": -0.9054266810417175, + "logps/chosen": -953.493896484375, + "logps/rejected": -976.3198852539062, + "loss": 0.1905, + "num_input_tokens_seen": 68184448, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.4564862847328186, + "rewards/margins": 3.076885223388672, + "rewards/rejected": -3.5333714485168457, + "step": 190 + }, + { + "epoch": 0.731625568589897, + "grad_norm": 0.664169430732727, + "learning_rate": 1.7993980802668947e-06, + "logits/chosen": -0.9126517176628113, + "logits/rejected": -0.899666428565979, + "logps/chosen": -944.12109375, + "logps/rejected": -996.7346801757812, + "loss": 0.1212, + "num_input_tokens_seen": 68536640, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.12518563866615295, + "rewards/margins": 3.5026543140411377, + "rewards/rejected": -3.627840042114258, + "step": 191 + }, + { + "epoch": 0.7354560689490065, + "grad_norm": 0.6976946592330933, + "learning_rate": 1.7515702331780753e-06, + "logits/chosen": -0.925005316734314, + "logits/rejected": -0.9164972305297852, + "logps/chosen": -908.323974609375, + "logps/rejected": -934.6966552734375, + "loss": 0.1466, + "num_input_tokens_seen": 68881216, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22267469763755798, + "rewards/margins": 3.2678799629211426, + "rewards/rejected": -3.4905548095703125, + "step": 192 + }, + { + "epoch": 0.7392865693081159, + "grad_norm": 0.7548503875732422, + "learning_rate": 1.7042512716335873e-06, + "logits/chosen": -0.9357597231864929, + "logits/rejected": -0.9307007789611816, + "logps/chosen": -986.9573974609375, + "logps/rejected": -1031.5806884765625, + "loss": 0.1463, + "num_input_tokens_seen": 69258368, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.6343834400177002, + "rewards/margins": 3.1081337928771973, + "rewards/rejected": -3.7425169944763184, + "step": 193 + }, + { + "epoch": 0.7431170696672252, + "grad_norm": 0.5930299758911133, + "learning_rate": 1.6574486084258369e-06, + "logits/chosen": -0.9408407807350159, + "logits/rejected": -0.9381968975067139, + "logps/chosen": -959.538818359375, + "logps/rejected": -957.91455078125, + "loss": 0.1093, + "num_input_tokens_seen": 69617152, + "rewards/accuracies": 0.984375, + "rewards/chosen": -0.29738473892211914, + "rewards/margins": 3.521193027496338, + "rewards/rejected": -3.818577766418457, + "step": 194 + }, + { + "epoch": 0.7469475700263347, + "grad_norm": 0.7145241498947144, + "learning_rate": 1.6111695754660667e-06, + "logits/chosen": -0.9211927056312561, + "logits/rejected": -0.9088835716247559, + "logps/chosen": -979.0453491210938, + "logps/rejected": -1004.5391235351562, + "loss": 0.1382, + "num_input_tokens_seen": 69979072, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.5738773345947266, + "rewards/margins": 3.473905086517334, + "rewards/rejected": -4.047781944274902, + "step": 195 + }, + { + "epoch": 0.7507780703854441, + "grad_norm": 0.5450356006622314, + "learning_rate": 1.5654214226357822e-06, + "logits/chosen": -0.9210422039031982, + "logits/rejected": -0.9054737091064453, + "logps/chosen": -971.6129150390625, + "logps/rejected": -1038.47509765625, + "loss": 0.0859, + "num_input_tokens_seen": 70344768, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6035772562026978, + "rewards/margins": 3.4652934074401855, + "rewards/rejected": -4.068870544433594, + "step": 196 + }, + { + "epoch": 0.7546085707445535, + "grad_norm": 0.8711457252502441, + "learning_rate": 1.5202113166510058e-06, + "logits/chosen": -0.9456660747528076, + "logits/rejected": -0.9417568445205688, + "logps/chosen": -958.7596435546875, + "logps/rejected": -988.8860473632812, + "loss": 0.1651, + "num_input_tokens_seen": 70721984, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.4450293779373169, + "rewards/margins": 3.5368199348449707, + "rewards/rejected": -3.981849193572998, + "step": 197 + }, + { + "epoch": 0.7584390711036629, + "grad_norm": 0.6212217807769775, + "learning_rate": 1.475546339939568e-06, + "logits/chosen": -0.9077344536781311, + "logits/rejected": -0.9016069173812866, + "logps/chosen": -971.069580078125, + "logps/rejected": -1017.0029296875, + "loss": 0.1192, + "num_input_tokens_seen": 71082368, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.444461464881897, + "rewards/margins": 3.318068742752075, + "rewards/rejected": -3.7625300884246826, + "step": 198 + }, + { + "epoch": 0.7622695714627723, + "grad_norm": 0.7105376720428467, + "learning_rate": 1.4314334895316095e-06, + "logits/chosen": -0.9295698404312134, + "logits/rejected": -0.9288057684898376, + "logps/chosen": -941.1664428710938, + "logps/rejected": -970.9924926757812, + "loss": 0.1352, + "num_input_tokens_seen": 71436928, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.2635565996170044, + "rewards/margins": 3.5694832801818848, + "rewards/rejected": -3.8330392837524414, + "step": 199 + }, + { + "epoch": 0.7661000718218818, + "grad_norm": 0.5974374413490295, + "learning_rate": 1.3878796759634544e-06, + "logits/chosen": -0.9344805479049683, + "logits/rejected": -0.9272158145904541, + "logps/chosen": -961.6475830078125, + "logps/rejected": -1006.07373046875, + "loss": 0.1048, + "num_input_tokens_seen": 71801024, + "rewards/accuracies": 0.984375, + "rewards/chosen": -0.2792045474052429, + "rewards/margins": 3.611067771911621, + "rewards/rejected": -3.8902721405029297, + "step": 200 + }, + { + "epoch": 0.7699305721809911, + "grad_norm": 0.8337687849998474, + "learning_rate": 1.3448917221950264e-06, + "logits/chosen": -0.9320505261421204, + "logits/rejected": -0.9255180358886719, + "logps/chosen": -975.5443725585938, + "logps/rejected": -986.9401245117188, + "loss": 0.1484, + "num_input_tokens_seen": 72172800, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.3965162932872772, + "rewards/margins": 3.266500949859619, + "rewards/rejected": -3.6630172729492188, + "step": 201 + }, + { + "epoch": 0.7737610725401005, + "grad_norm": 0.6306331753730774, + "learning_rate": 1.3024763625410025e-06, + "logits/chosen": -0.9231235980987549, + "logits/rejected": -0.9261617660522461, + "logps/chosen": -957.9940185546875, + "logps/rejected": -972.0745239257812, + "loss": 0.1119, + "num_input_tokens_seen": 72536000, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.06987044215202332, + "rewards/margins": 3.476567506790161, + "rewards/rejected": -3.546437978744507, + "step": 202 + }, + { + "epoch": 0.77759157289921, + "grad_norm": 0.7426192164421082, + "learning_rate": 1.2606402416158391e-06, + "logits/chosen": -0.9373898506164551, + "logits/rejected": -0.9224025011062622, + "logps/chosen": -960.5654296875, + "logps/rejected": -995.372802734375, + "loss": 0.1285, + "num_input_tokens_seen": 72900672, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.4293292760848999, + "rewards/margins": 3.4104127883911133, + "rewards/rejected": -3.8397421836853027, + "step": 203 + }, + { + "epoch": 0.7814220732583194, + "grad_norm": 0.7285085916519165, + "learning_rate": 1.2193899132928539e-06, + "logits/chosen": -0.9066146612167358, + "logits/rejected": -0.904312252998352, + "logps/chosen": -925.3270263671875, + "logps/rejected": -928.7410888671875, + "loss": 0.1443, + "num_input_tokens_seen": 73242240, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.38697314262390137, + "rewards/margins": 3.3365767002105713, + "rewards/rejected": -3.7235498428344727, + "step": 204 + }, + { + "epoch": 0.7852525736174287, + "grad_norm": 0.8147975206375122, + "learning_rate": 1.1787318396775188e-06, + "logits/chosen": -0.9232807159423828, + "logits/rejected": -0.9159767627716064, + "logps/chosen": -970.0517578125, + "logps/rejected": -1003.58154296875, + "loss": 0.1508, + "num_input_tokens_seen": 73604288, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.5156584978103638, + "rewards/margins": 3.401538848876953, + "rewards/rejected": -3.9171972274780273, + "step": 205 + }, + { + "epoch": 0.7890830739765382, + "grad_norm": 0.8339166045188904, + "learning_rate": 1.138672390095143e-06, + "logits/chosen": -0.9299338459968567, + "logits/rejected": -0.9230663180351257, + "logps/chosen": -947.5897827148438, + "logps/rejected": -947.0642700195312, + "loss": 0.1296, + "num_input_tokens_seen": 73953984, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.48498770594596863, + "rewards/margins": 3.4591293334960938, + "rewards/rejected": -3.9441165924072266, + "step": 206 + }, + { + "epoch": 0.7929135743356476, + "grad_norm": 0.858613133430481, + "learning_rate": 1.0992178400930753e-06, + "logits/chosen": -0.9461163282394409, + "logits/rejected": -0.9391993284225464, + "logps/chosen": -956.7074584960938, + "logps/rejected": -997.1918334960938, + "loss": 0.1864, + "num_input_tokens_seen": 74322752, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3927125334739685, + "rewards/margins": 3.4954702854156494, + "rewards/rejected": -3.8881826400756836, + "step": 207 + }, + { + "epoch": 0.796744074694757, + "grad_norm": 0.6588866114616394, + "learning_rate": 1.0603743704575992e-06, + "logits/chosen": -0.930090069770813, + "logits/rejected": -0.9162197113037109, + "logps/chosen": -1001.8871459960938, + "logps/rejected": -1020.6495361328125, + "loss": 0.1169, + "num_input_tokens_seen": 74692672, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.3931676745414734, + "rewards/margins": 3.558851957321167, + "rewards/rejected": -3.952019691467285, + "step": 208 + }, + { + "epoch": 0.8005745750538664, + "grad_norm": 0.7126496434211731, + "learning_rate": 1.0221480662456845e-06, + "logits/chosen": -0.9272634983062744, + "logits/rejected": -0.9180313944816589, + "logps/chosen": -969.216064453125, + "logps/rejected": -1001.8155517578125, + "loss": 0.1366, + "num_input_tokens_seen": 75043648, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.5493388175964355, + "rewards/margins": 3.4465904235839844, + "rewards/rejected": -3.99592924118042, + "step": 209 + }, + { + "epoch": 0.8044050754129758, + "grad_norm": 0.7452099323272705, + "learning_rate": 9.845449158317216e-07, + "logits/chosen": -0.9176936149597168, + "logits/rejected": -0.9103116393089294, + "logps/chosen": -956.2598876953125, + "logps/rejected": -977.7944946289062, + "loss": 0.1543, + "num_input_tokens_seen": 75408832, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.30952370166778564, + "rewards/margins": 3.247213363647461, + "rewards/rejected": -3.556736946105957, + "step": 210 + }, + { + "epoch": 0.8082355757720853, + "grad_norm": 0.6459721922874451, + "learning_rate": 9.475708099694125e-07, + "logits/chosen": -0.936363697052002, + "logits/rejected": -0.9266760349273682, + "logps/chosen": -985.375, + "logps/rejected": -1011.9945678710938, + "loss": 0.0999, + "num_input_tokens_seen": 75767744, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.39271217584609985, + "rewards/margins": 3.9395999908447266, + "rewards/rejected": -4.332312107086182, + "step": 211 + }, + { + "epoch": 0.8120660761311946, + "grad_norm": 0.8414193391799927, + "learning_rate": 9.112315408689415e-07, + "logits/chosen": -0.9309604167938232, + "logits/rejected": -0.9217410683631897, + "logps/chosen": -969.8416748046875, + "logps/rejected": -994.174072265625, + "loss": 0.1413, + "num_input_tokens_seen": 76143744, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.356390118598938, + "rewards/margins": 3.5124168395996094, + "rewards/rejected": -3.868807077407837, + "step": 212 + }, + { + "epoch": 0.815896576490304, + "grad_norm": 0.8411555290222168, + "learning_rate": 8.755328012896002e-07, + "logits/chosen": -0.9191011190414429, + "logits/rejected": -0.9185481071472168, + "logps/chosen": -935.0903930664062, + "logps/rejected": -959.4551391601562, + "loss": 0.1615, + "num_input_tokens_seen": 76490112, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.48258739709854126, + "rewards/margins": 3.2218642234802246, + "rewards/rejected": -3.704451560974121, + "step": 213 + }, + { + "epoch": 0.8197270768494135, + "grad_norm": 0.7057923078536987, + "learning_rate": 8.404801836479809e-07, + "logits/chosen": -0.9404339790344238, + "logits/rejected": -0.9307448863983154, + "logps/chosen": -995.869384765625, + "logps/rejected": -1031.501220703125, + "loss": 0.1372, + "num_input_tokens_seen": 76866048, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.6210708618164062, + "rewards/margins": 3.6604011058807373, + "rewards/rejected": -4.281472206115723, + "step": 214 + }, + { + "epoch": 0.8235575772085229, + "grad_norm": 0.8342182636260986, + "learning_rate": 8.060791791418887e-07, + "logits/chosen": -0.9281613230705261, + "logits/rejected": -0.9277277588844299, + "logps/chosen": -973.7565307617188, + "logps/rejected": -990.3409423828125, + "loss": 0.166, + "num_input_tokens_seen": 77235840, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6552642583847046, + "rewards/margins": 3.5112967491149902, + "rewards/rejected": -4.166561126708984, + "step": 215 + }, + { + "epoch": 0.8273880775676322, + "grad_norm": 0.7837120890617371, + "learning_rate": 7.723351768901172e-07, + "logits/chosen": -0.914681613445282, + "logits/rejected": -0.9032266736030579, + "logps/chosen": -945.2279052734375, + "logps/rejected": -964.207275390625, + "loss": 0.146, + "num_input_tokens_seen": 77590592, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.29910939931869507, + "rewards/margins": 3.3871712684631348, + "rewards/rejected": -3.6862807273864746, + "step": 216 + }, + { + "epoch": 0.8312185779267417, + "grad_norm": 0.7295951843261719, + "learning_rate": 7.392534630882092e-07, + "logits/chosen": -0.9165376424789429, + "logits/rejected": -0.9079729318618774, + "logps/chosen": -925.588623046875, + "logps/rejected": -954.1300048828125, + "loss": 0.14, + "num_input_tokens_seen": 77947072, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.44157418608665466, + "rewards/margins": 3.286750316619873, + "rewards/rejected": -3.7283244132995605, + "step": 217 + }, + { + "epoch": 0.8350490782858511, + "grad_norm": 0.6768993139266968, + "learning_rate": 7.06839220180342e-07, + "logits/chosen": -0.9179525375366211, + "logits/rejected": -0.9105085730552673, + "logps/chosen": -977.3087768554688, + "logps/rejected": -1018.0786743164062, + "loss": 0.1216, + "num_input_tokens_seen": 78299264, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.293866366147995, + "rewards/margins": 3.6625845432281494, + "rewards/rejected": -3.9564507007598877, + "step": 218 + }, + { + "epoch": 0.8388795786449605, + "grad_norm": 0.784661054611206, + "learning_rate": 6.750975260474718e-07, + "logits/chosen": -0.9186124205589294, + "logits/rejected": -0.9146217703819275, + "logps/chosen": -947.469482421875, + "logps/rejected": -976.1781005859375, + "loss": 0.1551, + "num_input_tokens_seen": 78658112, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.39997243881225586, + "rewards/margins": 3.3980631828308105, + "rewards/rejected": -3.7980358600616455, + "step": 219 + }, + { + "epoch": 0.8427100790040699, + "grad_norm": 0.638978123664856, + "learning_rate": 6.440333532118503e-07, + "logits/chosen": -0.9157735705375671, + "logits/rejected": -0.9229204654693604, + "logps/chosen": -950.6304931640625, + "logps/rejected": -967.498046875, + "loss": 0.1226, + "num_input_tokens_seen": 79021824, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.30217647552490234, + "rewards/margins": 3.5326499938964844, + "rewards/rejected": -3.8348262310028076, + "step": 220 + }, + { + "epoch": 0.8465405793631793, + "grad_norm": 0.6480262279510498, + "learning_rate": 6.136515680580479e-07, + "logits/chosen": -0.9298775792121887, + "logits/rejected": -0.9236046075820923, + "logps/chosen": -975.0006103515625, + "logps/rejected": -991.0952758789062, + "loss": 0.1023, + "num_input_tokens_seen": 79377856, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.5108473300933838, + "rewards/margins": 3.7519538402557373, + "rewards/rejected": -4.262801170349121, + "step": 221 + }, + { + "epoch": 0.8503710797222888, + "grad_norm": 0.7456729412078857, + "learning_rate": 5.839569300706127e-07, + "logits/chosen": -0.9218869805335999, + "logits/rejected": -0.919642984867096, + "logps/chosen": -975.8746337890625, + "logps/rejected": -1002.1050415039062, + "loss": 0.1229, + "num_input_tokens_seen": 79735616, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.30094680190086365, + "rewards/margins": 3.752605438232422, + "rewards/rejected": -4.053552627563477, + "step": 222 + }, + { + "epoch": 0.8542015800813981, + "grad_norm": 0.7922987341880798, + "learning_rate": 5.549540910884649e-07, + "logits/chosen": -0.9393529891967773, + "logits/rejected": -0.9286071062088013, + "logps/chosen": -982.0252685546875, + "logps/rejected": -1008.0518798828125, + "loss": 0.1372, + "num_input_tokens_seen": 80104704, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.3273351490497589, + "rewards/margins": 3.5788180828094482, + "rewards/rejected": -3.9061532020568848, + "step": 223 + }, + { + "epoch": 0.8580320804405075, + "grad_norm": 0.5692488551139832, + "learning_rate": 5.266475945761562e-07, + "logits/chosen": -0.922243595123291, + "logits/rejected": -0.9213239550590515, + "logps/chosen": -958.7139282226562, + "logps/rejected": -973.3154296875, + "loss": 0.0964, + "num_input_tokens_seen": 80450752, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.13344106078147888, + "rewards/margins": 4.023585796356201, + "rewards/rejected": -4.157027244567871, + "step": 224 + }, + { + "epoch": 0.861862580799617, + "grad_norm": 0.7449589967727661, + "learning_rate": 4.990418749121179e-07, + "logits/chosen": -0.9438382387161255, + "logits/rejected": -0.9378608465194702, + "logps/chosen": -1000.4117431640625, + "logps/rejected": -1010.1104125976562, + "loss": 0.1466, + "num_input_tokens_seen": 80818432, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.33709198236465454, + "rewards/margins": 3.2897238731384277, + "rewards/rejected": -3.6268157958984375, + "step": 225 + }, + { + "epoch": 0.8656930811587263, + "grad_norm": 0.6118392944335938, + "learning_rate": 4.721412566939804e-07, + "logits/chosen": -0.9306925535202026, + "logits/rejected": -0.9116442203521729, + "logps/chosen": -923.215576171875, + "logps/rejected": -981.7252197265625, + "loss": 0.1024, + "num_input_tokens_seen": 81175296, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.4298011362552643, + "rewards/margins": 3.7008438110351562, + "rewards/rejected": -4.130644798278809, + "step": 226 + }, + { + "epoch": 0.8695235815178358, + "grad_norm": 0.645531415939331, + "learning_rate": 4.4594995406110785e-07, + "logits/chosen": -0.9165564775466919, + "logits/rejected": -0.907341480255127, + "logps/chosen": -923.3377685546875, + "logps/rejected": -941.5916748046875, + "loss": 0.1219, + "num_input_tokens_seen": 81528576, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.3308335542678833, + "rewards/margins": 3.751887559890747, + "rewards/rejected": -4.082720756530762, + "step": 227 + }, + { + "epoch": 0.8733540818769452, + "grad_norm": 0.7683370113372803, + "learning_rate": 4.2047207003442003e-07, + "logits/chosen": -0.9419224262237549, + "logits/rejected": -0.9364947080612183, + "logps/chosen": -963.0107421875, + "logps/rejected": -1002.9989013671875, + "loss": 0.1465, + "num_input_tokens_seen": 81893888, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.302141010761261, + "rewards/margins": 3.5640974044799805, + "rewards/rejected": -3.8662383556365967, + "step": 228 + }, + { + "epoch": 0.8771845822360546, + "grad_norm": 0.7591111660003662, + "learning_rate": 3.957115958736374e-07, + "logits/chosen": -0.9183859825134277, + "logits/rejected": -0.9088101983070374, + "logps/chosen": -898.6732177734375, + "logps/rejected": -930.2683715820312, + "loss": 0.1393, + "num_input_tokens_seen": 82220416, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.27679839730262756, + "rewards/margins": 3.477238655090332, + "rewards/rejected": -3.7540369033813477, + "step": 229 + }, + { + "epoch": 0.881015082595164, + "grad_norm": 0.6726076006889343, + "learning_rate": 3.7167241045202474e-07, + "logits/chosen": -0.9320576786994934, + "logits/rejected": -0.9238025546073914, + "logps/chosen": -984.03955078125, + "logps/rejected": -1007.7301025390625, + "loss": 0.1189, + "num_input_tokens_seen": 82582784, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.3495268225669861, + "rewards/margins": 3.904726982116699, + "rewards/rejected": -4.254254341125488, + "step": 230 + }, + { + "epoch": 0.8848455829542734, + "grad_norm": 0.647498369216919, + "learning_rate": 3.483582796487395e-07, + "logits/chosen": -0.9176431894302368, + "logits/rejected": -0.9147059917449951, + "logps/chosen": -968.1280517578125, + "logps/rejected": -966.5975952148438, + "loss": 0.1095, + "num_input_tokens_seen": 82945856, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.3321223258972168, + "rewards/margins": 3.5463643074035645, + "rewards/rejected": -3.8784866333007812, + "step": 231 + }, + { + "epoch": 0.8886760833133828, + "grad_norm": 0.6471075415611267, + "learning_rate": 3.257728557588902e-07, + "logits/chosen": -0.938944399356842, + "logits/rejected": -0.9266493320465088, + "logps/chosen": -969.0576171875, + "logps/rejected": -989.8577880859375, + "loss": 0.1139, + "num_input_tokens_seen": 83325760, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.43996739387512207, + "rewards/margins": 3.5021231174468994, + "rewards/rejected": -3.9420905113220215, + "step": 232 + }, + { + "epoch": 0.8925065836724922, + "grad_norm": 0.5224552154541016, + "learning_rate": 3.039196769213787e-07, + "logits/chosen": -0.9303513765335083, + "logits/rejected": -0.9171960949897766, + "logps/chosen": -942.271728515625, + "logps/rejected": -985.5810546875, + "loss": 0.0973, + "num_input_tokens_seen": 83684032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3316795825958252, + "rewards/margins": 3.5600767135620117, + "rewards/rejected": -3.891756296157837, + "step": 233 + }, + { + "epoch": 0.8963370840316016, + "grad_norm": 0.7001847624778748, + "learning_rate": 2.828021665646341e-07, + "logits/chosen": -0.9517209529876709, + "logits/rejected": -0.9410845041275024, + "logps/chosen": -1001.434814453125, + "logps/rejected": -1056.3505859375, + "loss": 0.1432, + "num_input_tokens_seen": 84058624, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.4396364390850067, + "rewards/margins": 3.692404270172119, + "rewards/rejected": -4.132040500640869, + "step": 234 + }, + { + "epoch": 0.900167584390711, + "grad_norm": 0.842387855052948, + "learning_rate": 2.6242363287030617e-07, + "logits/chosen": -0.9320073127746582, + "logits/rejected": -0.9291372299194336, + "logps/chosen": -937.1383056640625, + "logps/rejected": -966.5003051757812, + "loss": 0.1677, + "num_input_tokens_seen": 84418816, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.538489818572998, + "rewards/margins": 3.398437023162842, + "rewards/rejected": -3.936927080154419, + "step": 235 + }, + { + "epoch": 0.9039980847498205, + "grad_norm": 1.006986141204834, + "learning_rate": 2.4278726825502696e-07, + "logits/chosen": -0.9529685974121094, + "logits/rejected": -0.9434100985527039, + "logps/chosen": -960.7099609375, + "logps/rejected": -977.6849365234375, + "loss": 0.1739, + "num_input_tokens_seen": 84791232, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.5791181325912476, + "rewards/margins": 3.5598771572113037, + "rewards/rejected": -4.138995170593262, + "step": 236 + }, + { + "epoch": 0.9078285851089298, + "grad_norm": 0.7588669657707214, + "learning_rate": 2.2389614887029564e-07, + "logits/chosen": -0.9361369609832764, + "logits/rejected": -0.9245843291282654, + "logps/chosen": -950.8031616210938, + "logps/rejected": -990.46923828125, + "loss": 0.1351, + "num_input_tokens_seen": 85148544, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.31227266788482666, + "rewards/margins": 3.419215202331543, + "rewards/rejected": -3.73148775100708, + "step": 237 + }, + { + "epoch": 0.9116590854680393, + "grad_norm": 0.8685732483863831, + "learning_rate": 2.0575323412058036e-07, + "logits/chosen": -0.925530731678009, + "logits/rejected": -0.9173595905303955, + "logps/chosen": -980.1353759765625, + "logps/rejected": -1027.097900390625, + "loss": 0.166, + "num_input_tokens_seen": 85514624, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.2911660373210907, + "rewards/margins": 3.2417550086975098, + "rewards/rejected": -3.532921075820923, + "step": 238 + }, + { + "epoch": 0.9154895858271487, + "grad_norm": 0.7738699316978455, + "learning_rate": 1.8836136619971468e-07, + "logits/chosen": -0.9151325225830078, + "logits/rejected": -0.9104874134063721, + "logps/chosen": -981.421142578125, + "logps/rejected": -1018.5482177734375, + "loss": 0.1315, + "num_input_tokens_seen": 85866560, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.41056591272354126, + "rewards/margins": 3.523707389831543, + "rewards/rejected": -3.9342732429504395, + "step": 239 + }, + { + "epoch": 0.919320086186258, + "grad_norm": 0.6902733445167542, + "learning_rate": 1.7172326964564777e-07, + "logits/chosen": -0.9358813762664795, + "logits/rejected": -0.9225896596908569, + "logps/chosen": -932.2445068359375, + "logps/rejected": -980.0001220703125, + "loss": 0.1289, + "num_input_tokens_seen": 86227584, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.30342864990234375, + "rewards/margins": 3.5703718662261963, + "rewards/rejected": -3.87380051612854, + "step": 240 + }, + { + "epoch": 0.9231505865453675, + "grad_norm": 0.9467810392379761, + "learning_rate": 1.5584155091362907e-07, + "logits/chosen": -0.9454147219657898, + "logits/rejected": -0.9401627779006958, + "logps/chosen": -991.7401733398438, + "logps/rejected": -1001.8673095703125, + "loss": 0.1732, + "num_input_tokens_seen": 86599360, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.35543906688690186, + "rewards/margins": 3.2525267601013184, + "rewards/rejected": -3.6079659461975098, + "step": 241 + }, + { + "epoch": 0.9269810869044769, + "grad_norm": 0.6303733587265015, + "learning_rate": 1.4071869796789427e-07, + "logits/chosen": -0.9176468253135681, + "logits/rejected": -0.9086652398109436, + "logps/chosen": -941.583740234375, + "logps/rejected": -978.9615478515625, + "loss": 0.1152, + "num_input_tokens_seen": 86962112, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -0.21579024195671082, + "rewards/margins": 3.5956597328186035, + "rewards/rejected": -3.811450242996216, + "step": 242 + }, + { + "epoch": 0.9308115872635863, + "grad_norm": 0.7258052229881287, + "learning_rate": 1.263570798919106e-07, + "logits/chosen": -0.9400737285614014, + "logits/rejected": -0.9357911348342896, + "logps/chosen": -982.98388671875, + "logps/rejected": -1013.065673828125, + "loss": 0.1161, + "num_input_tokens_seen": 87320064, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.15125450491905212, + "rewards/margins": 3.9507298469543457, + "rewards/rejected": -4.101984024047852, + "step": 243 + }, + { + "epoch": 0.9346420876226957, + "grad_norm": 0.7220345139503479, + "learning_rate": 1.1275894651724517e-07, + "logits/chosen": -0.9216594696044922, + "logits/rejected": -0.9175605773925781, + "logps/chosen": -933.1033935546875, + "logps/rejected": -956.485107421875, + "loss": 0.1322, + "num_input_tokens_seen": 87670976, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.4048665761947632, + "rewards/margins": 3.5103683471679688, + "rewards/rejected": -3.9152348041534424, + "step": 244 + }, + { + "epoch": 0.9384725879818051, + "grad_norm": 0.7088446617126465, + "learning_rate": 9.992642807111486e-08, + "logits/chosen": -0.9138802289962769, + "logits/rejected": -0.9027386903762817, + "logps/chosen": -886.5745239257812, + "logps/rejected": -926.411865234375, + "loss": 0.1165, + "num_input_tokens_seen": 88002176, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.3084491193294525, + "rewards/margins": 3.5038204193115234, + "rewards/rejected": -3.812269687652588, + "step": 245 + }, + { + "epoch": 0.9423030883409146, + "grad_norm": 0.46804699301719666, + "learning_rate": 8.78615348426759e-08, + "logits/chosen": -0.9159804582595825, + "logits/rejected": -0.9086206555366516, + "logps/chosen": -922.8563232421875, + "logps/rejected": -958.032470703125, + "loss": 0.0834, + "num_input_tokens_seen": 88350976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.148274227976799, + "rewards/margins": 3.6523170471191406, + "rewards/rejected": -3.800590991973877, + "step": 246 + }, + { + "epoch": 0.946133588700024, + "grad_norm": 0.617975115776062, + "learning_rate": 7.656615686809976e-08, + "logits/chosen": -0.938027024269104, + "logits/rejected": -0.9268268346786499, + "logps/chosen": -947.643798828125, + "logps/rejected": -984.83251953125, + "loss": 0.1083, + "num_input_tokens_seen": 88715456, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.21512371301651, + "rewards/margins": 3.836833953857422, + "rewards/rejected": -4.051958084106445, + "step": 247 + }, + { + "epoch": 0.9499640890591333, + "grad_norm": 0.5795360803604126, + "learning_rate": 6.604206363448662e-08, + "logits/chosen": -0.928307294845581, + "logits/rejected": -0.9218379259109497, + "logps/chosen": -976.09765625, + "logps/rejected": -1003.6217041015625, + "loss": 0.1025, + "num_input_tokens_seen": 89086656, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.20297859609127045, + "rewards/margins": 3.886709213256836, + "rewards/rejected": -4.089688301086426, + "step": 248 + }, + { + "epoch": 0.9537945894182428, + "grad_norm": 0.6491379141807556, + "learning_rate": 5.6290903802665444e-08, + "logits/chosen": -0.9402973651885986, + "logits/rejected": -0.9266122579574585, + "logps/chosen": -1005.4326171875, + "logps/rejected": -1036.166259765625, + "loss": 0.1172, + "num_input_tokens_seen": 89469120, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.2067689150571823, + "rewards/margins": 3.535766124725342, + "rewards/rejected": -3.74253511428833, + "step": 249 + }, + { + "epoch": 0.9576250897773522, + "grad_norm": 0.879736602306366, + "learning_rate": 4.7314204948923356e-08, + "logits/chosen": -0.9178593158721924, + "logits/rejected": -0.9069898128509521, + "logps/chosen": -947.748046875, + "logps/rejected": -978.2379150390625, + "loss": 0.178, + "num_input_tokens_seen": 89832832, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.2660120129585266, + "rewards/margins": 3.206664562225342, + "rewards/rejected": -3.4726767539978027, + "step": 250 + }, + { + "epoch": 0.9614555901364615, + "grad_norm": 0.8699985146522522, + "learning_rate": 3.911337332569876e-08, + "logits/chosen": -0.9140677452087402, + "logits/rejected": -0.9121595025062561, + "logps/chosen": -942.36962890625, + "logps/rejected": -961.3214721679688, + "loss": 0.1803, + "num_input_tokens_seen": 90184960, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.2849889397621155, + "rewards/margins": 3.405879020690918, + "rewards/rejected": -3.6908679008483887, + "step": 251 + }, + { + "epoch": 0.965286090495571, + "grad_norm": 0.7312610149383545, + "learning_rate": 3.168969364128527e-08, + "logits/chosen": -0.9094095230102539, + "logits/rejected": -0.90659099817276, + "logps/chosen": -923.608642578125, + "logps/rejected": -948.98828125, + "loss": 0.122, + "num_input_tokens_seen": 90540608, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.33560073375701904, + "rewards/margins": 3.53710675239563, + "rewards/rejected": -3.8727073669433594, + "step": 252 + }, + { + "epoch": 0.9691165908546804, + "grad_norm": 0.7127865552902222, + "learning_rate": 2.5044328858576105e-08, + "logits/chosen": -0.9293651580810547, + "logits/rejected": -0.9224931001663208, + "logps/chosen": -964.11083984375, + "logps/rejected": -999.3698120117188, + "loss": 0.136, + "num_input_tokens_seen": 90903552, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.1544954925775528, + "rewards/margins": 3.5648012161254883, + "rewards/rejected": -3.719296932220459, + "step": 253 + }, + { + "epoch": 0.9729470912137897, + "grad_norm": 0.652675986289978, + "learning_rate": 1.917832001287645e-08, + "logits/chosen": -0.9135050177574158, + "logits/rejected": -0.8944007158279419, + "logps/chosen": -949.9973754882812, + "logps/rejected": -979.066162109375, + "loss": 0.1234, + "num_input_tokens_seen": 91277824, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.3631123900413513, + "rewards/margins": 3.4665122032165527, + "rewards/rejected": -3.8296244144439697, + "step": 254 + }, + { + "epoch": 0.9767775915728992, + "grad_norm": 0.6781688332557678, + "learning_rate": 1.4092586048820578e-08, + "logits/chosen": -0.9378706216812134, + "logits/rejected": -0.928849458694458, + "logps/chosen": -954.780517578125, + "logps/rejected": -974.5152587890625, + "loss": 0.128, + "num_input_tokens_seen": 91637568, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.2064896821975708, + "rewards/margins": 3.4366228580474854, + "rewards/rejected": -3.6431126594543457, + "step": 255 + }, + { + "epoch": 0.9806080919320086, + "grad_norm": 0.5987056493759155, + "learning_rate": 9.787923676414235e-09, + "logits/chosen": -0.9221091270446777, + "logits/rejected": -0.9193128943443298, + "logps/chosen": -919.7487182617188, + "logps/rejected": -933.0615844726562, + "loss": 0.1089, + "num_input_tokens_seen": 91986624, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.3668377101421356, + "rewards/margins": 3.6270933151245117, + "rewards/rejected": -3.9939308166503906, + "step": 256 + }, + { + "epoch": 0.9844385922911181, + "grad_norm": 0.7575734853744507, + "learning_rate": 6.265007246223365e-09, + "logits/chosen": -0.9319050908088684, + "logits/rejected": -0.9213892221450806, + "logps/chosen": -973.8796997070312, + "logps/rejected": -994.4847412109375, + "loss": 0.1555, + "num_input_tokens_seen": 92359232, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.2888575792312622, + "rewards/margins": 3.2450671195983887, + "rewards/rejected": -3.5339248180389404, + "step": 257 + }, + { + "epoch": 0.9882690926502274, + "grad_norm": 0.7093362212181091, + "learning_rate": 3.524388643736387e-09, + "logits/chosen": -0.9140563607215881, + "logits/rejected": -0.9141513109207153, + "logps/chosen": -967.251220703125, + "logps/rejected": -965.6304931640625, + "loss": 0.1283, + "num_input_tokens_seen": 92725888, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.19560253620147705, + "rewards/margins": 3.3071861267089844, + "rewards/rejected": -3.502788782119751, + "step": 258 + }, + { + "epoch": 0.9920995930093368, + "grad_norm": 0.9159227609634399, + "learning_rate": 1.566497202904471e-09, + "logits/chosen": -0.9180241823196411, + "logits/rejected": -0.9152236580848694, + "logps/chosen": -989.4107666015625, + "logps/rejected": -1018.9290161132812, + "loss": 0.1709, + "num_input_tokens_seen": 93092224, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.052332304418087006, + "rewards/margins": 3.409949779510498, + "rewards/rejected": -3.4622819423675537, + "step": 259 + }, + { + "epoch": 0.9959300933684463, + "grad_norm": 0.6988145709037781, + "learning_rate": 3.916396388869981e-10, + "logits/chosen": -0.9381681680679321, + "logits/rejected": -0.9322620630264282, + "logps/chosen": -956.5706787109375, + "logps/rejected": -1003.7496948242188, + "loss": 0.1259, + "num_input_tokens_seen": 93450688, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.4282854199409485, + "rewards/margins": 3.464014768600464, + "rewards/rejected": -3.8923001289367676, + "step": 260 + }, + { + "epoch": 0.9997605937275557, + "grad_norm": 0.7452526688575745, + "learning_rate": 0.0, + "logits/chosen": -0.9185935258865356, + "logits/rejected": -0.9104200601577759, + "logps/chosen": -959.4465942382812, + "logps/rejected": -990.2052001953125, + "loss": 0.1391, + "num_input_tokens_seen": 93801152, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.6293328404426575, + "rewards/margins": 3.542891025543213, + "rewards/rejected": -4.172224044799805, + "step": 261 + }, + { + "epoch": 0.9997605937275557, + "num_input_tokens_seen": 93801152, + "step": 261, + "total_flos": 4.318246812239528e+18, + "train_loss": 0.26227811141603297, + "train_runtime": 22900.2703, + "train_samples_per_second": 1.459, + "train_steps_per_second": 0.011 + } + ], + "logging_steps": 1.0, + "max_steps": 261, + "num_input_tokens_seen": 93801152, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.318246812239528e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}