diff --git "a/Qwen2.5-7b-Med-REFL-LoraAdapter/trainer_state.json" "b/Qwen2.5-7b-Med-REFL-LoraAdapter/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Qwen2.5-7b-Med-REFL-LoraAdapter/trainer_state.json" @@ -0,0 +1,8396 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9959300933684463, + "eval_steps": 500, + "global_step": 522, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0038305003591094086, + "grad_norm": 0.7754550099413912, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -0.004948570393025875, + "logits/rejected": -0.009020027704536915, + "logps/chosen": -1029.60107421875, + "logps/rejected": -1046.81640625, + "loss": 0.6931, + "num_input_tokens_seen": 326624, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.007661000718218817, + "grad_norm": 0.7216497908201672, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": 0.0067935725674033165, + "logits/rejected": -0.0245077982544899, + "logps/chosen": -1023.5374755859375, + "logps/rejected": -1017.0289306640625, + "loss": 0.6931, + "num_input_tokens_seen": 649056, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.011491501077328227, + "grad_norm": 0.7190610035227727, + "learning_rate": 6e-06, + "logits/chosen": 0.02191464975476265, + "logits/rejected": 0.004422793164849281, + "logps/chosen": -1038.782470703125, + "logps/rejected": -1025.2938232421875, + "loss": 0.6978, + "num_input_tokens_seen": 978912, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.032175540924072266, + "rewards/margins": -0.003839826211333275, + "rewards/rejected": 0.03601536899805069, + "step": 3 + }, + { + "epoch": 0.015322001436437634, + "grad_norm": 0.7316574645788874, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": 0.020559391006827354, + "logits/rejected": -0.02043076977133751, + "logps/chosen": -1022.3394775390625, + "logps/rejected": -1020.7935791015625, + "loss": 0.6985, + "num_input_tokens_seen": 1306720, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.019536186009645462, + "rewards/margins": -0.0024862526915967464, + "rewards/rejected": 0.022022439166903496, + "step": 4 + }, + { + "epoch": 0.019152501795547044, + "grad_norm": 0.683004061426711, + "learning_rate": 1e-05, + "logits/chosen": -0.003460688516497612, + "logits/rejected": -0.019346771761775017, + "logps/chosen": -1067.627685546875, + "logps/rejected": -1077.1904296875, + "loss": 0.6851, + "num_input_tokens_seen": 1635584, + "rewards/accuracies": 0.5546875, + "rewards/chosen": 0.005624030251055956, + "rewards/margins": 0.022820446640253067, + "rewards/rejected": -0.0171964168548584, + "step": 5 + }, + { + "epoch": 0.022983002154656453, + "grad_norm": 0.7140073483183785, + "learning_rate": 1.2e-05, + "logits/chosen": -0.054150402545928955, + "logits/rejected": -0.04285615682601929, + "logps/chosen": -1011.5696411132812, + "logps/rejected": -1008.5350341796875, + "loss": 0.707, + "num_input_tokens_seen": 1952000, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.03760785982012749, + "rewards/margins": -0.021029017865657806, + "rewards/rejected": -0.01657884195446968, + "step": 6 + }, + { + "epoch": 0.02681350251376586, + "grad_norm": 0.7146960147477691, + "learning_rate": 1.4e-05, + "logits/chosen": -0.03695409372448921, + "logits/rejected": -0.04261859506368637, + "logps/chosen": -1019.7833251953125, + "logps/rejected": -1015.2184448242188, + "loss": 0.7024, + "num_input_tokens_seen": 2275136, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.014542412012815475, + "rewards/margins": -0.012655187398195267, + "rewards/rejected": 0.027197599411010742, + "step": 7 + }, + { + "epoch": 0.03064400287287527, + "grad_norm": 0.7315787912318974, + "learning_rate": 1.6000000000000003e-05, + "logits/chosen": 0.028178848326206207, + "logits/rejected": 0.013220325112342834, + "logps/chosen": -1047.8155517578125, + "logps/rejected": -1042.162353515625, + "loss": 0.6983, + "num_input_tokens_seen": 2606880, + "rewards/accuracies": 0.5234375, + "rewards/chosen": -0.026167582720518112, + "rewards/margins": -0.0054130288772284985, + "rewards/rejected": -0.020754551514983177, + "step": 8 + }, + { + "epoch": 0.03447450323198468, + "grad_norm": 0.7146546587838524, + "learning_rate": 1.8e-05, + "logits/chosen": -0.038511309772729874, + "logits/rejected": -0.07079754769802094, + "logps/chosen": -1093.1131591796875, + "logps/rejected": -1072.4642333984375, + "loss": 0.6874, + "num_input_tokens_seen": 2947296, + "rewards/accuracies": 0.5390625, + "rewards/chosen": 0.0270112045109272, + "rewards/margins": 0.017521381378173828, + "rewards/rejected": 0.009489820338785648, + "step": 9 + }, + { + "epoch": 0.03830500359109409, + "grad_norm": 0.7535276756443735, + "learning_rate": 2e-05, + "logits/chosen": 0.048275869339704514, + "logits/rejected": 0.023540839552879333, + "logps/chosen": -1073.13330078125, + "logps/rejected": -1037.5401611328125, + "loss": 0.6993, + "num_input_tokens_seen": 3286528, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.025215817615389824, + "rewards/margins": -0.006738638039678335, + "rewards/rejected": 0.031954456120729446, + "step": 10 + }, + { + "epoch": 0.0421355039502035, + "grad_norm": 0.7563401252178212, + "learning_rate": 1.9999811752826012e-05, + "logits/chosen": 0.03654933348298073, + "logits/rejected": -0.0053311921656131744, + "logps/chosen": -1083.346435546875, + "logps/rejected": -1068.189208984375, + "loss": 0.6849, + "num_input_tokens_seen": 3630304, + "rewards/accuracies": 0.5546875, + "rewards/chosen": -0.025775529444217682, + "rewards/margins": 0.022771168500185013, + "rewards/rejected": -0.048546694219112396, + "step": 11 + }, + { + "epoch": 0.045966004309312906, + "grad_norm": 0.7818212967323699, + "learning_rate": 1.999924701839145e-05, + "logits/chosen": -0.013700390234589577, + "logits/rejected": -0.041521020233631134, + "logps/chosen": -1069.7159423828125, + "logps/rejected": -1053.1302490234375, + "loss": 0.6892, + "num_input_tokens_seen": 3958208, + "rewards/accuracies": 0.5703125, + "rewards/chosen": 0.03000187873840332, + "rewards/margins": 0.014318801462650299, + "rewards/rejected": 0.01568308100104332, + "step": 12 + }, + { + "epoch": 0.049796504668422316, + "grad_norm": 0.7930062566004271, + "learning_rate": 1.9998305817958235e-05, + "logits/chosen": -0.05341847613453865, + "logits/rejected": -0.06503578275442123, + "logps/chosen": -1076.9273681640625, + "logps/rejected": -1032.4300537109375, + "loss": 0.6791, + "num_input_tokens_seen": 4295584, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.009268571622669697, + "rewards/margins": 0.03455815464258194, + "rewards/rejected": -0.025289583951234818, + "step": 13 + }, + { + "epoch": 0.05362700502753172, + "grad_norm": 0.8406676770464243, + "learning_rate": 1.9996988186962044e-05, + "logits/chosen": -0.05680212751030922, + "logits/rejected": -0.060767412185668945, + "logps/chosen": -1082.224609375, + "logps/rejected": -1043.6207275390625, + "loss": 0.6689, + "num_input_tokens_seen": 4627968, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.14103689789772034, + "rewards/margins": 0.05704689025878906, + "rewards/rejected": 0.08399000763893127, + "step": 14 + }, + { + "epoch": 0.05745750538664113, + "grad_norm": 0.7837222100675245, + "learning_rate": 1.9995294175010934e-05, + "logits/chosen": -0.07101021707057953, + "logits/rejected": -0.07319741696119308, + "logps/chosen": -1036.0831298828125, + "logps/rejected": -1029.1241455078125, + "loss": 0.6516, + "num_input_tokens_seen": 4941696, + "rewards/accuracies": 0.6796875, + "rewards/chosen": 0.19987744092941284, + "rewards/margins": 0.0951383113861084, + "rewards/rejected": 0.10473913699388504, + "step": 15 + }, + { + "epoch": 0.06128800574575054, + "grad_norm": 0.793308186438828, + "learning_rate": 1.9993223845883496e-05, + "logits/chosen": -0.06108350306749344, + "logits/rejected": -0.08611281216144562, + "logps/chosen": -1002.2987670898438, + "logps/rejected": -1018.6817626953125, + "loss": 0.6677, + "num_input_tokens_seen": 5268000, + "rewards/accuracies": 0.578125, + "rewards/chosen": 0.14876127243041992, + "rewards/margins": 0.062479548156261444, + "rewards/rejected": 0.08628173172473907, + "step": 16 + }, + { + "epoch": 0.06511850610485995, + "grad_norm": 0.7626078990200041, + "learning_rate": 1.9990777277526458e-05, + "logits/chosen": 0.005296383053064346, + "logits/rejected": -0.031334955245256424, + "logps/chosen": -1056.2413330078125, + "logps/rejected": -1028.4158935546875, + "loss": 0.6288, + "num_input_tokens_seen": 5595904, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3595597445964813, + "rewards/margins": 0.14869971573352814, + "rewards/rejected": 0.210860013961792, + "step": 17 + }, + { + "epoch": 0.06894900646396936, + "grad_norm": 0.7842860816456734, + "learning_rate": 1.9987954562051724e-05, + "logits/chosen": 0.08855888247489929, + "logits/rejected": 0.06388720124959946, + "logps/chosen": -1085.090576171875, + "logps/rejected": -1097.7294921875, + "loss": 0.6231, + "num_input_tokens_seen": 5941312, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.4658185541629791, + "rewards/margins": 0.16621243953704834, + "rewards/rejected": 0.2996061146259308, + "step": 18 + }, + { + "epoch": 0.07277950682307877, + "grad_norm": 0.7631925070604849, + "learning_rate": 1.9984755805732948e-05, + "logits/chosen": 0.046462349593639374, + "logits/rejected": 0.037998467683792114, + "logps/chosen": -1038.4248046875, + "logps/rejected": -1042.4051513671875, + "loss": 0.6199, + "num_input_tokens_seen": 6267424, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 0.4370581805706024, + "rewards/margins": 0.17269258201122284, + "rewards/rejected": 0.2643655836582184, + "step": 19 + }, + { + "epoch": 0.07661000718218818, + "grad_norm": 0.7532869450296331, + "learning_rate": 1.998118112900149e-05, + "logits/chosen": -0.004614699631929398, + "logits/rejected": -0.029905591160058975, + "logps/chosen": -1028.544921875, + "logps/rejected": -1019.021240234375, + "loss": 0.6015, + "num_input_tokens_seen": 6590368, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.5869898200035095, + "rewards/margins": 0.22056323289871216, + "rewards/rejected": 0.36642658710479736, + "step": 20 + }, + { + "epoch": 0.08044050754129758, + "grad_norm": 0.8148890973194257, + "learning_rate": 1.997723066644192e-05, + "logits/chosen": -0.013207060284912586, + "logits/rejected": -0.03132867068052292, + "logps/chosen": -1082.7481689453125, + "logps/rejected": -1086.310546875, + "loss": 0.6003, + "num_input_tokens_seen": 6930112, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.6782715916633606, + "rewards/margins": 0.2352655827999115, + "rewards/rejected": 0.4430060386657715, + "step": 21 + }, + { + "epoch": 0.084271007900407, + "grad_norm": 0.7242905172658506, + "learning_rate": 1.9972904566786903e-05, + "logits/chosen": -0.013522692024707794, + "logits/rejected": -0.03546103462576866, + "logps/chosen": -1063.7215576171875, + "logps/rejected": -1068.9742431640625, + "loss": 0.5536, + "num_input_tokens_seen": 7267360, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.803547203540802, + "rewards/margins": 0.3516206741333008, + "rewards/rejected": 0.4519265294075012, + "step": 22 + }, + { + "epoch": 0.0881015082595164, + "grad_norm": 0.7111199077888346, + "learning_rate": 1.9968202992911657e-05, + "logits/chosen": -0.040007129311561584, + "logits/rejected": -0.055904414504766464, + "logps/chosen": -1046.552978515625, + "logps/rejected": -1016.3299560546875, + "loss": 0.5813, + "num_input_tokens_seen": 7599200, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.8247166872024536, + "rewards/margins": 0.29593774676322937, + "rewards/rejected": 0.5287790298461914, + "step": 23 + }, + { + "epoch": 0.09193200861862581, + "grad_norm": 0.7427934718384868, + "learning_rate": 1.996312612182778e-05, + "logits/chosen": -0.01739281788468361, + "logits/rejected": -0.03726988285779953, + "logps/chosen": -1062.774658203125, + "logps/rejected": -1057.2645263671875, + "loss": 0.5351, + "num_input_tokens_seen": 7927200, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.9207038879394531, + "rewards/margins": 0.43137192726135254, + "rewards/rejected": 0.4893319010734558, + "step": 24 + }, + { + "epoch": 0.09576250897773522, + "grad_norm": 0.72148779145702, + "learning_rate": 1.99576741446766e-05, + "logits/chosen": 0.011154616251587868, + "logits/rejected": 0.010420276783406734, + "logps/chosen": -1057.216064453125, + "logps/rejected": -1062.7374267578125, + "loss": 0.5131, + "num_input_tokens_seen": 8260800, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0708329677581787, + "rewards/margins": 0.5048216581344604, + "rewards/rejected": 0.5660112500190735, + "step": 25 + }, + { + "epoch": 0.09959300933684463, + "grad_norm": 0.7429082834391346, + "learning_rate": 1.995184726672197e-05, + "logits/chosen": -0.009670288302004337, + "logits/rejected": -0.00827145203948021, + "logps/chosen": -1038.47998046875, + "logps/rejected": -1051.02783203125, + "loss": 0.5195, + "num_input_tokens_seen": 8593440, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.1231303215026855, + "rewards/margins": 0.5032806396484375, + "rewards/rejected": 0.6198496222496033, + "step": 26 + }, + { + "epoch": 0.10342350969595403, + "grad_norm": 0.749033472641741, + "learning_rate": 1.9945645707342555e-05, + "logits/chosen": -0.058443039655685425, + "logits/rejected": -0.06339199841022491, + "logps/chosen": -1012.507568359375, + "logps/rejected": -1008.6725463867188, + "loss": 0.5156, + "num_input_tokens_seen": 8913024, + "rewards/accuracies": 0.703125, + "rewards/chosen": 1.18907630443573, + "rewards/margins": 0.5712540745735168, + "rewards/rejected": 0.6178222298622131, + "step": 27 + }, + { + "epoch": 0.10725401005506344, + "grad_norm": 0.6961328109607642, + "learning_rate": 1.9939069700023564e-05, + "logits/chosen": 0.015964325517416, + "logits/rejected": 0.00434470921754837, + "logps/chosen": -1028.740234375, + "logps/rejected": -1022.506103515625, + "loss": 0.4667, + "num_input_tokens_seen": 9242464, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2921875715255737, + "rewards/margins": 0.7302837371826172, + "rewards/rejected": 0.561903715133667, + "step": 28 + }, + { + "epoch": 0.11108451041417285, + "grad_norm": 0.6934109275433618, + "learning_rate": 1.9932119492347947e-05, + "logits/chosen": -0.0235883928835392, + "logits/rejected": -0.016572438180446625, + "logps/chosen": -1026.03662109375, + "logps/rejected": -1035.074951171875, + "loss": 0.4858, + "num_input_tokens_seen": 9570976, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.3176655769348145, + "rewards/margins": 0.6839665174484253, + "rewards/rejected": 0.6336989402770996, + "step": 29 + }, + { + "epoch": 0.11491501077328226, + "grad_norm": 0.729583755943264, + "learning_rate": 1.9924795345987103e-05, + "logits/chosen": -0.03625601902604103, + "logits/rejected": -0.0739939734339714, + "logps/chosen": -1058.5277099609375, + "logps/rejected": -1052.7415771484375, + "loss": 0.4546, + "num_input_tokens_seen": 9907360, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.3329441547393799, + "rewards/margins": 0.7888250350952148, + "rewards/rejected": 0.544119119644165, + "step": 30 + }, + { + "epoch": 0.11874551113239167, + "grad_norm": 0.7706357702768899, + "learning_rate": 1.9917097536690997e-05, + "logits/chosen": -0.09417334944009781, + "logits/rejected": -0.12287972122430801, + "logps/chosen": -1041.641845703125, + "logps/rejected": -1049.744384765625, + "loss": 0.426, + "num_input_tokens_seen": 10237760, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.2745342254638672, + "rewards/margins": 0.9819475412368774, + "rewards/rejected": 0.29258662462234497, + "step": 31 + }, + { + "epoch": 0.12257601149150107, + "grad_norm": 0.6563944269748782, + "learning_rate": 1.99090263542778e-05, + "logits/chosen": -0.027718650177121162, + "logits/rejected": -0.020578352734446526, + "logps/chosen": -1028.19775390625, + "logps/rejected": -1030.349853515625, + "loss": 0.3774, + "num_input_tokens_seen": 10553152, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.402183175086975, + "rewards/margins": 1.1027390956878662, + "rewards/rejected": 0.2994440197944641, + "step": 32 + }, + { + "epoch": 0.1264065118506105, + "grad_norm": 0.7784182933390023, + "learning_rate": 1.9900582102622973e-05, + "logits/chosen": -0.03451825678348541, + "logits/rejected": -0.07660885155200958, + "logps/chosen": -1078.9853515625, + "logps/rejected": -1079.59326171875, + "loss": 0.4336, + "num_input_tokens_seen": 10886464, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.2540764808654785, + "rewards/margins": 1.0660908222198486, + "rewards/rejected": 0.18798565864562988, + "step": 33 + }, + { + "epoch": 0.1302370122097199, + "grad_norm": 0.7784174751062098, + "learning_rate": 1.989176509964781e-05, + "logits/chosen": -0.04997391998767853, + "logits/rejected": -0.03442884609103203, + "logps/chosen": -1058.5472412109375, + "logps/rejected": -1076.449462890625, + "loss": 0.4272, + "num_input_tokens_seen": 11222976, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.3005071878433228, + "rewards/margins": 1.030328392982483, + "rewards/rejected": 0.27017879486083984, + "step": 34 + }, + { + "epoch": 0.13406751256882932, + "grad_norm": 0.6869306523064598, + "learning_rate": 1.9882575677307497e-05, + "logits/chosen": -0.09178491681814194, + "logits/rejected": -0.09663952887058258, + "logps/chosen": -1042.510009765625, + "logps/rejected": -1074.96630859375, + "loss": 0.3327, + "num_input_tokens_seen": 11558912, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 1.082582712173462, + "rewards/margins": 1.4677971601486206, + "rewards/rejected": -0.3852144777774811, + "step": 35 + }, + { + "epoch": 0.1378980129279387, + "grad_norm": 0.726826169519756, + "learning_rate": 1.9873014181578588e-05, + "logits/chosen": -0.006761874072253704, + "logits/rejected": 0.014297496527433395, + "logps/chosen": -989.15283203125, + "logps/rejected": -1001.4314575195312, + "loss": 0.3958, + "num_input_tokens_seen": 11875648, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 1.0257503986358643, + "rewards/margins": 1.191283941268921, + "rewards/rejected": -0.16553358733654022, + "step": 36 + }, + { + "epoch": 0.1417285132870481, + "grad_norm": 0.7329104104421856, + "learning_rate": 1.986308097244599e-05, + "logits/chosen": -0.045461565256118774, + "logits/rejected": -0.06471557170152664, + "logps/chosen": -988.549560546875, + "logps/rejected": -990.2896118164062, + "loss": 0.37, + "num_input_tokens_seen": 12195392, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.0338926315307617, + "rewards/margins": 1.3707859516143799, + "rewards/rejected": -0.3368934392929077, + "step": 37 + }, + { + "epoch": 0.14555901364615753, + "grad_norm": 0.7383910342145567, + "learning_rate": 1.9852776423889414e-05, + "logits/chosen": -0.028068481013178825, + "logits/rejected": -0.046034954488277435, + "logps/chosen": -1064.37060546875, + "logps/rejected": -1065.1097412109375, + "loss": 0.3656, + "num_input_tokens_seen": 12534752, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 1.0157785415649414, + "rewards/margins": 1.47843599319458, + "rewards/rejected": -0.46265745162963867, + "step": 38 + }, + { + "epoch": 0.14938951400526693, + "grad_norm": 0.76722914253465, + "learning_rate": 1.9842100923869292e-05, + "logits/chosen": -0.04737653583288193, + "logits/rejected": -0.04486582428216934, + "logps/chosen": -1018.3984375, + "logps/rejected": -1013.9937744140625, + "loss": 0.3673, + "num_input_tokens_seen": 12857440, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.8627580404281616, + "rewards/margins": 1.3541693687438965, + "rewards/rejected": -0.4914112687110901, + "step": 39 + }, + { + "epoch": 0.15322001436437635, + "grad_norm": 0.7448845028725913, + "learning_rate": 1.9831054874312167e-05, + "logits/chosen": -0.07007941603660583, + "logits/rejected": -0.10030247271060944, + "logps/chosen": -1028.76611328125, + "logps/rejected": -1022.4772338867188, + "loss": 0.3274, + "num_input_tokens_seen": 13178048, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9479148387908936, + "rewards/margins": 1.6322944164276123, + "rewards/rejected": -0.6843795776367188, + "step": 40 + }, + { + "epoch": 0.15705051472348575, + "grad_norm": 0.7040798823364977, + "learning_rate": 1.9819638691095554e-05, + "logits/chosen": -0.06454780697822571, + "logits/rejected": -0.07849342375993729, + "logps/chosen": -1023.6952514648438, + "logps/rejected": -1029.20361328125, + "loss": 0.3362, + "num_input_tokens_seen": 13512032, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8823634386062622, + "rewards/margins": 1.5936744213104248, + "rewards/rejected": -0.7113110423088074, + "step": 41 + }, + { + "epoch": 0.16088101508259517, + "grad_norm": 0.6631077195706544, + "learning_rate": 1.9807852804032306e-05, + "logits/chosen": -0.10623634606599808, + "logits/rejected": -0.12342099845409393, + "logps/chosen": -1061.46728515625, + "logps/rejected": -1062.283203125, + "loss": 0.3002, + "num_input_tokens_seen": 13848640, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 1.0242018699645996, + "rewards/margins": 1.788495421409607, + "rewards/rejected": -0.7642935514450073, + "step": 42 + }, + { + "epoch": 0.16471151544170456, + "grad_norm": 0.6817732885725012, + "learning_rate": 1.9795697656854406e-05, + "logits/chosen": -0.09262734651565552, + "logits/rejected": -0.11081977933645248, + "logps/chosen": -1056.545654296875, + "logps/rejected": -1055.6888427734375, + "loss": 0.2887, + "num_input_tokens_seen": 14180832, + "rewards/accuracies": 0.9140625, + "rewards/chosen": 0.9857735633850098, + "rewards/margins": 1.7419161796569824, + "rewards/rejected": -0.7561426162719727, + "step": 43 + }, + { + "epoch": 0.168542015800814, + "grad_norm": 0.643526147331565, + "learning_rate": 1.9783173707196278e-05, + "logits/chosen": -0.1645267754793167, + "logits/rejected": -0.14613790810108185, + "logps/chosen": -1037.59326171875, + "logps/rejected": -1071.25634765625, + "loss": 0.2722, + "num_input_tokens_seen": 14502976, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 1.018608570098877, + "rewards/margins": 2.0563204288482666, + "rewards/rejected": -1.0377120971679688, + "step": 44 + }, + { + "epoch": 0.17237251615992338, + "grad_norm": 0.6379199879794387, + "learning_rate": 1.9770281426577543e-05, + "logits/chosen": -0.10344626754522324, + "logits/rejected": -0.09739074856042862, + "logps/chosen": -1029.3065185546875, + "logps/rejected": -1017.9700317382812, + "loss": 0.2772, + "num_input_tokens_seen": 14827616, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.3612115383148193, + "rewards/margins": 1.9505512714385986, + "rewards/rejected": -0.5893397331237793, + "step": 45 + }, + { + "epoch": 0.1762030165190328, + "grad_norm": 0.6780809227332171, + "learning_rate": 1.9757021300385288e-05, + "logits/chosen": -0.07911691069602966, + "logits/rejected": -0.09820247441530228, + "logps/chosen": -1036.677978515625, + "logps/rejected": -1047.679931640625, + "loss": 0.2868, + "num_input_tokens_seen": 15165888, + "rewards/accuracies": 0.8984375, + "rewards/chosen": 1.307704210281372, + "rewards/margins": 2.0017316341400146, + "rewards/rejected": -0.6940274834632874, + "step": 46 + }, + { + "epoch": 0.1800335168781422, + "grad_norm": 0.6465379525176257, + "learning_rate": 1.9743393827855758e-05, + "logits/chosen": -0.0645233690738678, + "logits/rejected": -0.09609126299619675, + "logps/chosen": -1034.8553466796875, + "logps/rejected": -1059.4375, + "loss": 0.2618, + "num_input_tokens_seen": 15501984, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.2903960943222046, + "rewards/margins": 2.132112741470337, + "rewards/rejected": -0.8417166471481323, + "step": 47 + }, + { + "epoch": 0.18386401723725163, + "grad_norm": 0.6609494877732606, + "learning_rate": 1.9729399522055603e-05, + "logits/chosen": -0.03229144960641861, + "logits/rejected": -0.06522522866725922, + "logps/chosen": -1029.3084716796875, + "logps/rejected": -1040.4041748046875, + "loss": 0.2642, + "num_input_tokens_seen": 15822752, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.328877329826355, + "rewards/margins": 1.9950249195098877, + "rewards/rejected": -0.6661476492881775, + "step": 48 + }, + { + "epoch": 0.18769451759636102, + "grad_norm": 0.6904700773334985, + "learning_rate": 1.9715038909862518e-05, + "logits/chosen": -0.12151038646697998, + "logits/rejected": -0.15654897689819336, + "logps/chosen": -1035.0198974609375, + "logps/rejected": -1042.153076171875, + "loss": 0.2754, + "num_input_tokens_seen": 16148160, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 1.183942437171936, + "rewards/margins": 2.05972957611084, + "rewards/rejected": -0.875787079334259, + "step": 49 + }, + { + "epoch": 0.19152501795547044, + "grad_norm": 0.5836682451344709, + "learning_rate": 1.9700312531945444e-05, + "logits/chosen": -0.013994764536619186, + "logits/rejected": -0.03232590854167938, + "logps/chosen": -1036.3302001953125, + "logps/rejected": -1036.571533203125, + "loss": 0.2395, + "num_input_tokens_seen": 16479744, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.200459599494934, + "rewards/margins": 1.9757564067840576, + "rewards/rejected": -0.7752969264984131, + "step": 50 + }, + { + "epoch": 0.19535551831457984, + "grad_norm": 0.5711655448255101, + "learning_rate": 1.9685220942744174e-05, + "logits/chosen": -0.07415458559989929, + "logits/rejected": -0.08991286903619766, + "logps/chosen": -1038.37060546875, + "logps/rejected": -1048.2490234375, + "loss": 0.2197, + "num_input_tokens_seen": 16811456, + "rewards/accuracies": 0.9140625, + "rewards/chosen": 1.4541586637496948, + "rewards/margins": 2.542146682739258, + "rewards/rejected": -1.0879878997802734, + "step": 51 + }, + { + "epoch": 0.19918601867368926, + "grad_norm": 0.7318682254264652, + "learning_rate": 1.9669764710448523e-05, + "logits/chosen": -0.12173047661781311, + "logits/rejected": -0.13167661428451538, + "logps/chosen": -1026.5341796875, + "logps/rejected": -1038.888427734375, + "loss": 0.2882, + "num_input_tokens_seen": 17140448, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 1.0422205924987793, + "rewards/margins": 2.047743558883667, + "rewards/rejected": -1.0055230855941772, + "step": 52 + }, + { + "epoch": 0.20301651903279866, + "grad_norm": 0.6275554264552532, + "learning_rate": 1.9653944416976897e-05, + "logits/chosen": -0.14821757376194, + "logits/rejected": -0.171110600233078, + "logps/chosen": -1028.662841796875, + "logps/rejected": -1055.0906982421875, + "loss": 0.2202, + "num_input_tokens_seen": 17465248, + "rewards/accuracies": 0.9453125, + "rewards/chosen": 1.1303459405899048, + "rewards/margins": 2.3993606567382812, + "rewards/rejected": -1.269014835357666, + "step": 53 + }, + { + "epoch": 0.20684701939190805, + "grad_norm": 0.5961275037118179, + "learning_rate": 1.96377606579544e-05, + "logits/chosen": -0.03661435842514038, + "logits/rejected": -0.07696016877889633, + "logps/chosen": -1040.2498779296875, + "logps/rejected": -1033.3004150390625, + "loss": 0.2347, + "num_input_tokens_seen": 17796480, + "rewards/accuracies": 0.921875, + "rewards/chosen": 1.1795969009399414, + "rewards/margins": 2.383175849914551, + "rewards/rejected": -1.2035791873931885, + "step": 54 + }, + { + "epoch": 0.21067751975101748, + "grad_norm": 0.7448901432988814, + "learning_rate": 1.9621214042690416e-05, + "logits/chosen": -0.18664640188217163, + "logits/rejected": -0.18897084891796112, + "logps/chosen": -1063.03759765625, + "logps/rejected": -1079.42822265625, + "loss": 0.2424, + "num_input_tokens_seen": 18136896, + "rewards/accuracies": 0.8984375, + "rewards/chosen": 1.1454110145568848, + "rewards/margins": 2.5277884006500244, + "rewards/rejected": -1.3823773860931396, + "step": 55 + }, + { + "epoch": 0.21450802011012687, + "grad_norm": 0.5885594470162759, + "learning_rate": 1.960430519415566e-05, + "logits/chosen": -0.07422441244125366, + "logits/rejected": -0.09609758108854294, + "logps/chosen": -1042.267822265625, + "logps/rejected": -1027.90576171875, + "loss": 0.1977, + "num_input_tokens_seen": 18462848, + "rewards/accuracies": 0.9296875, + "rewards/chosen": 1.111598014831543, + "rewards/margins": 2.655104637145996, + "rewards/rejected": -1.5435068607330322, + "step": 56 + }, + { + "epoch": 0.2183385204692363, + "grad_norm": 0.5766324037602554, + "learning_rate": 1.9587034748958716e-05, + "logits/chosen": -0.1421271115541458, + "logits/rejected": -0.15093661844730377, + "logps/chosen": -1047.219970703125, + "logps/rejected": -1057.823486328125, + "loss": 0.1834, + "num_input_tokens_seen": 18798400, + "rewards/accuracies": 0.9296875, + "rewards/chosen": 1.115065574645996, + "rewards/margins": 2.7879080772399902, + "rewards/rejected": -1.6728425025939941, + "step": 57 + }, + { + "epoch": 0.2221690208283457, + "grad_norm": 0.5495977083966185, + "learning_rate": 1.956940335732209e-05, + "logits/chosen": -0.11535763740539551, + "logits/rejected": -0.0958159789443016, + "logps/chosen": -1014.5343017578125, + "logps/rejected": -1035.2476806640625, + "loss": 0.195, + "num_input_tokens_seen": 19112352, + "rewards/accuracies": 0.921875, + "rewards/chosen": 1.1337867975234985, + "rewards/margins": 2.727102518081665, + "rewards/rejected": -1.593315601348877, + "step": 58 + }, + { + "epoch": 0.22599952118745512, + "grad_norm": 0.5352706515726708, + "learning_rate": 1.955141168305771e-05, + "logits/chosen": -0.15455064177513123, + "logits/rejected": -0.15888439118862152, + "logps/chosen": -1029.631103515625, + "logps/rejected": -1020.0987548828125, + "loss": 0.1753, + "num_input_tokens_seen": 19446560, + "rewards/accuracies": 0.9609375, + "rewards/chosen": 1.3473488092422485, + "rewards/margins": 2.8243696689605713, + "rewards/rejected": -1.4770206212997437, + "step": 59 + }, + { + "epoch": 0.2298300215465645, + "grad_norm": 0.658911130186954, + "learning_rate": 1.9533060403541937e-05, + "logits/chosen": -0.1459999680519104, + "logits/rejected": -0.14830471575260162, + "logps/chosen": -1053.2979736328125, + "logps/rejected": -1092.8507080078125, + "loss": 0.2073, + "num_input_tokens_seen": 19776960, + "rewards/accuracies": 0.921875, + "rewards/chosen": 1.134781837463379, + "rewards/margins": 2.814605236053467, + "rewards/rejected": -1.6798232793807983, + "step": 60 + }, + { + "epoch": 0.23366052190567393, + "grad_norm": 0.6199967402545495, + "learning_rate": 1.9514350209690085e-05, + "logits/chosen": -0.15357057750225067, + "logits/rejected": -0.18749693036079407, + "logps/chosen": -991.7289428710938, + "logps/rejected": -1004.6387939453125, + "loss": 0.2021, + "num_input_tokens_seen": 20088064, + "rewards/accuracies": 0.953125, + "rewards/chosen": 0.784792423248291, + "rewards/margins": 2.672673225402832, + "rewards/rejected": -1.8878809213638306, + "step": 61 + }, + { + "epoch": 0.23749102226478333, + "grad_norm": 0.5686629475048504, + "learning_rate": 1.949528180593037e-05, + "logits/chosen": -0.10532631725072861, + "logits/rejected": -0.13495033979415894, + "logps/chosen": -1051.88818359375, + "logps/rejected": -1053.18994140625, + "loss": 0.1944, + "num_input_tokens_seen": 20411904, + "rewards/accuracies": 0.9609375, + "rewards/chosen": 1.0606743097305298, + "rewards/margins": 2.510143280029297, + "rewards/rejected": -1.4494688510894775, + "step": 62 + }, + { + "epoch": 0.24132152262389275, + "grad_norm": 0.6518749808308908, + "learning_rate": 1.947585591017741e-05, + "logits/chosen": -0.13857662677764893, + "logits/rejected": -0.16213856637477875, + "logps/chosen": -1025.368896484375, + "logps/rejected": -1033.074951171875, + "loss": 0.2268, + "num_input_tokens_seen": 20738656, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6809751987457275, + "rewards/margins": 2.7084925174713135, + "rewards/rejected": -2.027517318725586, + "step": 63 + }, + { + "epoch": 0.24515202298300215, + "grad_norm": 0.7053575397085449, + "learning_rate": 1.9456073253805214e-05, + "logits/chosen": -0.1368640959262848, + "logits/rejected": -0.15874262154102325, + "logps/chosen": -1024.86865234375, + "logps/rejected": -1057.951416015625, + "loss": 0.2464, + "num_input_tokens_seen": 21064192, + "rewards/accuracies": 0.890625, + "rewards/chosen": 0.8212336897850037, + "rewards/margins": 2.9199941158294678, + "rewards/rejected": -2.0987603664398193, + "step": 64 + }, + { + "epoch": 0.24898252334211157, + "grad_norm": 0.555302461574011, + "learning_rate": 1.9435934581619606e-05, + "logits/chosen": -0.13385649025440216, + "logits/rejected": -0.1642713099718094, + "logps/chosen": -1053.2781982421875, + "logps/rejected": -1082.4051513671875, + "loss": 0.1719, + "num_input_tokens_seen": 21393024, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8186042904853821, + "rewards/margins": 3.0335397720336914, + "rewards/rejected": -2.214935302734375, + "step": 65 + }, + { + "epoch": 0.252813023701221, + "grad_norm": 0.6146722427848964, + "learning_rate": 1.941544065183021e-05, + "logits/chosen": -0.1487371027469635, + "logits/rejected": -0.175001323223114, + "logps/chosen": -1045.6282958984375, + "logps/rejected": -1064.157470703125, + "loss": 0.1916, + "num_input_tokens_seen": 21726112, + "rewards/accuracies": 0.9140625, + "rewards/chosen": 0.7089834809303284, + "rewards/margins": 2.941462993621826, + "rewards/rejected": -2.2324795722961426, + "step": 66 + }, + { + "epoch": 0.25664352406033036, + "grad_norm": 0.7013307088427623, + "learning_rate": 1.93945922360219e-05, + "logits/chosen": -0.1703948676586151, + "logits/rejected": -0.17046520113945007, + "logps/chosen": -1067.7177734375, + "logps/rejected": -1093.931640625, + "loss": 0.219, + "num_input_tokens_seen": 22054816, + "rewards/accuracies": 0.9140625, + "rewards/chosen": 0.4798608422279358, + "rewards/margins": 2.8461766242980957, + "rewards/rejected": -2.3663158416748047, + "step": 67 + }, + { + "epoch": 0.2604740244194398, + "grad_norm": 0.5824384270480167, + "learning_rate": 1.937339011912575e-05, + "logits/chosen": -0.1234840601682663, + "logits/rejected": -0.14853805303573608, + "logps/chosen": -1048.8248291015625, + "logps/rejected": -1072.3228759765625, + "loss": 0.1784, + "num_input_tokens_seen": 22385920, + "rewards/accuracies": 0.9453125, + "rewards/chosen": 0.5817351937294006, + "rewards/margins": 2.9389114379882812, + "rewards/rejected": -2.3571763038635254, + "step": 68 + }, + { + "epoch": 0.2643045247785492, + "grad_norm": 0.5773717658236601, + "learning_rate": 1.9351835099389476e-05, + "logits/chosen": -0.1679905652999878, + "logits/rejected": -0.1781005859375, + "logps/chosen": -1041.194091796875, + "logps/rejected": -1063.80908203125, + "loss": 0.1549, + "num_input_tokens_seen": 22719104, + "rewards/accuracies": 0.9609375, + "rewards/chosen": 0.5108047723770142, + "rewards/margins": 3.0011167526245117, + "rewards/rejected": -2.490311861038208, + "step": 69 + }, + { + "epoch": 0.26813502513765863, + "grad_norm": 0.5292610087207676, + "learning_rate": 1.932992798834739e-05, + "logits/chosen": -0.18429197371006012, + "logits/rejected": -0.2002743035554886, + "logps/chosen": -1062.5814208984375, + "logps/rejected": -1101.291259765625, + "loss": 0.142, + "num_input_tokens_seen": 23048352, + "rewards/accuracies": 0.9609375, + "rewards/chosen": 0.2421068549156189, + "rewards/margins": 3.224984645843506, + "rewards/rejected": -2.982877731323242, + "step": 70 + }, + { + "epoch": 0.271965525496768, + "grad_norm": 0.5558880910896092, + "learning_rate": 1.9307669610789837e-05, + "logits/chosen": -0.19764868915081024, + "logits/rejected": -0.21474558115005493, + "logps/chosen": -1036.3931884765625, + "logps/rejected": -1051.1083984375, + "loss": 0.1519, + "num_input_tokens_seen": 23370880, + "rewards/accuracies": 0.953125, + "rewards/chosen": 0.2054198682308197, + "rewards/margins": 3.248018503189087, + "rewards/rejected": -3.0425987243652344, + "step": 71 + }, + { + "epoch": 0.2757960258558774, + "grad_norm": 0.6082855008522914, + "learning_rate": 1.928506080473216e-05, + "logits/chosen": -0.1662517786026001, + "logits/rejected": -0.1979164332151413, + "logps/chosen": -1065.109130859375, + "logps/rejected": -1103.873291015625, + "loss": 0.176, + "num_input_tokens_seen": 23703872, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.09440723806619644, + "rewards/margins": 3.1955373287200928, + "rewards/rejected": -3.2899444103240967, + "step": 72 + }, + { + "epoch": 0.27962652621498685, + "grad_norm": 0.6478702548788221, + "learning_rate": 1.9262102421383115e-05, + "logits/chosen": -0.09081355482339859, + "logits/rejected": -0.09914170950651169, + "logps/chosen": -1017.9966430664062, + "logps/rejected": -1044.4127197265625, + "loss": 0.1715, + "num_input_tokens_seen": 24023104, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.18625618517398834, + "rewards/margins": 3.0096402168273926, + "rewards/rejected": -3.1958963871002197, + "step": 73 + }, + { + "epoch": 0.2834570265740962, + "grad_norm": 0.6159684985112368, + "learning_rate": 1.9238795325112867e-05, + "logits/chosen": -0.13127541542053223, + "logits/rejected": -0.14948369562625885, + "logps/chosen": -1044.07373046875, + "logps/rejected": -1031.362060546875, + "loss": 0.1737, + "num_input_tokens_seen": 24348992, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.15133219957351685, + "rewards/margins": 3.065927028656006, + "rewards/rejected": -3.217259407043457, + "step": 74 + }, + { + "epoch": 0.28728752693320564, + "grad_norm": 0.5395018961917724, + "learning_rate": 1.9215140393420422e-05, + "logits/chosen": -0.12972044944763184, + "logits/rejected": -0.10936278104782104, + "logps/chosen": -1051.79541015625, + "logps/rejected": -1077.9757080078125, + "loss": 0.1499, + "num_input_tokens_seen": 24682176, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.032676391303539276, + "rewards/margins": 3.04026198387146, + "rewards/rejected": -3.0729379653930664, + "step": 75 + }, + { + "epoch": 0.29111802729231506, + "grad_norm": 0.7175214977408462, + "learning_rate": 1.919113851690058e-05, + "logits/chosen": -0.09176371991634369, + "logits/rejected": -0.11456742882728577, + "logps/chosen": -1051.74462890625, + "logps/rejected": -1091.8253173828125, + "loss": 0.173, + "num_input_tokens_seen": 25017216, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08995723724365234, + "rewards/margins": 3.341566562652588, + "rewards/rejected": -3.4315240383148193, + "step": 76 + }, + { + "epoch": 0.2949485276514245, + "grad_norm": 0.5826202494592349, + "learning_rate": 1.9166790599210426e-05, + "logits/chosen": -0.19666969776153564, + "logits/rejected": -0.22636541724205017, + "logps/chosen": -1070.956298828125, + "logps/rejected": -1065.6644287109375, + "loss": 0.1708, + "num_input_tokens_seen": 25350144, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.40832141041755676, + "rewards/margins": 3.1240291595458984, + "rewards/rejected": -3.532350540161133, + "step": 77 + }, + { + "epoch": 0.29877902801053385, + "grad_norm": 0.5601118908642735, + "learning_rate": 1.914209755703531e-05, + "logits/chosen": -0.10145962238311768, + "logits/rejected": -0.12346384674310684, + "logps/chosen": -1068.3275146484375, + "logps/rejected": -1120.119873046875, + "loss": 0.1405, + "num_input_tokens_seen": 25680960, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.4318390190601349, + "rewards/margins": 3.3549301624298096, + "rewards/rejected": -3.786768913269043, + "step": 78 + }, + { + "epoch": 0.3026095283696433, + "grad_norm": 0.7174301104880886, + "learning_rate": 1.91170603200543e-05, + "logits/chosen": -0.10651267319917679, + "logits/rejected": -0.1076231598854065, + "logps/chosen": -1092.2734375, + "logps/rejected": -1116.5054931640625, + "loss": 0.1933, + "num_input_tokens_seen": 26018816, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.7465907335281372, + "rewards/margins": 3.316758632659912, + "rewards/rejected": -4.06334924697876, + "step": 79 + }, + { + "epoch": 0.3064400287287527, + "grad_norm": 0.6522039554329132, + "learning_rate": 1.9091679830905225e-05, + "logits/chosen": -0.18133768439292908, + "logits/rejected": -0.1916915625333786, + "logps/chosen": -1084.028076171875, + "logps/rejected": -1113.2078857421875, + "loss": 0.1648, + "num_input_tokens_seen": 26362592, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6805394887924194, + "rewards/margins": 3.322674512863159, + "rewards/rejected": -4.003213882446289, + "step": 80 + }, + { + "epoch": 0.3102705290878621, + "grad_norm": 0.5791395986077041, + "learning_rate": 1.9065957045149156e-05, + "logits/chosen": -0.08691702783107758, + "logits/rejected": -0.09795527160167694, + "logps/chosen": -1075.6014404296875, + "logps/rejected": -1094.822265625, + "loss": 0.1574, + "num_input_tokens_seen": 26694880, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.6514371633529663, + "rewards/margins": 3.610201835632324, + "rewards/rejected": -4.26163911819458, + "step": 81 + }, + { + "epoch": 0.3141010294469715, + "grad_norm": 0.5835848375966363, + "learning_rate": 1.9039892931234434e-05, + "logits/chosen": -0.1662209928035736, + "logits/rejected": -0.18504418432712555, + "logps/chosen": -1053.040771484375, + "logps/rejected": -1058.100341796875, + "loss": 0.1452, + "num_input_tokens_seen": 27020832, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8232435584068298, + "rewards/margins": 3.3671939373016357, + "rewards/rejected": -4.190437316894531, + "step": 82 + }, + { + "epoch": 0.3179315298060809, + "grad_norm": 0.6953542425753936, + "learning_rate": 1.9013488470460223e-05, + "logits/chosen": -0.19153544306755066, + "logits/rejected": -0.1963300108909607, + "logps/chosen": -1067.6649169921875, + "logps/rejected": -1099.5660400390625, + "loss": 0.1498, + "num_input_tokens_seen": 27338752, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9370135068893433, + "rewards/margins": 3.602818489074707, + "rewards/rejected": -4.53983211517334, + "step": 83 + }, + { + "epoch": 0.32176203016519034, + "grad_norm": 0.660622602441037, + "learning_rate": 1.898674465693954e-05, + "logits/chosen": -0.16682273149490356, + "logits/rejected": -0.17320144176483154, + "logps/chosen": -1098.4761962890625, + "logps/rejected": -1116.791748046875, + "loss": 0.1532, + "num_input_tokens_seen": 27681088, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -1.164334774017334, + "rewards/margins": 3.388920307159424, + "rewards/rejected": -4.553255081176758, + "step": 84 + }, + { + "epoch": 0.32559253052429976, + "grad_norm": 0.591288537248825, + "learning_rate": 1.895966249756185e-05, + "logits/chosen": -0.14719031751155853, + "logits/rejected": -0.17177651822566986, + "logps/chosen": -1053.7938232421875, + "logps/rejected": -1096.4962158203125, + "loss": 0.134, + "num_input_tokens_seen": 28011136, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3432151079177856, + "rewards/margins": 3.7118208408355713, + "rewards/rejected": -5.0550360679626465, + "step": 85 + }, + { + "epoch": 0.32942303088340913, + "grad_norm": 0.6803074372051849, + "learning_rate": 1.8932243011955154e-05, + "logits/chosen": -0.2051846981048584, + "logits/rejected": -0.21695606410503387, + "logps/chosen": -1052.840087890625, + "logps/rejected": -1079.072998046875, + "loss": 0.1787, + "num_input_tokens_seen": 28343296, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.5618228912353516, + "rewards/margins": 3.518864154815674, + "rewards/rejected": -5.080687046051025, + "step": 86 + }, + { + "epoch": 0.33325353124251855, + "grad_norm": 0.5933171593594313, + "learning_rate": 1.8904487232447582e-05, + "logits/chosen": -0.16801735758781433, + "logits/rejected": -0.1852046549320221, + "logps/chosen": -1039.32275390625, + "logps/rejected": -1049.6553955078125, + "loss": 0.1563, + "num_input_tokens_seen": 28667424, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.9179442524909973, + "rewards/margins": 3.417177438735962, + "rewards/rejected": -4.3351216316223145, + "step": 87 + }, + { + "epoch": 0.337084031601628, + "grad_norm": 0.6055431279538412, + "learning_rate": 1.8876396204028543e-05, + "logits/chosen": -0.22229944169521332, + "logits/rejected": -0.24006246030330658, + "logps/chosen": -1069.5985107421875, + "logps/rejected": -1085.583740234375, + "loss": 0.1347, + "num_input_tokens_seen": 28999776, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3764853477478027, + "rewards/margins": 3.6491096019744873, + "rewards/rejected": -5.025594711303711, + "step": 88 + }, + { + "epoch": 0.34091453196073734, + "grad_norm": 0.5990150613263929, + "learning_rate": 1.884797098430938e-05, + "logits/chosen": -0.1843714416027069, + "logits/rejected": -0.17826035618782043, + "logps/chosen": -1046.4185791015625, + "logps/rejected": -1067.7587890625, + "loss": 0.1542, + "num_input_tokens_seen": 29331456, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.2393687963485718, + "rewards/margins": 3.2584688663482666, + "rewards/rejected": -4.497837543487549, + "step": 89 + }, + { + "epoch": 0.34474503231984677, + "grad_norm": 0.3831370747895364, + "learning_rate": 1.881921264348355e-05, + "logits/chosen": -0.25252625346183777, + "logits/rejected": -0.27720457315444946, + "logps/chosen": -1059.4232177734375, + "logps/rejected": -1091.85009765625, + "loss": 0.0792, + "num_input_tokens_seen": 29664768, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -0.974642813205719, + "rewards/margins": 4.044173240661621, + "rewards/rejected": -5.0188164710998535, + "step": 90 + }, + { + "epoch": 0.3485755326789562, + "grad_norm": 0.5309082471492976, + "learning_rate": 1.8790122264286336e-05, + "logits/chosen": -0.19553938508033752, + "logits/rejected": -0.20926207304000854, + "logps/chosen": -1108.445068359375, + "logps/rejected": -1135.32275390625, + "loss": 0.1184, + "num_input_tokens_seen": 30011712, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.8668914437294006, + "rewards/margins": 3.816915512084961, + "rewards/rejected": -4.683806896209717, + "step": 91 + }, + { + "epoch": 0.3524060330380656, + "grad_norm": 0.6862058008983414, + "learning_rate": 1.8760700941954066e-05, + "logits/chosen": -0.16240651905536652, + "logits/rejected": -0.18200114369392395, + "logps/chosen": -1047.280517578125, + "logps/rejected": -1065.8599853515625, + "loss": 0.1729, + "num_input_tokens_seen": 30343968, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -0.7949151992797852, + "rewards/margins": 3.6079280376434326, + "rewards/rejected": -4.402842998504639, + "step": 92 + }, + { + "epoch": 0.356236533397175, + "grad_norm": 0.6414040935374731, + "learning_rate": 1.87309497841829e-05, + "logits/chosen": -0.16592206060886383, + "logits/rejected": -0.21626293659210205, + "logps/chosen": -1021.5260009765625, + "logps/rejected": -1043.9886474609375, + "loss": 0.1441, + "num_input_tokens_seen": 30670528, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.5563231706619263, + "rewards/margins": 3.582958936691284, + "rewards/rejected": -4.1392822265625, + "step": 93 + }, + { + "epoch": 0.3600670337562844, + "grad_norm": 0.49814948938214665, + "learning_rate": 1.8700869911087115e-05, + "logits/chosen": -0.1781419813632965, + "logits/rejected": -0.20105642080307007, + "logps/chosen": -1071.57958984375, + "logps/rejected": -1115.304443359375, + "loss": 0.1032, + "num_input_tokens_seen": 31003104, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.093349888920784, + "rewards/margins": 4.100081920623779, + "rewards/rejected": -4.193431854248047, + "step": 94 + }, + { + "epoch": 0.36389753411539383, + "grad_norm": 0.5133872622092326, + "learning_rate": 1.8670462455156928e-05, + "logits/chosen": -0.1734057366847992, + "logits/rejected": -0.15577569603919983, + "logps/chosen": -1058.9464111328125, + "logps/rejected": -1078.302490234375, + "loss": 0.1198, + "num_input_tokens_seen": 31336576, + "rewards/accuracies": 0.9609375, + "rewards/chosen": 0.15735287964344025, + "rewards/margins": 3.64870023727417, + "rewards/rejected": -3.491347312927246, + "step": 95 + }, + { + "epoch": 0.36772803447450325, + "grad_norm": 0.45417380002727153, + "learning_rate": 1.863972856121587e-05, + "logits/chosen": -0.2463587075471878, + "logits/rejected": -0.2643094062805176, + "logps/chosen": -1064.293212890625, + "logps/rejected": -1086.7137451171875, + "loss": 0.0899, + "num_input_tokens_seen": 31663232, + "rewards/accuracies": 0.9765625, + "rewards/chosen": 0.2432812750339508, + "rewards/margins": 3.9300501346588135, + "rewards/rejected": -3.6867690086364746, + "step": 96 + }, + { + "epoch": 0.3715585348336126, + "grad_norm": 0.41344607312458825, + "learning_rate": 1.8608669386377672e-05, + "logits/chosen": -0.15563519299030304, + "logits/rejected": -0.16767705976963043, + "logps/chosen": -1027.3201904296875, + "logps/rejected": -1065.580078125, + "loss": 0.0887, + "num_input_tokens_seen": 31992480, + "rewards/accuracies": 0.984375, + "rewards/chosen": 0.6941100358963013, + "rewards/margins": 3.9777886867523193, + "rewards/rejected": -3.2836787700653076, + "step": 97 + }, + { + "epoch": 0.37538903519272204, + "grad_norm": 0.7874604729574614, + "learning_rate": 1.8577286100002723e-05, + "logits/chosen": -0.1941939890384674, + "logits/rejected": -0.22690552473068237, + "logps/chosen": -1054.074951171875, + "logps/rejected": -1088.1954345703125, + "loss": 0.1658, + "num_input_tokens_seen": 32336256, + "rewards/accuracies": 0.9296875, + "rewards/chosen": 0.8607207536697388, + "rewards/margins": 3.823794364929199, + "rewards/rejected": -2.96307373046875, + "step": 98 + }, + { + "epoch": 0.37921953555183147, + "grad_norm": 0.4796484358828774, + "learning_rate": 1.8545579883654007e-05, + "logits/chosen": -0.12867914140224457, + "logits/rejected": -0.13516613841056824, + "logps/chosen": -1041.158447265625, + "logps/rejected": -1069.065673828125, + "loss": 0.1089, + "num_input_tokens_seen": 32667232, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.842719316482544, + "rewards/margins": 4.017481803894043, + "rewards/rejected": -3.174762725830078, + "step": 99 + }, + { + "epoch": 0.3830500359109409, + "grad_norm": 0.47206553844790056, + "learning_rate": 1.8513551931052654e-05, + "logits/chosen": -0.16613329946994781, + "logits/rejected": -0.1799527108669281, + "logps/chosen": -1039.8082275390625, + "logps/rejected": -1067.810546875, + "loss": 0.0963, + "num_input_tokens_seen": 32998496, + "rewards/accuracies": 0.9765625, + "rewards/chosen": 1.0254769325256348, + "rewards/margins": 3.787473440170288, + "rewards/rejected": -2.7619965076446533, + "step": 100 + }, + { + "epoch": 0.38688053627005026, + "grad_norm": 0.6132457712849773, + "learning_rate": 1.8481203448032975e-05, + "logits/chosen": -0.21160748600959778, + "logits/rejected": -0.23554955422878265, + "logps/chosen": -1043.0679931640625, + "logps/rejected": -1038.188720703125, + "loss": 0.1315, + "num_input_tokens_seen": 33325216, + "rewards/accuracies": 0.9609375, + "rewards/chosen": 0.9241933226585388, + "rewards/margins": 3.721569299697876, + "rewards/rejected": -2.7973761558532715, + "step": 101 + }, + { + "epoch": 0.3907110366291597, + "grad_norm": 0.5950178634980474, + "learning_rate": 1.8448535652497073e-05, + "logits/chosen": -0.1953836977481842, + "logits/rejected": -0.2117384970188141, + "logps/chosen": -1007.9212036132812, + "logps/rejected": -1033.7440185546875, + "loss": 0.1281, + "num_input_tokens_seen": 33648256, + "rewards/accuracies": 0.9765625, + "rewards/chosen": 0.6901592016220093, + "rewards/margins": 3.686323881149292, + "rewards/rejected": -2.996164560317993, + "step": 102 + }, + { + "epoch": 0.3945415369882691, + "grad_norm": 0.5669090669402612, + "learning_rate": 1.8415549774368987e-05, + "logits/chosen": -0.205836683511734, + "logits/rejected": -0.20203089714050293, + "logps/chosen": -1034.2452392578125, + "logps/rejected": -1048.6591796875, + "loss": 0.1247, + "num_input_tokens_seen": 33972000, + "rewards/accuracies": 0.953125, + "rewards/chosen": 1.008556604385376, + "rewards/margins": 3.892176628112793, + "rewards/rejected": -2.883620023727417, + "step": 103 + }, + { + "epoch": 0.3983720373473785, + "grad_norm": 0.7591909209793485, + "learning_rate": 1.838224705554838e-05, + "logits/chosen": -0.1789209544658661, + "logits/rejected": -0.20510420203208923, + "logps/chosen": -1059.2811279296875, + "logps/rejected": -1060.1768798828125, + "loss": 0.1733, + "num_input_tokens_seen": 34296864, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.8286986947059631, + "rewards/margins": 3.5364646911621094, + "rewards/rejected": -2.707766056060791, + "step": 104 + }, + { + "epoch": 0.4022025377064879, + "grad_norm": 0.43681558258007436, + "learning_rate": 1.83486287498638e-05, + "logits/chosen": -0.20612114667892456, + "logits/rejected": -0.2278648465871811, + "logps/chosen": -1066.069580078125, + "logps/rejected": -1078.008056640625, + "loss": 0.0805, + "num_input_tokens_seen": 34627136, + "rewards/accuracies": 0.9765625, + "rewards/chosen": 1.2604122161865234, + "rewards/margins": 4.303579330444336, + "rewards/rejected": -3.0431671142578125, + "step": 105 + }, + { + "epoch": 0.4060330380655973, + "grad_norm": 0.5322804609529611, + "learning_rate": 1.8314696123025456e-05, + "logits/chosen": -0.274746298789978, + "logits/rejected": -0.2759355306625366, + "logps/chosen": -1058.934814453125, + "logps/rejected": -1069.558837890625, + "loss": 0.1143, + "num_input_tokens_seen": 34964704, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.0433799028396606, + "rewards/margins": 3.9979944229125977, + "rewards/rejected": -2.9546146392822266, + "step": 106 + }, + { + "epoch": 0.40986353842470674, + "grad_norm": 0.5816665307511901, + "learning_rate": 1.828045045257756e-05, + "logits/chosen": -0.15692484378814697, + "logits/rejected": -0.17666402459144592, + "logps/chosen": -1056.9013671875, + "logps/rejected": -1090.870849609375, + "loss": 0.1297, + "num_input_tokens_seen": 35302112, + "rewards/accuracies": 0.9453125, + "rewards/chosen": 1.1503454446792603, + "rewards/margins": 4.228044033050537, + "rewards/rejected": -3.0776991844177246, + "step": 107 + }, + { + "epoch": 0.4136940387838161, + "grad_norm": 0.4151830792760495, + "learning_rate": 1.8245893027850255e-05, + "logits/chosen": -0.24039533734321594, + "logits/rejected": -0.25090712308883667, + "logps/chosen": -1017.1063232421875, + "logps/rejected": -1034.947265625, + "loss": 0.0726, + "num_input_tokens_seen": 35628288, + "rewards/accuracies": 0.984375, + "rewards/chosen": 0.8024920225143433, + "rewards/margins": 4.36201286315918, + "rewards/rejected": -3.559520721435547, + "step": 108 + }, + { + "epoch": 0.41752453914292553, + "grad_norm": 0.4231583376073568, + "learning_rate": 1.821102514991105e-05, + "logits/chosen": -0.28017720580101013, + "logits/rejected": -0.2859395444393158, + "logps/chosen": -1025.148193359375, + "logps/rejected": -1055.1865234375, + "loss": 0.0803, + "num_input_tokens_seen": 35950016, + "rewards/accuracies": 0.984375, + "rewards/chosen": 0.43601202964782715, + "rewards/margins": 3.9519665241241455, + "rewards/rejected": -3.5159547328948975, + "step": 109 + }, + { + "epoch": 0.42135503950203496, + "grad_norm": 0.7010831165342412, + "learning_rate": 1.817584813151584e-05, + "logits/chosen": -0.24620476365089417, + "logits/rejected": -0.24660280346870422, + "logps/chosen": -1039.8870849609375, + "logps/rejected": -1081.481201171875, + "loss": 0.156, + "num_input_tokens_seen": 36275488, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.25645607709884644, + "rewards/margins": 3.689507484436035, + "rewards/rejected": -3.9459638595581055, + "step": 110 + }, + { + "epoch": 0.4251855398611444, + "grad_norm": 0.6825948940480404, + "learning_rate": 1.8140363297059488e-05, + "logits/chosen": -0.2373446375131607, + "logits/rejected": -0.26177430152893066, + "logps/chosen": -1076.395751953125, + "logps/rejected": -1109.4825439453125, + "loss": 0.1377, + "num_input_tokens_seen": 36606944, + "rewards/accuracies": 0.9609375, + "rewards/chosen": 0.024053648114204407, + "rewards/margins": 4.013818740844727, + "rewards/rejected": -3.989765167236328, + "step": 111 + }, + { + "epoch": 0.42901604022025375, + "grad_norm": 0.6307830763795625, + "learning_rate": 1.810457198252595e-05, + "logits/chosen": -0.2484377771615982, + "logits/rejected": -0.25214269757270813, + "logps/chosen": -1069.794921875, + "logps/rejected": -1095.82568359375, + "loss": 0.1222, + "num_input_tokens_seen": 36940928, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11815807968378067, + "rewards/margins": 4.096502304077148, + "rewards/rejected": -4.21466064453125, + "step": 112 + }, + { + "epoch": 0.43284654057936317, + "grad_norm": 0.6193421696534998, + "learning_rate": 1.8068475535437996e-05, + "logits/chosen": -0.19404606521129608, + "logits/rejected": -0.2124338001012802, + "logps/chosen": -1030.669189453125, + "logps/rejected": -1064.060546875, + "loss": 0.1344, + "num_input_tokens_seen": 37261536, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.3699100911617279, + "rewards/margins": 3.9253668785095215, + "rewards/rejected": -4.2952775955200195, + "step": 113 + }, + { + "epoch": 0.4366770409384726, + "grad_norm": 0.5013718929930592, + "learning_rate": 1.803207531480645e-05, + "logits/chosen": -0.23311278223991394, + "logits/rejected": -0.2401929795742035, + "logps/chosen": -1043.6195068359375, + "logps/rejected": -1102.009033203125, + "loss": 0.0928, + "num_input_tokens_seen": 37588096, + "rewards/accuracies": 0.984375, + "rewards/chosen": -0.5171188116073608, + "rewards/margins": 4.32392692565918, + "rewards/rejected": -4.84104585647583, + "step": 114 + }, + { + "epoch": 0.440507541297582, + "grad_norm": 0.7221034594279531, + "learning_rate": 1.799537269107905e-05, + "logits/chosen": -0.30688992142677307, + "logits/rejected": -0.3013033866882324, + "logps/chosen": -1067.143310546875, + "logps/rejected": -1073.790283203125, + "loss": 0.1464, + "num_input_tokens_seen": 37916032, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.8833155035972595, + "rewards/margins": 4.002040863037109, + "rewards/rejected": -4.885356903076172, + "step": 115 + }, + { + "epoch": 0.4443380416566914, + "grad_norm": 0.6490221569831277, + "learning_rate": 1.7958369046088837e-05, + "logits/chosen": -0.25279510021209717, + "logits/rejected": -0.2626411020755768, + "logps/chosen": -1009.5618896484375, + "logps/rejected": -1027.370849609375, + "loss": 0.1229, + "num_input_tokens_seen": 38222208, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.7985373735427856, + "rewards/margins": 4.077844619750977, + "rewards/rejected": -4.876381874084473, + "step": 116 + }, + { + "epoch": 0.4481685420158008, + "grad_norm": 0.6773444465680789, + "learning_rate": 1.7921065773002127e-05, + "logits/chosen": -0.3016372323036194, + "logits/rejected": -0.30134475231170654, + "logps/chosen": -1098.3170166015625, + "logps/rejected": -1144.29150390625, + "loss": 0.1136, + "num_input_tokens_seen": 38559712, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.8902484178543091, + "rewards/margins": 4.510333061218262, + "rewards/rejected": -5.400581359863281, + "step": 117 + }, + { + "epoch": 0.45199904237491023, + "grad_norm": 0.5779202546774028, + "learning_rate": 1.7883464276266064e-05, + "logits/chosen": -0.20695927739143372, + "logits/rejected": -0.2022428810596466, + "logps/chosen": -1022.294677734375, + "logps/rejected": -1069.1326904296875, + "loss": 0.1193, + "num_input_tokens_seen": 38883200, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.7527124881744385, + "rewards/margins": 4.3678975105285645, + "rewards/rejected": -5.120610237121582, + "step": 118 + }, + { + "epoch": 0.45582954273401965, + "grad_norm": 0.4586189383473843, + "learning_rate": 1.7845565971555754e-05, + "logits/chosen": -0.27116644382476807, + "logits/rejected": -0.25867244601249695, + "logps/chosen": -1067.7314453125, + "logps/rejected": -1080.5064697265625, + "loss": 0.0865, + "num_input_tokens_seen": 39209280, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.6261466145515442, + "rewards/margins": 4.41098690032959, + "rewards/rejected": -5.037134170532227, + "step": 119 + }, + { + "epoch": 0.459660043093129, + "grad_norm": 0.6816805213532102, + "learning_rate": 1.7807372285720945e-05, + "logits/chosen": -0.28929582238197327, + "logits/rejected": -0.30856993794441223, + "logps/chosen": -1062.742919921875, + "logps/rejected": -1059.875732421875, + "loss": 0.1415, + "num_input_tokens_seen": 39533312, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.8029653429985046, + "rewards/margins": 4.179574012756348, + "rewards/rejected": -4.982539176940918, + "step": 120 + }, + { + "epoch": 0.46349054345223845, + "grad_norm": 0.6811925471450805, + "learning_rate": 1.7768884656732327e-05, + "logits/chosen": -0.2861953377723694, + "logits/rejected": -0.26550349593162537, + "logps/chosen": -1090.0140380859375, + "logps/rejected": -1102.16845703125, + "loss": 0.124, + "num_input_tokens_seen": 39867488, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.8988319635391235, + "rewards/margins": 4.200819969177246, + "rewards/rejected": -5.099652290344238, + "step": 121 + }, + { + "epoch": 0.46732104381134787, + "grad_norm": 0.48322056762086973, + "learning_rate": 1.773010453362737e-05, + "logits/chosen": -0.31013819575309753, + "logits/rejected": -0.356136679649353, + "logps/chosen": -1042.788330078125, + "logps/rejected": -1064.80810546875, + "loss": 0.0743, + "num_input_tokens_seen": 40189824, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.7631875276565552, + "rewards/margins": 4.4137468338012695, + "rewards/rejected": -5.176934242248535, + "step": 122 + }, + { + "epoch": 0.4711515441704573, + "grad_norm": 0.5294860115373862, + "learning_rate": 1.7691033376455798e-05, + "logits/chosen": -0.23792900145053864, + "logits/rejected": -0.2405223399400711, + "logps/chosen": -1079.235107421875, + "logps/rejected": -1097.3013916015625, + "loss": 0.0918, + "num_input_tokens_seen": 40518368, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.7908273935317993, + "rewards/margins": 4.2740159034729, + "rewards/rejected": -5.06484317779541, + "step": 123 + }, + { + "epoch": 0.47498204452956666, + "grad_norm": 0.47568299668736097, + "learning_rate": 1.7651672656224592e-05, + "logits/chosen": -0.30369070172309875, + "logits/rejected": -0.316807359457016, + "logps/chosen": -1096.5023193359375, + "logps/rejected": -1137.207763671875, + "loss": 0.085, + "num_input_tokens_seen": 40867360, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.8699458837509155, + "rewards/margins": 4.258440017700195, + "rewards/rejected": -5.128385543823242, + "step": 124 + }, + { + "epoch": 0.4788125448886761, + "grad_norm": 0.5784472217131377, + "learning_rate": 1.7612023854842618e-05, + "logits/chosen": -0.2649439573287964, + "logits/rejected": -0.2871297001838684, + "logps/chosen": -1103.049072265625, + "logps/rejected": -1126.614013671875, + "loss": 0.114, + "num_input_tokens_seen": 41200768, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.8097789287567139, + "rewards/margins": 4.0321364402771, + "rewards/rejected": -4.841914653778076, + "step": 125 + }, + { + "epoch": 0.4826430452477855, + "grad_norm": 0.5439851329447019, + "learning_rate": 1.7572088465064847e-05, + "logits/chosen": -0.27585747838020325, + "logits/rejected": -0.30980539321899414, + "logps/chosen": -1059.407958984375, + "logps/rejected": -1069.379150390625, + "loss": 0.1044, + "num_input_tokens_seen": 41521728, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.8517192602157593, + "rewards/margins": 4.530754089355469, + "rewards/rejected": -5.382473468780518, + "step": 126 + }, + { + "epoch": 0.4864735456068949, + "grad_norm": 0.2912266005762759, + "learning_rate": 1.7531867990436127e-05, + "logits/chosen": -0.2476787567138672, + "logits/rejected": -0.24558056890964508, + "logps/chosen": -1046.4764404296875, + "logps/rejected": -1078.9632568359375, + "loss": 0.0513, + "num_input_tokens_seen": 41848032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.397990345954895, + "rewards/margins": 4.357748508453369, + "rewards/rejected": -5.755739212036133, + "step": 127 + }, + { + "epoch": 0.4903040459660043, + "grad_norm": 0.5041812178141297, + "learning_rate": 1.7491363945234595e-05, + "logits/chosen": -0.2616129517555237, + "logits/rejected": -0.2628300189971924, + "logps/chosen": -1075.170654296875, + "logps/rejected": -1122.702392578125, + "loss": 0.0888, + "num_input_tokens_seen": 42184928, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.9113963842391968, + "rewards/margins": 4.461873531341553, + "rewards/rejected": -5.373270034790039, + "step": 128 + }, + { + "epoch": 0.4941345463251137, + "grad_norm": 0.5654795055834491, + "learning_rate": 1.7450577854414662e-05, + "logits/chosen": -0.24484415352344513, + "logits/rejected": -0.2536078989505768, + "logps/chosen": -1061.5811767578125, + "logps/rejected": -1103.420166015625, + "loss": 0.0969, + "num_input_tokens_seen": 42516544, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.2993745803833008, + "rewards/margins": 4.303369998931885, + "rewards/rejected": -5.602744102478027, + "step": 129 + }, + { + "epoch": 0.49796504668422314, + "grad_norm": 0.5727614146954769, + "learning_rate": 1.7409511253549592e-05, + "logits/chosen": -0.2602999210357666, + "logits/rejected": -0.26499199867248535, + "logps/chosen": -1046.62841796875, + "logps/rejected": -1074.9959716796875, + "loss": 0.0965, + "num_input_tokens_seen": 42851168, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.4124513864517212, + "rewards/margins": 4.2027974128723145, + "rewards/rejected": -5.615248680114746, + "step": 130 + }, + { + "epoch": 0.5017955470433325, + "grad_norm": 0.5836484198142939, + "learning_rate": 1.73681656887737e-05, + "logits/chosen": -0.27618327736854553, + "logits/rejected": -0.30133211612701416, + "logps/chosen": -1057.632080078125, + "logps/rejected": -1110.866943359375, + "loss": 0.1093, + "num_input_tokens_seen": 43175584, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.1855597496032715, + "rewards/margins": 4.684765338897705, + "rewards/rejected": -5.870325088500977, + "step": 131 + }, + { + "epoch": 0.505626047402442, + "grad_norm": 0.5568829400346685, + "learning_rate": 1.7326542716724127e-05, + "logits/chosen": -0.31125199794769287, + "logits/rejected": -0.29701024293899536, + "logps/chosen": -1087.96826171875, + "logps/rejected": -1126.8802490234375, + "loss": 0.0904, + "num_input_tokens_seen": 43515872, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.2924695014953613, + "rewards/margins": 4.890289783477783, + "rewards/rejected": -6.1827592849731445, + "step": 132 + }, + { + "epoch": 0.5094565477615514, + "grad_norm": 0.5495922551872561, + "learning_rate": 1.7284643904482254e-05, + "logits/chosen": -0.3197418749332428, + "logits/rejected": -0.3448374569416046, + "logps/chosen": -1100.758056640625, + "logps/rejected": -1132.927001953125, + "loss": 0.0952, + "num_input_tokens_seen": 43854976, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.1738486289978027, + "rewards/margins": 4.4937872886657715, + "rewards/rejected": -5.667635440826416, + "step": 133 + }, + { + "epoch": 0.5132870481206607, + "grad_norm": 0.6992637378791453, + "learning_rate": 1.7242470829514674e-05, + "logits/chosen": -0.2809790074825287, + "logits/rejected": -0.27749815583229065, + "logps/chosen": -1085.05419921875, + "logps/rejected": -1102.66455078125, + "loss": 0.1383, + "num_input_tokens_seen": 44195136, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -1.2908967733383179, + "rewards/margins": 4.259258270263672, + "rewards/rejected": -5.550155162811279, + "step": 134 + }, + { + "epoch": 0.5171175484797702, + "grad_norm": 0.6121837965889434, + "learning_rate": 1.720002507961382e-05, + "logits/chosen": -0.341269314289093, + "logits/rejected": -0.3448982834815979, + "logps/chosen": -1063.4564208984375, + "logps/rejected": -1105.0433349609375, + "loss": 0.1074, + "num_input_tokens_seen": 44525856, + "rewards/accuracies": 0.953125, + "rewards/chosen": -1.4353156089782715, + "rewards/margins": 4.755117893218994, + "rewards/rejected": -6.190433502197266, + "step": 135 + }, + { + "epoch": 0.5209480488388796, + "grad_norm": 0.49792481263106764, + "learning_rate": 1.7157308252838187e-05, + "logits/chosen": -0.3150177001953125, + "logits/rejected": -0.34403371810913086, + "logps/chosen": -1058.091552734375, + "logps/rejected": -1080.352783203125, + "loss": 0.0834, + "num_input_tokens_seen": 44842336, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.5159709453582764, + "rewards/margins": 4.622813701629639, + "rewards/rejected": -6.138784408569336, + "step": 136 + }, + { + "epoch": 0.5247785491979889, + "grad_norm": 0.5604175485375209, + "learning_rate": 1.7114321957452166e-05, + "logits/chosen": -0.28440794348716736, + "logits/rejected": -0.2973944842815399, + "logps/chosen": -1070.2861328125, + "logps/rejected": -1134.93505859375, + "loss": 0.0874, + "num_input_tokens_seen": 45183360, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.3956727981567383, + "rewards/margins": 4.318110466003418, + "rewards/rejected": -5.713783264160156, + "step": 137 + }, + { + "epoch": 0.5286090495570984, + "grad_norm": 0.5120431796548408, + "learning_rate": 1.7071067811865477e-05, + "logits/chosen": -0.24847936630249023, + "logits/rejected": -0.2593730688095093, + "logps/chosen": -1058.0743408203125, + "logps/rejected": -1086.9111328125, + "loss": 0.0913, + "num_input_tokens_seen": 45511040, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.5798696279525757, + "rewards/margins": 4.31203556060791, + "rewards/rejected": -5.891904830932617, + "step": 138 + }, + { + "epoch": 0.5324395499162078, + "grad_norm": 0.5048764848641242, + "learning_rate": 1.7027547444572254e-05, + "logits/chosen": -0.31230995059013367, + "logits/rejected": -0.335568904876709, + "logps/chosen": -1067.47998046875, + "logps/rejected": -1106.81591796875, + "loss": 0.0683, + "num_input_tokens_seen": 45842656, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -1.5118072032928467, + "rewards/margins": 4.775772571563721, + "rewards/rejected": -6.2875800132751465, + "step": 139 + }, + { + "epoch": 0.5362700502753173, + "grad_norm": 0.658938815407698, + "learning_rate": 1.6983762494089732e-05, + "logits/chosen": -0.2993840277194977, + "logits/rejected": -0.30466732382774353, + "logps/chosen": -1074.99755859375, + "logps/rejected": -1116.8233642578125, + "loss": 0.1107, + "num_input_tokens_seen": 46178432, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.4941047430038452, + "rewards/margins": 4.600259780883789, + "rewards/rejected": -6.094364643096924, + "step": 140 + }, + { + "epoch": 0.5401005506344266, + "grad_norm": 0.5412095001639124, + "learning_rate": 1.693971460889654e-05, + "logits/chosen": -0.27547773718833923, + "logits/rejected": -0.2832818627357483, + "logps/chosen": -1033.436767578125, + "logps/rejected": -1081.29443359375, + "loss": 0.099, + "num_input_tokens_seen": 46500768, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.7360153198242188, + "rewards/margins": 4.463346481323242, + "rewards/rejected": -6.199361801147461, + "step": 141 + }, + { + "epoch": 0.543931050993536, + "grad_norm": 0.4660796026996289, + "learning_rate": 1.689540544737067e-05, + "logits/chosen": -0.36272650957107544, + "logits/rejected": -0.3602423369884491, + "logps/chosen": -1078.7869873046875, + "logps/rejected": -1107.66796875, + "loss": 0.0724, + "num_input_tokens_seen": 46827200, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.4805693626403809, + "rewards/margins": 4.58699369430542, + "rewards/rejected": -6.067563533782959, + "step": 142 + }, + { + "epoch": 0.5477615513526455, + "grad_norm": 0.4868023251388612, + "learning_rate": 1.6850836677727005e-05, + "logits/chosen": -0.36519676446914673, + "logits/rejected": -0.38448548316955566, + "logps/chosen": -1096.3104248046875, + "logps/rejected": -1114.1181640625, + "loss": 0.0651, + "num_input_tokens_seen": 47156576, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.6444036960601807, + "rewards/margins": 4.827052593231201, + "rewards/rejected": -6.471456050872803, + "step": 143 + }, + { + "epoch": 0.5515920517117548, + "grad_norm": 0.597104418993657, + "learning_rate": 1.6806009977954533e-05, + "logits/chosen": -0.26541781425476074, + "logits/rejected": -0.2711328864097595, + "logps/chosen": -1124.53271484375, + "logps/rejected": -1186.300537109375, + "loss": 0.0992, + "num_input_tokens_seen": 47501920, + "rewards/accuracies": 0.953125, + "rewards/chosen": -1.5454996824264526, + "rewards/margins": 4.688060283660889, + "rewards/rejected": -6.233560085296631, + "step": 144 + }, + { + "epoch": 0.5554225520708642, + "grad_norm": 0.5820627736230906, + "learning_rate": 1.676092703575316e-05, + "logits/chosen": -0.30492663383483887, + "logits/rejected": -0.3028402030467987, + "logps/chosen": -1080.04150390625, + "logps/rejected": -1114.582275390625, + "loss": 0.0882, + "num_input_tokens_seen": 47832384, + "rewards/accuracies": 0.953125, + "rewards/chosen": -1.5779472589492798, + "rewards/margins": 5.022209167480469, + "rewards/rejected": -6.600155830383301, + "step": 145 + }, + { + "epoch": 0.5592530524299737, + "grad_norm": 0.5089187386974958, + "learning_rate": 1.6715589548470187e-05, + "logits/chosen": -0.3214140236377716, + "logits/rejected": -0.31802237033843994, + "logps/chosen": -1073.1607666015625, + "logps/rejected": -1104.7745361328125, + "loss": 0.0836, + "num_input_tokens_seen": 48172064, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.720104455947876, + "rewards/margins": 4.598552703857422, + "rewards/rejected": -6.318656921386719, + "step": 146 + }, + { + "epoch": 0.5630835527890831, + "grad_norm": 0.6143420546717425, + "learning_rate": 1.6669999223036377e-05, + "logits/chosen": -0.278370201587677, + "logits/rejected": -0.31141722202301025, + "logps/chosen": -1040.6866455078125, + "logps/rejected": -1094.07373046875, + "loss": 0.0801, + "num_input_tokens_seen": 48490304, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.5565414428710938, + "rewards/margins": 4.861513137817383, + "rewards/rejected": -6.418054580688477, + "step": 147 + }, + { + "epoch": 0.5669140531481924, + "grad_norm": 0.38248241599823835, + "learning_rate": 1.662415777590172e-05, + "logits/chosen": -0.27843543887138367, + "logits/rejected": -0.27160152792930603, + "logps/chosen": -1058.7454833984375, + "logps/rejected": -1066.873779296875, + "loss": 0.0507, + "num_input_tokens_seen": 48808128, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.494460940361023, + "rewards/margins": 4.860630989074707, + "rewards/rejected": -6.3550920486450195, + "step": 148 + }, + { + "epoch": 0.5707445535073019, + "grad_norm": 0.6293538721002351, + "learning_rate": 1.657806693297079e-05, + "logits/chosen": -0.2736295461654663, + "logits/rejected": -0.29326513409614563, + "logps/chosen": -1070.2845458984375, + "logps/rejected": -1082.324462890625, + "loss": 0.1075, + "num_input_tokens_seen": 49144000, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.6812742948532104, + "rewards/margins": 4.656603813171387, + "rewards/rejected": -6.337878227233887, + "step": 149 + }, + { + "epoch": 0.5745750538664113, + "grad_norm": 0.48502799481905956, + "learning_rate": 1.6531728429537766e-05, + "logits/chosen": -0.30077147483825684, + "logits/rejected": -0.3063828945159912, + "logps/chosen": -1064.610595703125, + "logps/rejected": -1087.884521484375, + "loss": 0.0617, + "num_input_tokens_seen": 49469216, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.9076788425445557, + "rewards/margins": 4.812081813812256, + "rewards/rejected": -6.719760417938232, + "step": 150 + }, + { + "epoch": 0.5784055542255208, + "grad_norm": 0.5585457906443175, + "learning_rate": 1.6485144010221126e-05, + "logits/chosen": -0.3358997702598572, + "logits/rejected": -0.3236524760723114, + "logps/chosen": -1081.9306640625, + "logps/rejected": -1100.621826171875, + "loss": 0.0908, + "num_input_tokens_seen": 49799488, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.536299467086792, + "rewards/margins": 4.397487640380859, + "rewards/rejected": -5.933786869049072, + "step": 151 + }, + { + "epoch": 0.5822360545846301, + "grad_norm": 0.49421780204480903, + "learning_rate": 1.6438315428897914e-05, + "logits/chosen": -0.35712140798568726, + "logits/rejected": -0.3945716619491577, + "logps/chosen": -1069.7569580078125, + "logps/rejected": -1120.60888671875, + "loss": 0.0647, + "num_input_tokens_seen": 50129408, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.5219446420669556, + "rewards/margins": 4.791114807128906, + "rewards/rejected": -6.3130598068237305, + "step": 152 + }, + { + "epoch": 0.5860665549437395, + "grad_norm": 0.5523862609723885, + "learning_rate": 1.639124444863776e-05, + "logits/chosen": -0.27339184284210205, + "logits/rejected": -0.28280600905418396, + "logps/chosen": -1049.0377197265625, + "logps/rejected": -1081.47900390625, + "loss": 0.0852, + "num_input_tokens_seen": 50455968, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.2807562351226807, + "rewards/margins": 4.678353786468506, + "rewards/rejected": -5.959109783172607, + "step": 153 + }, + { + "epoch": 0.589897055302849, + "grad_norm": 0.3011941721115771, + "learning_rate": 1.6343932841636455e-05, + "logits/chosen": -0.31659185886383057, + "logits/rejected": -0.35317906737327576, + "logps/chosen": -1060.587158203125, + "logps/rejected": -1091.8394775390625, + "loss": 0.0384, + "num_input_tokens_seen": 50770496, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -1.1696479320526123, + "rewards/margins": 5.266739368438721, + "rewards/rejected": -6.436387062072754, + "step": 154 + }, + { + "epoch": 0.5937275556619583, + "grad_norm": 0.6500067699056428, + "learning_rate": 1.6296382389149273e-05, + "logits/chosen": -0.31897059082984924, + "logits/rejected": -0.3177844285964966, + "logps/chosen": -1066.132568359375, + "logps/rejected": -1111.0223388671875, + "loss": 0.1084, + "num_input_tokens_seen": 51106624, + "rewards/accuracies": 0.953125, + "rewards/chosen": -1.7926788330078125, + "rewards/margins": 4.2171783447265625, + "rewards/rejected": -6.009857177734375, + "step": 155 + }, + { + "epoch": 0.5975580560210677, + "grad_norm": 0.550693495948907, + "learning_rate": 1.6248594881423866e-05, + "logits/chosen": -0.3027258515357971, + "logits/rejected": -0.32225438952445984, + "logps/chosen": -1074.19921875, + "logps/rejected": -1092.076904296875, + "loss": 0.0808, + "num_input_tokens_seen": 51440256, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.0770642757415771, + "rewards/margins": 4.914764404296875, + "rewards/rejected": -5.991828441619873, + "step": 156 + }, + { + "epoch": 0.6013885563801772, + "grad_norm": 0.5956248845790991, + "learning_rate": 1.6200572117632892e-05, + "logits/chosen": -0.37387484312057495, + "logits/rejected": -0.3706851303577423, + "logps/chosen": -1051.87451171875, + "logps/rejected": -1089.0977783203125, + "loss": 0.0811, + "num_input_tokens_seen": 51767552, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.6899757385253906, + "rewards/margins": 4.654976844787598, + "rewards/rejected": -6.344952583312988, + "step": 157 + }, + { + "epoch": 0.6052190567392866, + "grad_norm": 0.40283862755644384, + "learning_rate": 1.615231590580627e-05, + "logits/chosen": -0.2697727382183075, + "logits/rejected": -0.2785693109035492, + "logps/chosen": -1067.231689453125, + "logps/rejected": -1077.2822265625, + "loss": 0.0552, + "num_input_tokens_seen": 52094208, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.0514568090438843, + "rewards/margins": 5.0262675285339355, + "rewards/rejected": -6.077724456787109, + "step": 158 + }, + { + "epoch": 0.609049557098396, + "grad_norm": 0.3846828968582389, + "learning_rate": 1.6103828062763095e-05, + "logits/chosen": -0.35383158922195435, + "logits/rejected": -0.3529141843318939, + "logps/chosen": -1086.27294921875, + "logps/rejected": -1131.7923583984375, + "loss": 0.0476, + "num_input_tokens_seen": 52431584, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -1.1795865297317505, + "rewards/margins": 5.213119029998779, + "rewards/rejected": -6.392704963684082, + "step": 159 + }, + { + "epoch": 0.6128800574575054, + "grad_norm": 0.4508770583843052, + "learning_rate": 1.605511041404326e-05, + "logits/chosen": -0.2959039807319641, + "logits/rejected": -0.320911169052124, + "logps/chosen": -1067.097412109375, + "logps/rejected": -1097.7996826171875, + "loss": 0.0607, + "num_input_tokens_seen": 52759680, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.608352541923523, + "rewards/margins": 4.9150896072387695, + "rewards/rejected": -6.523441791534424, + "step": 160 + }, + { + "epoch": 0.6167105578166148, + "grad_norm": 0.6310122045287392, + "learning_rate": 1.6006164793838693e-05, + "logits/chosen": -0.28234612941741943, + "logits/rejected": -0.2928387522697449, + "logps/chosen": -1074.974609375, + "logps/rejected": -1091.3623046875, + "loss": 0.0878, + "num_input_tokens_seen": 53090880, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.403192162513733, + "rewards/margins": 4.708822250366211, + "rewards/rejected": -6.112013816833496, + "step": 161 + }, + { + "epoch": 0.6205410581757242, + "grad_norm": 0.44152732131192657, + "learning_rate": 1.5956993044924334e-05, + "logits/chosen": -0.3540336489677429, + "logits/rejected": -0.34405604004859924, + "logps/chosen": -1038.75341796875, + "logps/rejected": -1104.8155517578125, + "loss": 0.064, + "num_input_tokens_seen": 53418208, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.5000206232070923, + "rewards/margins": 5.213437557220459, + "rewards/rejected": -6.713458061218262, + "step": 162 + }, + { + "epoch": 0.6243715585348336, + "grad_norm": 0.6491833699327055, + "learning_rate": 1.5907597018588746e-05, + "logits/chosen": -0.31376007199287415, + "logits/rejected": -0.3313858211040497, + "logps/chosen": -1032.480224609375, + "logps/rejected": -1095.984130859375, + "loss": 0.0851, + "num_input_tokens_seen": 53739520, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.4761462211608887, + "rewards/margins": 4.7697434425354, + "rewards/rejected": -6.245889663696289, + "step": 163 + }, + { + "epoch": 0.628202058893943, + "grad_norm": 0.6595326829758555, + "learning_rate": 1.585797857456439e-05, + "logits/chosen": -0.3295058608055115, + "logits/rejected": -0.3371520936489105, + "logps/chosen": -1093.2264404296875, + "logps/rejected": -1155.4815673828125, + "loss": 0.0936, + "num_input_tokens_seen": 54071936, + "rewards/accuracies": 0.953125, + "rewards/chosen": -1.6281415224075317, + "rewards/margins": 5.081623077392578, + "rewards/rejected": -6.70976448059082, + "step": 164 + }, + { + "epoch": 0.6320325592530525, + "grad_norm": 0.6908540698448176, + "learning_rate": 1.5808139580957647e-05, + "logits/chosen": -0.34094804525375366, + "logits/rejected": -0.3643835186958313, + "logps/chosen": -1052.879638671875, + "logps/rejected": -1081.25390625, + "loss": 0.0891, + "num_input_tokens_seen": 54390368, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.7019580602645874, + "rewards/margins": 5.046705722808838, + "rewards/rejected": -6.748663425445557, + "step": 165 + }, + { + "epoch": 0.6358630596121618, + "grad_norm": 0.7631658376915625, + "learning_rate": 1.5758081914178457e-05, + "logits/chosen": -0.3057851195335388, + "logits/rejected": -0.3119189441204071, + "logps/chosen": -1059.583984375, + "logps/rejected": -1089.928466796875, + "loss": 0.1092, + "num_input_tokens_seen": 54716928, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -1.3701412677764893, + "rewards/margins": 4.963718414306641, + "rewards/rejected": -6.333859443664551, + "step": 166 + }, + { + "epoch": 0.6396935599712712, + "grad_norm": 0.4345985531486534, + "learning_rate": 1.5707807458869675e-05, + "logits/chosen": -0.42637568712234497, + "logits/rejected": -0.45623651146888733, + "logps/chosen": -1053.9580078125, + "logps/rejected": -1106.2781982421875, + "loss": 0.0476, + "num_input_tokens_seen": 55043456, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -1.4241918325424194, + "rewards/margins": 5.542591094970703, + "rewards/rejected": -6.96678352355957, + "step": 167 + }, + { + "epoch": 0.6435240603303807, + "grad_norm": 0.4859848129347064, + "learning_rate": 1.5657318107836133e-05, + "logits/chosen": -0.29520362615585327, + "logits/rejected": -0.29335343837738037, + "logps/chosen": -1073.212158203125, + "logps/rejected": -1113.77197265625, + "loss": 0.074, + "num_input_tokens_seen": 55370368, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.4467662572860718, + "rewards/margins": 4.669836044311523, + "rewards/rejected": -6.116602420806885, + "step": 168 + }, + { + "epoch": 0.64735456068949, + "grad_norm": 0.3648463797543333, + "learning_rate": 1.560661576197336e-05, + "logits/chosen": -0.31153351068496704, + "logits/rejected": -0.28893059492111206, + "logps/chosen": -1043.7664794921875, + "logps/rejected": -1079.0833740234375, + "loss": 0.0451, + "num_input_tokens_seen": 55703360, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.5342607498168945, + "rewards/margins": 5.652307510375977, + "rewards/rejected": -7.186568737030029, + "step": 169 + }, + { + "epoch": 0.6511850610485995, + "grad_norm": 0.5094110724351522, + "learning_rate": 1.5555702330196024e-05, + "logits/chosen": -0.3514013886451721, + "logits/rejected": -0.361299067735672, + "logps/chosen": -1067.784912109375, + "logps/rejected": -1112.9228515625, + "loss": 0.0652, + "num_input_tokens_seen": 56034752, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.540859341621399, + "rewards/margins": 5.56430721282959, + "rewards/rejected": -7.105166435241699, + "step": 170 + }, + { + "epoch": 0.6550155614077089, + "grad_norm": 0.34032515882527165, + "learning_rate": 1.550457972936605e-05, + "logits/chosen": -0.34852537512779236, + "logits/rejected": -0.3557685911655426, + "logps/chosen": -1040.5858154296875, + "logps/rejected": -1089.8858642578125, + "loss": 0.0392, + "num_input_tokens_seen": 56355040, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9609565734863281, + "rewards/margins": 5.2837653160095215, + "rewards/rejected": -7.24472188949585, + "step": 171 + }, + { + "epoch": 0.6588460617668183, + "grad_norm": 0.8293783594865153, + "learning_rate": 1.5453249884220466e-05, + "logits/chosen": -0.3500719368457794, + "logits/rejected": -0.34363821148872375, + "logps/chosen": -1016.6444091796875, + "logps/rejected": -1063.8790283203125, + "loss": 0.1087, + "num_input_tokens_seen": 56676128, + "rewards/accuracies": 0.953125, + "rewards/chosen": -2.0376718044281006, + "rewards/margins": 5.306238174438477, + "rewards/rejected": -7.343910217285156, + "step": 172 + }, + { + "epoch": 0.6626765621259277, + "grad_norm": 0.5162779605833504, + "learning_rate": 1.540171472729893e-05, + "logits/chosen": -0.35712379217147827, + "logits/rejected": -0.359439492225647, + "logps/chosen": -1068.63525390625, + "logps/rejected": -1113.405517578125, + "loss": 0.0688, + "num_input_tokens_seen": 57004672, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.926776647567749, + "rewards/margins": 5.698601722717285, + "rewards/rejected": -7.6253790855407715, + "step": 173 + }, + { + "epoch": 0.6665070624850371, + "grad_norm": 0.37918759492527626, + "learning_rate": 1.5349976198870974e-05, + "logits/chosen": -0.39276063442230225, + "logits/rejected": -0.39930692315101624, + "logps/chosen": -1051.030517578125, + "logps/rejected": -1102.3223876953125, + "loss": 0.0475, + "num_input_tokens_seen": 57329792, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.241332769393921, + "rewards/margins": 5.592853546142578, + "rewards/rejected": -7.83418607711792, + "step": 174 + }, + { + "epoch": 0.6703375628441465, + "grad_norm": 0.4380137993949912, + "learning_rate": 1.529803624686295e-05, + "logits/chosen": -0.3385137915611267, + "logits/rejected": -0.3374669551849365, + "logps/chosen": -1090.6273193359375, + "logps/rejected": -1146.446533203125, + "loss": 0.0648, + "num_input_tokens_seen": 57664768, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.8906933069229126, + "rewards/margins": 5.4380950927734375, + "rewards/rejected": -7.328788757324219, + "step": 175 + }, + { + "epoch": 0.674168063203256, + "grad_norm": 0.585570791332763, + "learning_rate": 1.5245896826784689e-05, + "logits/chosen": -0.3907468318939209, + "logits/rejected": -0.41205736994743347, + "logps/chosen": -1121.990966796875, + "logps/rejected": -1161.072509765625, + "loss": 0.0709, + "num_input_tokens_seen": 58001760, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -2.253509998321533, + "rewards/margins": 5.232100009918213, + "rewards/rejected": -7.485610008239746, + "step": 176 + }, + { + "epoch": 0.6779985635623653, + "grad_norm": 0.6564462627665776, + "learning_rate": 1.5193559901655897e-05, + "logits/chosen": -0.3795199394226074, + "logits/rejected": -0.40783464908599854, + "logps/chosen": -1107.425537109375, + "logps/rejected": -1133.7227783203125, + "loss": 0.1017, + "num_input_tokens_seen": 58342336, + "rewards/accuracies": 0.953125, + "rewards/chosen": -2.219597816467285, + "rewards/margins": 4.832563400268555, + "rewards/rejected": -7.05216121673584, + "step": 177 + }, + { + "epoch": 0.6818290639214747, + "grad_norm": 0.499855231696093, + "learning_rate": 1.5141027441932217e-05, + "logits/chosen": -0.3829120695590973, + "logits/rejected": -0.3773517906665802, + "logps/chosen": -1081.32373046875, + "logps/rejected": -1126.15234375, + "loss": 0.0619, + "num_input_tokens_seen": 58670432, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.104212760925293, + "rewards/margins": 5.646474361419678, + "rewards/rejected": -7.750687599182129, + "step": 178 + }, + { + "epoch": 0.6856595642805842, + "grad_norm": 0.42994783825003535, + "learning_rate": 1.5088301425431072e-05, + "logits/chosen": -0.3850676417350769, + "logits/rejected": -0.39243972301483154, + "logps/chosen": -1061.424560546875, + "logps/rejected": -1093.44873046875, + "loss": 0.0519, + "num_input_tokens_seen": 58993472, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.4199581146240234, + "rewards/margins": 5.382256507873535, + "rewards/rejected": -7.802214622497559, + "step": 179 + }, + { + "epoch": 0.6894900646396935, + "grad_norm": 0.5754872426097162, + "learning_rate": 1.5035383837257178e-05, + "logits/chosen": -0.3468138575553894, + "logits/rejected": -0.30852875113487244, + "logps/chosen": -1063.0513916015625, + "logps/rejected": -1097.786865234375, + "loss": 0.0704, + "num_input_tokens_seen": 59321248, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -2.560081958770752, + "rewards/margins": 5.123150825500488, + "rewards/rejected": -7.683233261108398, + "step": 180 + }, + { + "epoch": 0.693320564998803, + "grad_norm": 0.5167463586251899, + "learning_rate": 1.498227666972782e-05, + "logits/chosen": -0.3862994909286499, + "logits/rejected": -0.40714791417121887, + "logps/chosen": -1029.218994140625, + "logps/rejected": -1085.8651123046875, + "loss": 0.0565, + "num_input_tokens_seen": 59640512, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.4606149196624756, + "rewards/margins": 5.344514846801758, + "rewards/rejected": -7.8051300048828125, + "step": 181 + }, + { + "epoch": 0.6971510653579124, + "grad_norm": 0.6036551491056049, + "learning_rate": 1.4928981922297842e-05, + "logits/chosen": -0.35210585594177246, + "logits/rejected": -0.3664399981498718, + "logps/chosen": -1019.4240112304688, + "logps/rejected": -1076.930908203125, + "loss": 0.0817, + "num_input_tokens_seen": 59963648, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.6374425888061523, + "rewards/margins": 5.176463603973389, + "rewards/rejected": -7.813906669616699, + "step": 182 + }, + { + "epoch": 0.7009815657170217, + "grad_norm": 0.6142428673288808, + "learning_rate": 1.4875501601484362e-05, + "logits/chosen": -0.3965800106525421, + "logits/rejected": -0.40107452869415283, + "logps/chosen": -1106.7393798828125, + "logps/rejected": -1169.469970703125, + "loss": 0.0783, + "num_input_tokens_seen": 60304064, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.603675365447998, + "rewards/margins": 5.148473262786865, + "rewards/rejected": -7.752148151397705, + "step": 183 + }, + { + "epoch": 0.7048120660761312, + "grad_norm": 0.5056582658705452, + "learning_rate": 1.482183772079123e-05, + "logits/chosen": -0.4066172242164612, + "logits/rejected": -0.4109392762184143, + "logps/chosen": -1081.2373046875, + "logps/rejected": -1136.6824951171875, + "loss": 0.0733, + "num_input_tokens_seen": 60630784, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.7854676246643066, + "rewards/margins": 5.170050621032715, + "rewards/rejected": -7.955517768859863, + "step": 184 + }, + { + "epoch": 0.7086425664352406, + "grad_norm": 0.5706621164820406, + "learning_rate": 1.4767992300633224e-05, + "logits/chosen": -0.3524749279022217, + "logits/rejected": -0.40085846185684204, + "logps/chosen": -1073.117919921875, + "logps/rejected": -1103.932373046875, + "loss": 0.0669, + "num_input_tokens_seen": 60955136, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.584977626800537, + "rewards/margins": 5.593213081359863, + "rewards/rejected": -8.178190231323242, + "step": 185 + }, + { + "epoch": 0.71247306679435, + "grad_norm": 0.6504328321711108, + "learning_rate": 1.4713967368259981e-05, + "logits/chosen": -0.3981163501739502, + "logits/rejected": -0.38565918803215027, + "logps/chosen": -1066.8680419921875, + "logps/rejected": -1102.06884765625, + "loss": 0.091, + "num_input_tokens_seen": 61287008, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -2.840730667114258, + "rewards/margins": 5.109292984008789, + "rewards/rejected": -7.950023651123047, + "step": 186 + }, + { + "epoch": 0.7163035671534594, + "grad_norm": 0.7217426418637043, + "learning_rate": 1.4659764957679663e-05, + "logits/chosen": -0.3458193242549896, + "logits/rejected": -0.3710017800331116, + "logps/chosen": -1051.720458984375, + "logps/rejected": -1093.107421875, + "loss": 0.0871, + "num_input_tokens_seen": 61610560, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -2.892113447189331, + "rewards/margins": 5.103034973144531, + "rewards/rejected": -7.995147705078125, + "step": 187 + }, + { + "epoch": 0.7201340675125688, + "grad_norm": 0.6161119693588761, + "learning_rate": 1.4605387109582401e-05, + "logits/chosen": -0.3587178587913513, + "logits/rejected": -0.40242552757263184, + "logps/chosen": -1095.86279296875, + "logps/rejected": -1108.609375, + "loss": 0.0876, + "num_input_tokens_seen": 61944384, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -3.295729637145996, + "rewards/margins": 4.967322826385498, + "rewards/rejected": -8.263051986694336, + "step": 188 + }, + { + "epoch": 0.7239645678716783, + "grad_norm": 0.452514989649038, + "learning_rate": 1.455083587126344e-05, + "logits/chosen": -0.3199668526649475, + "logits/rejected": -0.34406644105911255, + "logps/chosen": -1066.1934814453125, + "logps/rejected": -1115.670166015625, + "loss": 0.0526, + "num_input_tokens_seen": 62267712, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.221564769744873, + "rewards/margins": 5.354893207550049, + "rewards/rejected": -8.576457977294922, + "step": 189 + }, + { + "epoch": 0.7277950682307877, + "grad_norm": 0.5963884832289921, + "learning_rate": 1.4496113296546068e-05, + "logits/chosen": -0.35279780626296997, + "logits/rejected": -0.3623964190483093, + "logps/chosen": -1084.421142578125, + "logps/rejected": -1116.402587890625, + "loss": 0.1032, + "num_input_tokens_seen": 62595584, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.449268102645874, + "rewards/margins": 4.945537567138672, + "rewards/rejected": -8.394805908203125, + "step": 190 + }, + { + "epoch": 0.731625568589897, + "grad_norm": 0.26389518159181236, + "learning_rate": 1.4441221445704294e-05, + "logits/chosen": -0.2977907657623291, + "logits/rejected": -0.2962134778499603, + "logps/chosen": -1066.1024169921875, + "logps/rejected": -1140.6871337890625, + "loss": 0.0334, + "num_input_tokens_seen": 62921344, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8238296508789062, + "rewards/margins": 5.532756805419922, + "rewards/rejected": -8.356586456298828, + "step": 191 + }, + { + "epoch": 0.7354560689490065, + "grad_norm": 0.563054774067362, + "learning_rate": 1.4386162385385279e-05, + "logits/chosen": -0.4010685682296753, + "logits/rejected": -0.39404407143592834, + "logps/chosen": -1032.6571044921875, + "logps/rejected": -1080.37939453125, + "loss": 0.0724, + "num_input_tokens_seen": 63231200, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.2786214351654053, + "rewards/margins": 5.2652506828308105, + "rewards/rejected": -8.543872833251953, + "step": 192 + }, + { + "epoch": 0.7392865693081159, + "grad_norm": 0.4053083170936949, + "learning_rate": 1.433093818853152e-05, + "logits/chosen": -0.305337131023407, + "logits/rejected": -0.3222377896308899, + "logps/chosen": -1118.9312744140625, + "logps/rejected": -1171.0069580078125, + "loss": 0.0563, + "num_input_tokens_seen": 63570592, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.717423915863037, + "rewards/margins": 4.910149574279785, + "rewards/rejected": -8.627573013305664, + "step": 193 + }, + { + "epoch": 0.7431170696672252, + "grad_norm": 0.46090021771041406, + "learning_rate": 1.4275550934302822e-05, + "logits/chosen": -0.4093608558177948, + "logits/rejected": -0.42888912558555603, + "logps/chosen": -1089.6619873046875, + "logps/rejected": -1092.9468994140625, + "loss": 0.0592, + "num_input_tokens_seen": 63894464, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.2741451263427734, + "rewards/margins": 5.198902130126953, + "rewards/rejected": -8.473047256469727, + "step": 194 + }, + { + "epoch": 0.7469475700263347, + "grad_norm": 0.4948991363694334, + "learning_rate": 1.4220002707998e-05, + "logits/chosen": -0.30450543761253357, + "logits/rejected": -0.3249562978744507, + "logps/chosen": -1108.9569091796875, + "logps/rejected": -1141.0350341796875, + "loss": 0.0676, + "num_input_tokens_seen": 64227200, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -3.459994316101074, + "rewards/margins": 5.136832237243652, + "rewards/rejected": -8.596826553344727, + "step": 195 + }, + { + "epoch": 0.7507780703854441, + "grad_norm": 0.1796375117598255, + "learning_rate": 1.4164295600976375e-05, + "logits/chosen": -0.3392075300216675, + "logits/rejected": -0.35465580224990845, + "logps/chosen": -1096.398681640625, + "logps/rejected": -1181.217529296875, + "loss": 0.0188, + "num_input_tokens_seen": 64565504, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3717217445373535, + "rewards/margins": 5.682257652282715, + "rewards/rejected": -9.053979873657227, + "step": 196 + }, + { + "epoch": 0.7546085707445535, + "grad_norm": 0.832494859294556, + "learning_rate": 1.410843171057904e-05, + "logits/chosen": -0.4082401990890503, + "logits/rejected": -0.4145820438861847, + "logps/chosen": -1093.6285400390625, + "logps/rejected": -1126.459228515625, + "loss": 0.121, + "num_input_tokens_seen": 64911392, + "rewards/accuracies": 0.953125, + "rewards/chosen": -3.4425406455993652, + "rewards/margins": 5.461415767669678, + "rewards/rejected": -8.903956413269043, + "step": 197 + }, + { + "epoch": 0.7584390711036629, + "grad_norm": 0.48960331127394835, + "learning_rate": 1.4052413140049898e-05, + "logits/chosen": -0.359018474817276, + "logits/rejected": -0.36954444646835327, + "logps/chosen": -1098.950439453125, + "logps/rejected": -1157.846923828125, + "loss": 0.0567, + "num_input_tokens_seen": 65249856, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -3.391528606414795, + "rewards/margins": 5.364487648010254, + "rewards/rejected": -8.756016731262207, + "step": 198 + }, + { + "epoch": 0.7622695714627723, + "grad_norm": 0.41888025386199246, + "learning_rate": 1.399624199845647e-05, + "logits/chosen": -0.36450421810150146, + "logits/rejected": -0.35776621103286743, + "logps/chosen": -1066.9852294921875, + "logps/rejected": -1110.235595703125, + "loss": 0.054, + "num_input_tokens_seen": 65572384, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.035019874572754, + "rewards/margins": 5.2263898849487305, + "rewards/rejected": -8.261409759521484, + "step": 199 + }, + { + "epoch": 0.7661000718218818, + "grad_norm": 0.34396194639320754, + "learning_rate": 1.3939920400610483e-05, + "logits/chosen": -0.35331085324287415, + "logits/rejected": -0.369003027677536, + "logps/chosen": -1082.5882568359375, + "logps/rejected": -1140.47900390625, + "loss": 0.0364, + "num_input_tokens_seen": 65904512, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6791069507598877, + "rewards/margins": 5.489410400390625, + "rewards/rejected": -8.16851806640625, + "step": 200 + }, + { + "epoch": 0.7699305721809911, + "grad_norm": 0.5756888417034157, + "learning_rate": 1.3883450466988264e-05, + "logits/chosen": -0.37132593989372253, + "logits/rejected": -0.3989941477775574, + "logps/chosen": -1099.147705078125, + "logps/rejected": -1120.3642578125, + "loss": 0.0697, + "num_input_tokens_seen": 66239552, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.9705073833465576, + "rewards/margins": 5.108386039733887, + "rewards/rejected": -8.078892707824707, + "step": 201 + }, + { + "epoch": 0.7737610725401005, + "grad_norm": 0.3888874299042038, + "learning_rate": 1.3826834323650899e-05, + "logits/chosen": -0.4046036899089813, + "logits/rejected": -0.4258958399295807, + "logps/chosen": -1078.98046875, + "logps/rejected": -1108.484619140625, + "loss": 0.0349, + "num_input_tokens_seen": 66571296, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.6392245292663574, + "rewards/margins": 5.178862571716309, + "rewards/rejected": -7.818087577819824, + "step": 202 + }, + { + "epoch": 0.77759157289921, + "grad_norm": 0.5499625356704353, + "learning_rate": 1.3770074102164184e-05, + "logits/chosen": -0.4315694570541382, + "logits/rejected": -0.46639716625213623, + "logps/chosen": -1080.9366455078125, + "logps/rejected": -1120.23486328125, + "loss": 0.0654, + "num_input_tokens_seen": 66909408, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.724909782409668, + "rewards/margins": 5.231980323791504, + "rewards/rejected": -7.956890106201172, + "step": 203 + }, + { + "epoch": 0.7814220732583194, + "grad_norm": 0.32743320487857014, + "learning_rate": 1.3713171939518378e-05, + "logits/chosen": -0.36352843046188354, + "logits/rejected": -0.36441731452941895, + "logps/chosen": -1051.1240234375, + "logps/rejected": -1068.173828125, + "loss": 0.0455, + "num_input_tokens_seen": 67225952, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.908391237258911, + "rewards/margins": 5.259853363037109, + "rewards/rejected": -8.168244361877441, + "step": 204 + }, + { + "epoch": 0.7852525736174287, + "grad_norm": 0.42480802495516595, + "learning_rate": 1.365612997804774e-05, + "logits/chosen": -0.3659301996231079, + "logits/rejected": -0.3896404206752777, + "logps/chosen": -1093.025390625, + "logps/rejected": -1139.8154296875, + "loss": 0.051, + "num_input_tokens_seen": 67570528, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.0012338161468506, + "rewards/margins": 5.440090179443359, + "rewards/rejected": -8.441324234008789, + "step": 205 + }, + { + "epoch": 0.7890830739765382, + "grad_norm": 0.6343770054148905, + "learning_rate": 1.3598950365349884e-05, + "logits/chosen": -0.45734673738479614, + "logits/rejected": -0.4893167018890381, + "logps/chosen": -1073.5565185546875, + "logps/rejected": -1092.49658203125, + "loss": 0.0726, + "num_input_tokens_seen": 67896000, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.3900272846221924, + "rewards/margins": 5.502257347106934, + "rewards/rejected": -8.892284393310547, + "step": 206 + }, + { + "epoch": 0.7929135743356476, + "grad_norm": 0.6753410307079385, + "learning_rate": 1.3541635254204906e-05, + "logits/chosen": -0.3509202301502228, + "logits/rejected": -0.36146238446235657, + "logps/chosen": -1070.9962158203125, + "logps/rejected": -1128.6781005859375, + "loss": 0.0955, + "num_input_tokens_seen": 68227488, + "rewards/accuracies": 0.953125, + "rewards/chosen": -2.77247953414917, + "rewards/margins": 5.470081329345703, + "rewards/rejected": -8.242561340332031, + "step": 207 + }, + { + "epoch": 0.796744074694757, + "grad_norm": 0.5050822651586315, + "learning_rate": 1.3484186802494346e-05, + "logits/chosen": -0.43409159779548645, + "logits/rejected": -0.4423712491989136, + "logps/chosen": -1121.501220703125, + "logps/rejected": -1153.7506103515625, + "loss": 0.0521, + "num_input_tokens_seen": 68567584, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.795149326324463, + "rewards/margins": 5.651962757110596, + "rewards/rejected": -8.447112083435059, + "step": 208 + }, + { + "epoch": 0.8005745750538664, + "grad_norm": 0.627706833523695, + "learning_rate": 1.3426607173119945e-05, + "logits/chosen": -0.42920294404029846, + "logits/rejected": -0.43402910232543945, + "logps/chosen": -1098.674072265625, + "logps/rejected": -1134.579833984375, + "loss": 0.0754, + "num_input_tokens_seen": 68895840, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -3.266181468963623, + "rewards/margins": 5.105745315551758, + "rewards/rejected": -8.371927261352539, + "step": 209 + }, + { + "epoch": 0.8044050754129758, + "grad_norm": 0.5939158910682291, + "learning_rate": 1.3368898533922202e-05, + "logits/chosen": -0.33013007044792175, + "logits/rejected": -0.3365688920021057, + "logps/chosen": -1077.737060546875, + "logps/rejected": -1117.476318359375, + "loss": 0.0735, + "num_input_tokens_seen": 69229696, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -3.002263307571411, + "rewards/margins": 5.478519439697266, + "rewards/rejected": -8.480783462524414, + "step": 210 + }, + { + "epoch": 0.8082355757720853, + "grad_norm": 0.42472340016603405, + "learning_rate": 1.3311063057598765e-05, + "logits/chosen": -0.44070786237716675, + "logits/rejected": -0.433513343334198, + "logps/chosen": -1109.822265625, + "logps/rejected": -1143.8209228515625, + "loss": 0.0477, + "num_input_tokens_seen": 69564640, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.2324776649475098, + "rewards/margins": 5.777805328369141, + "rewards/rejected": -9.010282516479492, + "step": 211 + }, + { + "epoch": 0.8120660761311946, + "grad_norm": 0.6848085847164826, + "learning_rate": 1.3253102921622632e-05, + "logits/chosen": -0.38287749886512756, + "logits/rejected": -0.40390366315841675, + "logps/chosen": -1101.7408447265625, + "logps/rejected": -1139.66064453125, + "loss": 0.0906, + "num_input_tokens_seen": 69908864, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.3320484161376953, + "rewards/margins": 5.413163185119629, + "rewards/rejected": -8.74521255493164, + "step": 212 + }, + { + "epoch": 0.815896576490304, + "grad_norm": 0.5107487502059916, + "learning_rate": 1.3195020308160157e-05, + "logits/chosen": -0.421735942363739, + "logits/rejected": -0.4362775683403015, + "logps/chosen": -1068.7615966796875, + "logps/rejected": -1106.69921875, + "loss": 0.0585, + "num_input_tokens_seen": 70225312, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.6092147827148438, + "rewards/margins": 5.352176666259766, + "rewards/rejected": -8.96139144897461, + "step": 213 + }, + { + "epoch": 0.8197270768494135, + "grad_norm": 0.5165736158292992, + "learning_rate": 1.3136817403988918e-05, + "logits/chosen": -0.3840211033821106, + "logits/rejected": -0.3841598927974701, + "logps/chosen": -1119.8875732421875, + "logps/rejected": -1167.1761474609375, + "loss": 0.0615, + "num_input_tokens_seen": 70567520, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.1678307056427, + "rewards/margins": 5.611310005187988, + "rewards/rejected": -8.77914047241211, + "step": 214 + }, + { + "epoch": 0.8235575772085229, + "grad_norm": 0.35043211758055187, + "learning_rate": 1.307849640041535e-05, + "logits/chosen": -0.36969754099845886, + "logits/rejected": -0.3974273204803467, + "logps/chosen": -1099.508056640625, + "logps/rejected": -1124.4560546875, + "loss": 0.0424, + "num_input_tokens_seen": 70901728, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.320462703704834, + "rewards/margins": 5.666969299316406, + "rewards/rejected": -8.987432479858398, + "step": 215 + }, + { + "epoch": 0.8273880775676322, + "grad_norm": 0.4571919928464332, + "learning_rate": 1.3020059493192283e-05, + "logits/chosen": -0.41503724455833435, + "logits/rejected": -0.40935584902763367, + "logps/chosen": -1073.09326171875, + "logps/rejected": -1098.550537109375, + "loss": 0.0473, + "num_input_tokens_seen": 71229504, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.0290937423706055, + "rewards/margins": 5.423949718475342, + "rewards/rejected": -8.453042984008789, + "step": 216 + }, + { + "epoch": 0.8312185779267417, + "grad_norm": 0.5394567914854616, + "learning_rate": 1.296150888243624e-05, + "logits/chosen": -0.36737823486328125, + "logits/rejected": -0.37759262323379517, + "logps/chosen": -1049.029541015625, + "logps/rejected": -1096.068115234375, + "loss": 0.0753, + "num_input_tokens_seen": 71554720, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -3.3441693782806396, + "rewards/margins": 5.3865966796875, + "rewards/rejected": -8.730766296386719, + "step": 217 + }, + { + "epoch": 0.8350490782858511, + "grad_norm": 0.6428866516438753, + "learning_rate": 1.2902846772544625e-05, + "logits/chosen": -0.3706621825695038, + "logits/rejected": -0.39000797271728516, + "logps/chosen": -1103.883056640625, + "logps/rejected": -1154.616943359375, + "loss": 0.0771, + "num_input_tokens_seen": 71885408, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -2.843276023864746, + "rewards/margins": 5.670588493347168, + "rewards/rejected": -8.513864517211914, + "step": 218 + }, + { + "epoch": 0.8388795786449605, + "grad_norm": 0.5826327143182622, + "learning_rate": 1.2844075372112718e-05, + "logits/chosen": -0.3488718867301941, + "logits/rejected": -0.3805573582649231, + "logps/chosen": -1066.004150390625, + "logps/rejected": -1113.3624267578125, + "loss": 0.0798, + "num_input_tokens_seen": 72221248, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -2.7812976837158203, + "rewards/margins": 5.480805397033691, + "rewards/rejected": -8.262102127075195, + "step": 219 + }, + { + "epoch": 0.8427100790040699, + "grad_norm": 0.3966779560368587, + "learning_rate": 1.2785196893850532e-05, + "logits/chosen": -0.35187608003616333, + "logits/rejected": -0.3700043857097626, + "logps/chosen": -1070.1513671875, + "logps/rejected": -1103.05419921875, + "loss": 0.0485, + "num_input_tokens_seen": 72548736, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.55346417427063, + "rewards/margins": 5.652145862579346, + "rewards/rejected": -8.205610275268555, + "step": 220 + }, + { + "epoch": 0.8465405793631793, + "grad_norm": 0.5354933978155185, + "learning_rate": 1.2726213554499491e-05, + "logits/chosen": -0.4636977016925812, + "logits/rejected": -0.4594433009624481, + "logps/chosen": -1089.219482421875, + "logps/rejected": -1117.517333984375, + "loss": 0.0641, + "num_input_tokens_seen": 72872864, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.7893753051757812, + "rewards/margins": 5.619229793548584, + "rewards/rejected": -8.408605575561523, + "step": 221 + }, + { + "epoch": 0.8503710797222888, + "grad_norm": 0.5941370577166448, + "learning_rate": 1.2667127574748985e-05, + "logits/chosen": -0.39592477679252625, + "logits/rejected": -0.41650301218032837, + "logps/chosen": -1092.7911376953125, + "logps/rejected": -1136.67236328125, + "loss": 0.0598, + "num_input_tokens_seen": 73196896, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.299659490585327, + "rewards/margins": 5.945974349975586, + "rewards/rejected": -8.245634078979492, + "step": 222 + }, + { + "epoch": 0.8542015800813981, + "grad_norm": 0.3982821428102479, + "learning_rate": 1.2607941179152756e-05, + "logits/chosen": -0.4786827266216278, + "logits/rejected": -0.480704665184021, + "logps/chosen": -1096.277099609375, + "logps/rejected": -1136.999267578125, + "loss": 0.0427, + "num_input_tokens_seen": 73533536, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.3316447734832764, + "rewards/margins": 5.617600917816162, + "rewards/rejected": -7.949245452880859, + "step": 223 + }, + { + "epoch": 0.8580320804405075, + "grad_norm": 0.33703274992377913, + "learning_rate": 1.2548656596045147e-05, + "logits/chosen": -0.4593358039855957, + "logits/rejected": -0.4720173478126526, + "logps/chosen": -1071.749267578125, + "logps/rejected": -1098.951904296875, + "loss": 0.0342, + "num_input_tokens_seen": 73857184, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -1.84685480594635, + "rewards/margins": 6.092759132385254, + "rewards/rejected": -7.9396138191223145, + "step": 224 + }, + { + "epoch": 0.861862580799617, + "grad_norm": 0.3711425277338864, + "learning_rate": 1.2489276057457205e-05, + "logits/chosen": -0.4213397204875946, + "logits/rejected": -0.41080206632614136, + "logps/chosen": -1118.953857421875, + "logps/rejected": -1141.2943115234375, + "loss": 0.043, + "num_input_tokens_seen": 74191872, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.171072244644165, + "rewards/margins": 5.28747034072876, + "rewards/rejected": -7.458542346954346, + "step": 225 + }, + { + "epoch": 0.8656930811587263, + "grad_norm": 0.5265600870746873, + "learning_rate": 1.242980179903264e-05, + "logits/chosen": -0.38690048456192017, + "logits/rejected": -0.40241944789886475, + "logps/chosen": -1043.50341796875, + "logps/rejected": -1118.68359375, + "loss": 0.0519, + "num_input_tokens_seen": 74516224, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.320708751678467, + "rewards/margins": 5.84165096282959, + "rewards/rejected": -8.162359237670898, + "step": 226 + }, + { + "epoch": 0.8695235815178358, + "grad_norm": 0.5513802923025586, + "learning_rate": 1.2370236059943674e-05, + "logits/chosen": -0.3797294497489929, + "logits/rejected": -0.3786570429801941, + "logps/chosen": -1036.927734375, + "logps/rejected": -1075.22607421875, + "loss": 0.058, + "num_input_tokens_seen": 74841280, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.146742105484009, + "rewards/margins": 5.955057144165039, + "rewards/rejected": -8.101799964904785, + "step": 227 + }, + { + "epoch": 0.8733540818769452, + "grad_norm": 0.652769944783012, + "learning_rate": 1.2310581082806713e-05, + "logits/chosen": -0.47347062826156616, + "logits/rejected": -0.48015111684799194, + "logps/chosen": -1083.6005859375, + "logps/rejected": -1138.885009765625, + "loss": 0.0781, + "num_input_tokens_seen": 75173696, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.2463467121124268, + "rewards/margins": 5.615927219390869, + "rewards/rejected": -7.862273693084717, + "step": 228 + }, + { + "epoch": 0.8771845822360546, + "grad_norm": 0.3723262041804569, + "learning_rate": 1.2250839113597928e-05, + "logits/chosen": -0.4572444558143616, + "logits/rejected": -0.44295501708984375, + "logps/chosen": -1014.3811645507812, + "logps/rejected": -1062.399169921875, + "loss": 0.0447, + "num_input_tokens_seen": 75480896, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.0660948753356934, + "rewards/margins": 5.59710693359375, + "rewards/rejected": -7.663201332092285, + "step": 229 + }, + { + "epoch": 0.881015082595164, + "grad_norm": 0.3916120089474223, + "learning_rate": 1.2191012401568698e-05, + "logits/chosen": -0.4184446334838867, + "logits/rejected": -0.4254826605319977, + "logps/chosen": -1097.74169921875, + "logps/rejected": -1137.325439453125, + "loss": 0.0417, + "num_input_tokens_seen": 75812608, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -1.843390941619873, + "rewards/margins": 5.98928165435791, + "rewards/rejected": -7.832672119140625, + "step": 230 + }, + { + "epoch": 0.8848455829542734, + "grad_norm": 0.5289171888276779, + "learning_rate": 1.2131103199160913e-05, + "logits/chosen": -0.41859838366508484, + "logits/rejected": -0.412430077791214, + "logps/chosen": -1084.5152587890625, + "logps/rejected": -1096.826416015625, + "loss": 0.0535, + "num_input_tokens_seen": 76142400, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -2.179537773132324, + "rewards/margins": 5.520472526550293, + "rewards/rejected": -7.700010299682617, + "step": 231 + }, + { + "epoch": 0.8886760833133828, + "grad_norm": 0.4637638815574396, + "learning_rate": 1.2071113761922187e-05, + "logits/chosen": -0.4305140972137451, + "logits/rejected": -0.45257967710494995, + "logps/chosen": -1084.440185546875, + "logps/rejected": -1121.800537109375, + "loss": 0.0463, + "num_input_tokens_seen": 76489376, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.1045100688934326, + "rewards/margins": 5.712845802307129, + "rewards/rejected": -7.817356109619141, + "step": 232 + }, + { + "epoch": 0.8925065836724922, + "grad_norm": 0.3380595004943603, + "learning_rate": 1.2011046348420921e-05, + "logits/chosen": -0.4042135775089264, + "logits/rejected": -0.40141215920448303, + "logps/chosen": -1056.2340087890625, + "logps/rejected": -1110.959716796875, + "loss": 0.0358, + "num_input_tokens_seen": 76818304, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.0610506534576416, + "rewards/margins": 5.407949447631836, + "rewards/rejected": -7.469000339508057, + "step": 233 + }, + { + "epoch": 0.8963370840316016, + "grad_norm": 0.6134155225524351, + "learning_rate": 1.1950903220161286e-05, + "logits/chosen": -0.45010632276535034, + "logits/rejected": -0.45799732208251953, + "logps/chosen": -1112.919921875, + "logps/rejected": -1178.3665771484375, + "loss": 0.0726, + "num_input_tokens_seen": 77158272, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.8620401620864868, + "rewards/margins": 5.740628719329834, + "rewards/rejected": -7.6026692390441895, + "step": 234 + }, + { + "epoch": 0.900167584390711, + "grad_norm": 0.6628282434411974, + "learning_rate": 1.1890686641498064e-05, + "logits/chosen": -0.4123459756374359, + "logits/rejected": -0.43458613753318787, + "logps/chosen": -1055.947509765625, + "logps/rejected": -1101.5477294921875, + "loss": 0.079, + "num_input_tokens_seen": 77483360, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.427839756011963, + "rewards/margins": 5.394449234008789, + "rewards/rejected": -7.82228946685791, + "step": 235 + }, + { + "epoch": 0.9039980847498205, + "grad_norm": 0.4574423216270021, + "learning_rate": 1.1830398879551412e-05, + "logits/chosen": -0.4544453024864197, + "logits/rejected": -0.4579254984855652, + "logps/chosen": -1077.8934326171875, + "logps/rejected": -1105.59375, + "loss": 0.0488, + "num_input_tokens_seen": 77818944, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.117683172225952, + "rewards/margins": 5.758082389831543, + "rewards/rejected": -7.875765800476074, + "step": 236 + }, + { + "epoch": 0.9078285851089298, + "grad_norm": 0.6854203861764628, + "learning_rate": 1.177004220412149e-05, + "logits/chosen": -0.4499816596508026, + "logits/rejected": -0.4586358666419983, + "logps/chosen": -1068.04052734375, + "logps/rejected": -1118.0302734375, + "loss": 0.0913, + "num_input_tokens_seen": 78145344, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.3535232543945312, + "rewards/margins": 5.176707744598389, + "rewards/rejected": -7.530230522155762, + "step": 237 + }, + { + "epoch": 0.9116590854680393, + "grad_norm": 0.47085958324735183, + "learning_rate": 1.1709618887603013e-05, + "logits/chosen": -0.407639741897583, + "logits/rejected": -0.4041568636894226, + "logps/chosen": -1091.763427734375, + "logps/rejected": -1152.5400390625, + "loss": 0.0529, + "num_input_tokens_seen": 78488160, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -2.1383814811706543, + "rewards/margins": 5.399660587310791, + "rewards/rejected": -7.538042068481445, + "step": 238 + }, + { + "epoch": 0.9154895858271487, + "grad_norm": 0.5355975766395914, + "learning_rate": 1.1649131204899702e-05, + "logits/chosen": -0.4186476469039917, + "logits/rejected": -0.4375460147857666, + "logps/chosen": -1099.92822265625, + "logps/rejected": -1156.2044677734375, + "loss": 0.0568, + "num_input_tokens_seen": 78819968, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -2.599379539489746, + "rewards/margins": 5.559235095977783, + "rewards/rejected": -8.158615112304688, + "step": 239 + }, + { + "epoch": 0.919320086186258, + "grad_norm": 0.5543549505429072, + "learning_rate": 1.1588581433338614e-05, + "logits/chosen": -0.4291573762893677, + "logits/rejected": -0.4474974274635315, + "logps/chosen": -1047.956787109375, + "logps/rejected": -1112.687744140625, + "loss": 0.0574, + "num_input_tokens_seen": 79143136, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.6260874271392822, + "rewards/margins": 5.547441482543945, + "rewards/rejected": -8.173528671264648, + "step": 240 + }, + { + "epoch": 0.9231505865453675, + "grad_norm": 0.7089623528362986, + "learning_rate": 1.1527971852584434e-05, + "logits/chosen": -0.4518323540687561, + "logits/rejected": -0.4442501664161682, + "logps/chosen": -1112.748779296875, + "logps/rejected": -1131.294677734375, + "loss": 0.0927, + "num_input_tokens_seen": 79482752, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.4785447120666504, + "rewards/margins": 5.350704193115234, + "rewards/rejected": -7.829249382019043, + "step": 241 + }, + { + "epoch": 0.9269810869044769, + "grad_norm": 0.47005488914909815, + "learning_rate": 1.1467304744553618e-05, + "logits/chosen": -0.3662518858909607, + "logits/rejected": -0.4030444622039795, + "logps/chosen": -1061.30712890625, + "logps/rejected": -1115.3720703125, + "loss": 0.047, + "num_input_tokens_seen": 79813888, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.354909658432007, + "rewards/margins": 5.655545234680176, + "rewards/rejected": -8.010455131530762, + "step": 242 + }, + { + "epoch": 0.9308115872635863, + "grad_norm": 0.508231331328986, + "learning_rate": 1.1406582393328493e-05, + "logits/chosen": -0.482490599155426, + "logits/rejected": -0.49022340774536133, + "logps/chosen": -1103.95751953125, + "logps/rejected": -1150.1783447265625, + "loss": 0.0475, + "num_input_tokens_seen": 80145696, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.2599711418151855, + "rewards/margins": 6.204766273498535, + "rewards/rejected": -8.464736938476562, + "step": 243 + }, + { + "epoch": 0.9346420876226957, + "grad_norm": 0.3814970113159265, + "learning_rate": 1.1345807085071263e-05, + "logits/chosen": -0.3977491855621338, + "logits/rejected": -0.4104079604148865, + "logps/chosen": -1054.0306396484375, + "logps/rejected": -1096.1517333984375, + "loss": 0.0439, + "num_input_tokens_seen": 80471072, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.727005958557129, + "rewards/margins": 5.943853855133057, + "rewards/rejected": -8.670860290527344, + "step": 244 + }, + { + "epoch": 0.9384725879818051, + "grad_norm": 0.31580598646062713, + "learning_rate": 1.1284981107937933e-05, + "logits/chosen": -0.4471209645271301, + "logits/rejected": -0.4528927206993103, + "logps/chosen": -1014.870849609375, + "logps/rejected": -1071.708251953125, + "loss": 0.0314, + "num_input_tokens_seen": 80780224, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.084810256958008, + "rewards/margins": 5.770332336425781, + "rewards/rejected": -8.855142593383789, + "step": 245 + }, + { + "epoch": 0.9423030883409146, + "grad_norm": 0.2761772405627037, + "learning_rate": 1.1224106751992164e-05, + "logits/chosen": -0.3892216086387634, + "logits/rejected": -0.3928696811199188, + "logps/chosen": -1045.1275634765625, + "logps/rejected": -1095.2027587890625, + "loss": 0.0276, + "num_input_tokens_seen": 81102880, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.408397674560547, + "rewards/margins": 5.988190174102783, + "rewards/rejected": -8.396587371826172, + "step": 246 + }, + { + "epoch": 0.946133588700024, + "grad_norm": 0.2291554882462916, + "learning_rate": 1.116318630911905e-05, + "logits/chosen": -0.44851553440093994, + "logits/rejected": -0.4727195203304291, + "logps/chosen": -1067.846923828125, + "logps/rejected": -1121.5997314453125, + "loss": 0.0225, + "num_input_tokens_seen": 81435296, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.589552402496338, + "rewards/margins": 5.99311637878418, + "rewards/rejected": -8.582669258117676, + "step": 247 + }, + { + "epoch": 0.9499640890591333, + "grad_norm": 0.6068887187814143, + "learning_rate": 1.1102222072938832e-05, + "logits/chosen": -0.4842962920665741, + "logits/rejected": -0.5168699026107788, + "logps/chosen": -1098.196533203125, + "logps/rejected": -1135.597900390625, + "loss": 0.0724, + "num_input_tokens_seen": 81773952, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.3644886016845703, + "rewards/margins": 6.153889179229736, + "rewards/rejected": -8.518377304077148, + "step": 248 + }, + { + "epoch": 0.9537945894182428, + "grad_norm": 0.4865703125863599, + "learning_rate": 1.1041216338720548e-05, + "logits/chosen": -0.41839316487312317, + "logits/rejected": -0.4279792308807373, + "logps/chosen": -1119.8089599609375, + "logps/rejected": -1167.3818359375, + "loss": 0.0524, + "num_input_tokens_seen": 82128992, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.2524805068969727, + "rewards/margins": 5.727254867553711, + "rewards/rejected": -7.979735374450684, + "step": 249 + }, + { + "epoch": 0.9576250897773522, + "grad_norm": 0.6557650773239859, + "learning_rate": 1.098017140329561e-05, + "logits/chosen": -0.34586331248283386, + "logits/rejected": -0.3445798456668854, + "logps/chosen": -1065.8570556640625, + "logps/rejected": -1113.523681640625, + "loss": 0.1022, + "num_input_tokens_seen": 82458624, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.5229134559631348, + "rewards/margins": 5.115533351898193, + "rewards/rejected": -7.638446807861328, + "step": 250 + }, + { + "epoch": 0.9614555901364615, + "grad_norm": 0.6187233083889037, + "learning_rate": 1.0919089564971328e-05, + "logits/chosen": -0.3793196678161621, + "logits/rejected": -0.3830195665359497, + "logps/chosen": -1061.614990234375, + "logps/rejected": -1103.16943359375, + "loss": 0.0704, + "num_input_tokens_seen": 82790912, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -2.7397522926330566, + "rewards/margins": 5.539724349975586, + "rewards/rejected": -8.279476165771484, + "step": 251 + }, + { + "epoch": 0.965286090495571, + "grad_norm": 0.31427787674914964, + "learning_rate": 1.0857973123444401e-05, + "logits/chosen": -0.4219928979873657, + "logits/rejected": -0.4239566922187805, + "logps/chosen": -1041.800537109375, + "logps/rejected": -1084.5595703125, + "loss": 0.029, + "num_input_tokens_seen": 83119392, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.5621440410614014, + "rewards/margins": 5.746786117553711, + "rewards/rejected": -8.308929443359375, + "step": 252 + }, + { + "epoch": 0.9691165908546804, + "grad_norm": 0.5126245440637385, + "learning_rate": 1.0796824379714301e-05, + "logits/chosen": -0.4406627416610718, + "logits/rejected": -0.4481799006462097, + "logps/chosen": -1085.403076171875, + "logps/rejected": -1130.52294921875, + "loss": 0.066, + "num_input_tokens_seen": 83449888, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.1633081436157227, + "rewards/margins": 5.643986701965332, + "rewards/rejected": -7.8072943687438965, + "step": 253 + }, + { + "epoch": 0.9729470912137897, + "grad_norm": 0.6006796612190157, + "learning_rate": 1.0735645635996676e-05, + "logits/chosen": -0.3469582200050354, + "logits/rejected": -0.35880526900291443, + "logps/chosen": -1076.3043212890625, + "logps/rejected": -1112.580810546875, + "loss": 0.0716, + "num_input_tokens_seen": 83792608, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.0452232360839844, + "rewards/margins": 5.419180870056152, + "rewards/rejected": -8.464404106140137, + "step": 254 + }, + { + "epoch": 0.9767775915728992, + "grad_norm": 0.5737368919422813, + "learning_rate": 1.067443919563664e-05, + "logits/chosen": -0.4346870183944702, + "logits/rejected": -0.4354139268398285, + "logps/chosen": -1074.835693359375, + "logps/rejected": -1114.8531494140625, + "loss": 0.0613, + "num_input_tokens_seen": 84118304, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.328031539916992, + "rewards/margins": 5.7469048500061035, + "rewards/rejected": -8.074935913085938, + "step": 255 + }, + { + "epoch": 0.9806080919320086, + "grad_norm": 0.2988352126144153, + "learning_rate": 1.0613207363022086e-05, + "logits/chosen": -0.42018818855285645, + "logits/rejected": -0.4369983375072479, + "logps/chosen": -1037.9376220703125, + "logps/rejected": -1070.4998779296875, + "loss": 0.0284, + "num_input_tokens_seen": 84443392, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.698380470275879, + "rewards/margins": 5.725028991699219, + "rewards/rejected": -8.423409461975098, + "step": 256 + }, + { + "epoch": 0.9844385922911181, + "grad_norm": 0.5114648724706398, + "learning_rate": 1.0551952443496902e-05, + "logits/chosen": -0.398377925157547, + "logits/rejected": -0.38936707377433777, + "logps/chosen": -1090.4180908203125, + "logps/rejected": -1127.52392578125, + "loss": 0.0571, + "num_input_tokens_seen": 84790656, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.041167974472046, + "rewards/margins": 5.559525012969971, + "rewards/rejected": -7.600693225860596, + "step": 257 + }, + { + "epoch": 0.9882690926502274, + "grad_norm": 0.5608043815070605, + "learning_rate": 1.0490676743274181e-05, + "logits/chosen": -0.39295387268066406, + "logits/rejected": -0.4099191427230835, + "logps/chosen": -1082.97119140625, + "logps/rejected": -1107.43359375, + "loss": 0.0607, + "num_input_tokens_seen": 85136640, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -2.5679168701171875, + "rewards/margins": 5.422738075256348, + "rewards/rejected": -7.990654945373535, + "step": 258 + }, + { + "epoch": 0.9920995930093368, + "grad_norm": 0.46522428007055977, + "learning_rate": 1.042938256934941e-05, + "logits/chosen": -0.390384703874588, + "logits/rejected": -0.4041425883769989, + "logps/chosen": -1103.4140625, + "logps/rejected": -1152.4345703125, + "loss": 0.0497, + "num_input_tokens_seen": 85474048, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.8531684875488281, + "rewards/margins": 5.739438533782959, + "rewards/rejected": -7.592607021331787, + "step": 259 + }, + { + "epoch": 0.9959300933684463, + "grad_norm": 0.5796653829303974, + "learning_rate": 1.036807222941359e-05, + "logits/chosen": -0.49759235978126526, + "logits/rejected": -0.5076637268066406, + "logps/chosen": -1079.9263916015625, + "logps/rejected": -1139.41552734375, + "loss": 0.0568, + "num_input_tokens_seen": 85797408, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.807788133621216, + "rewards/margins": 5.715126037597656, + "rewards/rejected": -8.52291488647461, + "step": 260 + }, + { + "epoch": 0.9997605937275557, + "grad_norm": 0.32648603121066433, + "learning_rate": 1.0306748031766367e-05, + "logits/chosen": -0.47008639574050903, + "logits/rejected": -0.4414535164833069, + "logps/chosen": -1073.38916015625, + "logps/rejected": -1130.3580322265625, + "loss": 0.0328, + "num_input_tokens_seen": 86127744, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7792227268218994, + "rewards/margins": 5.945899963378906, + "rewards/rejected": -8.725122451782227, + "step": 261 + }, + { + "epoch": 1.0, + "grad_norm": 0.32648603121066433, + "learning_rate": 1.0245412285229124e-05, + "logits/chosen": -0.40284958481788635, + "logits/rejected": -0.4234589636325836, + "logps/chosen": -1139.0733642578125, + "logps/rejected": -1121.179443359375, + "loss": 0.0049, + "num_input_tokens_seen": 86150848, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4898324012756348, + "rewards/margins": 4.945976257324219, + "rewards/rejected": -6.435808181762695, + "step": 262 + }, + { + "epoch": 1.0038305003591095, + "grad_norm": 0.30482628627859, + "learning_rate": 1.018406729905805e-05, + "logits/chosen": -0.43190330266952515, + "logits/rejected": -0.42813292145729065, + "logps/chosen": -1064.09033203125, + "logps/rejected": -1104.1912841796875, + "loss": 0.0297, + "num_input_tokens_seen": 86470976, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.862783908843994, + "rewards/margins": 5.934267044067383, + "rewards/rejected": -8.797050476074219, + "step": 263 + }, + { + "epoch": 1.0076610007182187, + "grad_norm": 0.4200115677072683, + "learning_rate": 1.01227153828572e-05, + "logits/chosen": -0.4512402415275574, + "logits/rejected": -0.45330795645713806, + "logps/chosen": -1100.692626953125, + "logps/rejected": -1128.3052978515625, + "loss": 0.0394, + "num_input_tokens_seen": 86801472, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.67680025100708, + "rewards/margins": 5.705223083496094, + "rewards/rejected": -8.382022857666016, + "step": 264 + }, + { + "epoch": 1.0114915010773282, + "grad_norm": 0.34610675404910174, + "learning_rate": 1.0061358846491548e-05, + "logits/chosen": -0.48727381229400635, + "logits/rejected": -0.4835362434387207, + "logps/chosen": -1104.974853515625, + "logps/rejected": -1132.6875, + "loss": 0.0317, + "num_input_tokens_seen": 87139392, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.684995412826538, + "rewards/margins": 5.819555282592773, + "rewards/rejected": -8.50455093383789, + "step": 265 + }, + { + "epoch": 1.0153220014364377, + "grad_norm": 0.23688042995769534, + "learning_rate": 1e-05, + "logits/chosen": -0.4744352698326111, + "logits/rejected": -0.47239381074905396, + "logps/chosen": -1077.82275390625, + "logps/rejected": -1122.03466796875, + "loss": 0.0234, + "num_input_tokens_seen": 87465760, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4221696853637695, + "rewards/margins": 6.047277450561523, + "rewards/rejected": -8.469447135925293, + "step": 266 + }, + { + "epoch": 1.019152501795547, + "grad_norm": 0.30883727759556884, + "learning_rate": 9.938641153508457e-06, + "logits/chosen": -0.49353209137916565, + "logits/rejected": -0.499381422996521, + "logps/chosen": -1096.467041015625, + "logps/rejected": -1138.04345703125, + "loss": 0.0273, + "num_input_tokens_seen": 87809856, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.752589702606201, + "rewards/margins": 5.69918155670166, + "rewards/rejected": -8.45177173614502, + "step": 267 + }, + { + "epoch": 1.0229830021546564, + "grad_norm": 0.21687386177899245, + "learning_rate": 9.877284617142802e-06, + "logits/chosen": -0.4954046308994293, + "logits/rejected": -0.494023859500885, + "logps/chosen": -1039.7576904296875, + "logps/rejected": -1103.8875732421875, + "loss": 0.0203, + "num_input_tokens_seen": 88126560, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5245070457458496, + "rewards/margins": 6.456825256347656, + "rewards/rejected": -8.981331825256348, + "step": 268 + }, + { + "epoch": 1.026813502513766, + "grad_norm": 0.46251807045835513, + "learning_rate": 9.815932700941954e-06, + "logits/chosen": -0.5402802228927612, + "logits/rejected": -0.5423499941825867, + "logps/chosen": -1123.579345703125, + "logps/rejected": -1167.3709716796875, + "loss": 0.0378, + "num_input_tokens_seen": 88466752, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.3790953159332275, + "rewards/margins": 6.071811199188232, + "rewards/rejected": -8.450906753540039, + "step": 269 + }, + { + "epoch": 1.0306440028728752, + "grad_norm": 0.30202210669280954, + "learning_rate": 9.75458771477088e-06, + "logits/chosen": -0.4868274927139282, + "logits/rejected": -0.49801307916641235, + "logps/chosen": -1067.61376953125, + "logps/rejected": -1103.095458984375, + "loss": 0.0339, + "num_input_tokens_seen": 88794112, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.207566976547241, + "rewards/margins": 6.231461524963379, + "rewards/rejected": -8.439027786254883, + "step": 270 + }, + { + "epoch": 1.0344745032319846, + "grad_norm": 0.2993291012444868, + "learning_rate": 9.693251968233637e-06, + "logits/chosen": -0.5135484933853149, + "logits/rejected": -0.5233759880065918, + "logps/chosen": -1047.32275390625, + "logps/rejected": -1100.70849609375, + "loss": 0.0321, + "num_input_tokens_seen": 89116480, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.845935821533203, + "rewards/margins": 5.898771286010742, + "rewards/rejected": -8.744706153869629, + "step": 271 + }, + { + "epoch": 1.0383050035910941, + "grad_norm": 0.30470677356671677, + "learning_rate": 9.631927770586412e-06, + "logits/chosen": -0.48902449011802673, + "logits/rejected": -0.4956105947494507, + "logps/chosen": -1083.82421875, + "logps/rejected": -1137.950927734375, + "loss": 0.0315, + "num_input_tokens_seen": 89460896, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5832748413085938, + "rewards/margins": 6.079258918762207, + "rewards/rejected": -8.662534713745117, + "step": 272 + }, + { + "epoch": 1.0421355039502036, + "grad_norm": 0.32805692975700873, + "learning_rate": 9.570617430650593e-06, + "logits/chosen": -0.46343863010406494, + "logits/rejected": -0.4568053185939789, + "logps/chosen": -1006.8623046875, + "logps/rejected": -1081.487548828125, + "loss": 0.0381, + "num_input_tokens_seen": 89776992, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.5563578605651855, + "rewards/margins": 5.7294206619262695, + "rewards/rejected": -8.285778045654297, + "step": 273 + }, + { + "epoch": 1.0459660043093129, + "grad_norm": 0.47219171839778007, + "learning_rate": 9.50932325672582e-06, + "logits/chosen": -0.4515499472618103, + "logits/rejected": -0.4488064646720886, + "logps/chosen": -1051.923583984375, + "logps/rejected": -1099.134033203125, + "loss": 0.0352, + "num_input_tokens_seen": 90095776, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.922595262527466, + "rewards/margins": 5.832512378692627, + "rewards/rejected": -8.755107879638672, + "step": 274 + }, + { + "epoch": 1.0497965046684223, + "grad_norm": 0.34108513096998644, + "learning_rate": 9.448047556503101e-06, + "logits/chosen": -0.4935795068740845, + "logits/rejected": -0.49251919984817505, + "logps/chosen": -1092.8480224609375, + "logps/rejected": -1127.662353515625, + "loss": 0.0304, + "num_input_tokens_seen": 90429888, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8941431045532227, + "rewards/margins": 5.699064254760742, + "rewards/rejected": -8.593206405639648, + "step": 275 + }, + { + "epoch": 1.0536270050275318, + "grad_norm": 0.3303854789443076, + "learning_rate": 9.386792636977915e-06, + "logits/chosen": -0.5254795551300049, + "logits/rejected": -0.5333357453346252, + "logps/chosen": -1103.0426025390625, + "logps/rejected": -1144.573974609375, + "loss": 0.0317, + "num_input_tokens_seen": 90763872, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.642683982849121, + "rewards/margins": 6.212048530578613, + "rewards/rejected": -8.85473346710205, + "step": 276 + }, + { + "epoch": 1.057457505386641, + "grad_norm": 0.5038817305212445, + "learning_rate": 9.325560804363361e-06, + "logits/chosen": -0.49569618701934814, + "logits/rejected": -0.5068331956863403, + "logps/chosen": -1095.6949462890625, + "logps/rejected": -1144.6748046875, + "loss": 0.0544, + "num_input_tokens_seen": 91101920, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.660771369934082, + "rewards/margins": 5.845937252044678, + "rewards/rejected": -8.506708145141602, + "step": 277 + }, + { + "epoch": 1.0612880057457506, + "grad_norm": 0.1827819839757777, + "learning_rate": 9.264354364003327e-06, + "logits/chosen": -0.46657612919807434, + "logits/rejected": -0.48462343215942383, + "logps/chosen": -1112.1156005859375, + "logps/rejected": -1167.8316650390625, + "loss": 0.0196, + "num_input_tokens_seen": 91441408, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6155645847320557, + "rewards/margins": 6.344812870025635, + "rewards/rejected": -8.960376739501953, + "step": 278 + }, + { + "epoch": 1.06511850610486, + "grad_norm": 0.25890001128031587, + "learning_rate": 9.2031756202857e-06, + "logits/chosen": -0.5310154557228088, + "logits/rejected": -0.5486434698104858, + "logps/chosen": -1021.6463012695312, + "logps/rejected": -1098.6300048828125, + "loss": 0.0268, + "num_input_tokens_seen": 91758592, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0416007041931152, + "rewards/margins": 6.106353282928467, + "rewards/rejected": -9.147953987121582, + "step": 279 + }, + { + "epoch": 1.0689490064639693, + "grad_norm": 0.22009889266699845, + "learning_rate": 9.142026876555602e-06, + "logits/chosen": -0.5527172088623047, + "logits/rejected": -0.5542583465576172, + "logps/chosen": -1088.73291015625, + "logps/rejected": -1155.190185546875, + "loss": 0.0197, + "num_input_tokens_seen": 92090016, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.068014144897461, + "rewards/margins": 6.385533332824707, + "rewards/rejected": -9.453546524047852, + "step": 280 + }, + { + "epoch": 1.0727795068230788, + "grad_norm": 0.4032330788438385, + "learning_rate": 9.080910435028675e-06, + "logits/chosen": -0.5402523279190063, + "logits/rejected": -0.5673002004623413, + "logps/chosen": -1041.0772705078125, + "logps/rejected": -1065.860595703125, + "loss": 0.0304, + "num_input_tokens_seen": 92410560, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.202848196029663, + "rewards/margins": 6.2481279373168945, + "rewards/rejected": -9.450976371765137, + "step": 281 + }, + { + "epoch": 1.0766100071821882, + "grad_norm": 0.26907717413889465, + "learning_rate": 9.019828596704394e-06, + "logits/chosen": -0.48633456230163574, + "logits/rejected": -0.5003535747528076, + "logps/chosen": -1092.32958984375, + "logps/rejected": -1137.3994140625, + "loss": 0.0152, + "num_input_tokens_seen": 92740160, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8479530811309814, + "rewards/margins": 6.724106311798096, + "rewards/rejected": -9.572059631347656, + "step": 282 + }, + { + "epoch": 1.0804405075412975, + "grad_norm": 0.4049868167452109, + "learning_rate": 8.958783661279454e-06, + "logits/chosen": -0.542472243309021, + "logits/rejected": -0.5566778779029846, + "logps/chosen": -1092.750244140625, + "logps/rejected": -1117.0533447265625, + "loss": 0.0405, + "num_input_tokens_seen": 93068864, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.1580586433410645, + "rewards/margins": 6.151012420654297, + "rewards/rejected": -9.309070587158203, + "step": 283 + }, + { + "epoch": 1.084271007900407, + "grad_norm": 0.5263230188915736, + "learning_rate": 8.89777792706117e-06, + "logits/chosen": -0.5099426507949829, + "logits/rejected": -0.5155041813850403, + "logps/chosen": -1093.289794921875, + "logps/rejected": -1127.580322265625, + "loss": 0.0414, + "num_input_tokens_seen": 93401184, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.690894365310669, + "rewards/margins": 6.206489562988281, + "rewards/rejected": -9.897383689880371, + "step": 284 + }, + { + "epoch": 1.0881015082595165, + "grad_norm": 0.40611026286005514, + "learning_rate": 8.836813690880953e-06, + "logits/chosen": -0.47375327348709106, + "logits/rejected": -0.5043613910675049, + "logps/chosen": -1082.195068359375, + "logps/rejected": -1132.988037109375, + "loss": 0.0394, + "num_input_tokens_seen": 93738176, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.9468984603881836, + "rewards/margins": 6.305409908294678, + "rewards/rejected": -9.252307891845703, + "step": 285 + }, + { + "epoch": 1.0919320086186257, + "grad_norm": 0.2659988032230293, + "learning_rate": 8.77589324800784e-06, + "logits/chosen": -0.5486725568771362, + "logits/rejected": -0.5619292259216309, + "logps/chosen": -1087.52197265625, + "logps/rejected": -1133.5904541015625, + "loss": 0.0222, + "num_input_tokens_seen": 94064352, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8315885066986084, + "rewards/margins": 6.359816551208496, + "rewards/rejected": -9.191404342651367, + "step": 286 + }, + { + "epoch": 1.0957625089777352, + "grad_norm": 0.3830548733425151, + "learning_rate": 8.715018892062069e-06, + "logits/chosen": -0.51210618019104, + "logits/rejected": -0.5182426571846008, + "logps/chosen": -1094.94482421875, + "logps/rejected": -1130.339111328125, + "loss": 0.0334, + "num_input_tokens_seen": 94393056, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.703124761581421, + "rewards/margins": 6.7252302169799805, + "rewards/rejected": -9.42835521697998, + "step": 287 + }, + { + "epoch": 1.0995930093368447, + "grad_norm": 0.4127779248684346, + "learning_rate": 8.654192914928739e-06, + "logits/chosen": -0.5065290927886963, + "logits/rejected": -0.5290075540542603, + "logps/chosen": -1101.2088623046875, + "logps/rejected": -1169.057861328125, + "loss": 0.0344, + "num_input_tokens_seen": 94724640, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.868635654449463, + "rewards/margins": 6.2539849281311035, + "rewards/rejected": -9.122620582580566, + "step": 288 + }, + { + "epoch": 1.103423509695954, + "grad_norm": 0.26638638835159956, + "learning_rate": 8.593417606671509e-06, + "logits/chosen": -0.5741506218910217, + "logits/rejected": -0.5868088603019714, + "logps/chosen": -1084.6932373046875, + "logps/rejected": -1124.1976318359375, + "loss": 0.0213, + "num_input_tokens_seen": 95043040, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.116628646850586, + "rewards/margins": 6.559916019439697, + "rewards/rejected": -9.676544189453125, + "step": 289 + }, + { + "epoch": 1.1072540100550634, + "grad_norm": 0.2160475372401454, + "learning_rate": 8.532695255446384e-06, + "logits/chosen": -0.6005316376686096, + "logits/rejected": -0.6313759684562683, + "logps/chosen": -1063.679443359375, + "logps/rejected": -1120.9688720703125, + "loss": 0.0277, + "num_input_tokens_seen": 95368288, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.630537509918213, + "rewards/margins": 6.0724921226501465, + "rewards/rejected": -9.70302963256836, + "step": 290 + }, + { + "epoch": 1.1110845104141729, + "grad_norm": 0.4951634466064939, + "learning_rate": 8.472028147415567e-06, + "logits/chosen": -0.5808524489402771, + "logits/rejected": -0.6008937358856201, + "logps/chosen": -1044.30810546875, + "logps/rejected": -1081.6571044921875, + "loss": 0.0358, + "num_input_tokens_seen": 95672320, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.4017791748046875, + "rewards/margins": 6.365994930267334, + "rewards/rejected": -9.76777458190918, + "step": 291 + }, + { + "epoch": 1.1149150107732821, + "grad_norm": 0.22808442400112958, + "learning_rate": 8.411418566661387e-06, + "logits/chosen": -0.5307670831680298, + "logits/rejected": -0.5491093397140503, + "logps/chosen": -1060.5166015625, + "logps/rejected": -1110.872314453125, + "loss": 0.0207, + "num_input_tokens_seen": 95980992, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.215269088745117, + "rewards/margins": 6.315883636474609, + "rewards/rejected": -9.531152725219727, + "step": 292 + }, + { + "epoch": 1.1187455111323916, + "grad_norm": 0.21988413755280473, + "learning_rate": 8.350868795100301e-06, + "logits/chosen": -0.5522828102111816, + "logits/rejected": -0.5677733421325684, + "logps/chosen": -1112.68115234375, + "logps/rejected": -1156.2060546875, + "loss": 0.0142, + "num_input_tokens_seen": 96324896, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.267910957336426, + "rewards/margins": 6.608005046844482, + "rewards/rejected": -9.87591552734375, + "step": 293 + }, + { + "epoch": 1.122576011491501, + "grad_norm": 0.3005181438155044, + "learning_rate": 8.290381112396989e-06, + "logits/chosen": -0.5702823996543884, + "logits/rejected": -0.5716421604156494, + "logps/chosen": -1077.447509765625, + "logps/rejected": -1122.2396240234375, + "loss": 0.0237, + "num_input_tokens_seen": 96640032, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3606491088867188, + "rewards/margins": 6.356201648712158, + "rewards/rejected": -9.716850280761719, + "step": 294 + }, + { + "epoch": 1.1264065118506106, + "grad_norm": 0.28211958265890946, + "learning_rate": 8.229957795878513e-06, + "logits/chosen": -0.5151051878929138, + "logits/rejected": -0.5257341265678406, + "logps/chosen": -1105.862060546875, + "logps/rejected": -1152.6639404296875, + "loss": 0.0252, + "num_input_tokens_seen": 96968160, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2288451194763184, + "rewards/margins": 6.629881858825684, + "rewards/rejected": -9.858728408813477, + "step": 295 + }, + { + "epoch": 1.1302370122097198, + "grad_norm": 0.24814309034590554, + "learning_rate": 8.169601120448592e-06, + "logits/chosen": -0.5219290852546692, + "logits/rejected": -0.5157222747802734, + "logps/chosen": -1036.36669921875, + "logps/rejected": -1114.5919189453125, + "loss": 0.0207, + "num_input_tokens_seen": 97288096, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5417604446411133, + "rewards/margins": 6.264781951904297, + "rewards/rejected": -9.80654239654541, + "step": 296 + }, + { + "epoch": 1.1340675125688293, + "grad_norm": 0.6212769498300771, + "learning_rate": 8.109313358501939e-06, + "logits/chosen": -0.5568684339523315, + "logits/rejected": -0.5613746643066406, + "logps/chosen": -1099.389404296875, + "logps/rejected": -1148.8350830078125, + "loss": 0.0684, + "num_input_tokens_seen": 97618816, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.5746378898620605, + "rewards/margins": 6.125028610229492, + "rewards/rejected": -9.699666023254395, + "step": 297 + }, + { + "epoch": 1.1378980129279388, + "grad_norm": 0.4021221935174639, + "learning_rate": 8.04909677983872e-06, + "logits/chosen": -0.5583133101463318, + "logits/rejected": -0.5708407759666443, + "logps/chosen": -1084.4132080078125, + "logps/rejected": -1103.875, + "loss": 0.0196, + "num_input_tokens_seen": 97946400, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -2.78764009475708, + "rewards/margins": 6.816537857055664, + "rewards/rejected": -9.604177474975586, + "step": 298 + }, + { + "epoch": 1.141728513287048, + "grad_norm": 0.6205911315662619, + "learning_rate": 7.988953651579082e-06, + "logits/chosen": -0.5306577086448669, + "logits/rejected": -0.5513067245483398, + "logps/chosen": -1046.0338134765625, + "logps/rejected": -1110.045166015625, + "loss": 0.0593, + "num_input_tokens_seen": 98273472, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -3.6495399475097656, + "rewards/margins": 6.1302995681762695, + "rewards/rejected": -9.779839515686035, + "step": 299 + }, + { + "epoch": 1.1455590136461575, + "grad_norm": 0.4053585378953993, + "learning_rate": 7.928886238077817e-06, + "logits/chosen": -0.5522028207778931, + "logits/rejected": -0.5578690767288208, + "logps/chosen": -1060.31298828125, + "logps/rejected": -1117.7474365234375, + "loss": 0.0354, + "num_input_tokens_seen": 98599360, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.168921709060669, + "rewards/margins": 6.632106781005859, + "rewards/rejected": -9.80102825164795, + "step": 300 + }, + { + "epoch": 1.149389514005267, + "grad_norm": 0.31710853965134694, + "learning_rate": 7.868896800839088e-06, + "logits/chosen": -0.49912428855895996, + "logits/rejected": -0.5160341262817383, + "logps/chosen": -1098.803466796875, + "logps/rejected": -1137.767822265625, + "loss": 0.0193, + "num_input_tokens_seen": 98930080, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.558093547821045, + "rewards/margins": 6.648016452789307, + "rewards/rejected": -10.206110000610352, + "step": 301 + }, + { + "epoch": 1.1532200143643763, + "grad_norm": 0.16661424669416017, + "learning_rate": 7.808987598431303e-06, + "logits/chosen": -0.615931510925293, + "logits/rejected": -0.6293091773986816, + "logps/chosen": -1084.83740234375, + "logps/rejected": -1140.68603515625, + "loss": 0.0142, + "num_input_tokens_seen": 99256576, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8766427040100098, + "rewards/margins": 6.60366678237915, + "rewards/rejected": -10.48030948638916, + "step": 302 + }, + { + "epoch": 1.1570505147234857, + "grad_norm": 0.29131855185352934, + "learning_rate": 7.749160886402074e-06, + "logits/chosen": -0.5557946562767029, + "logits/rejected": -0.5785644054412842, + "logps/chosen": -1098.9532470703125, + "logps/rejected": -1150.231689453125, + "loss": 0.0282, + "num_input_tokens_seen": 99595808, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.559966802597046, + "rewards/margins": 6.437026023864746, + "rewards/rejected": -9.996993064880371, + "step": 303 + }, + { + "epoch": 1.1608810150825952, + "grad_norm": 0.2536376253671043, + "learning_rate": 7.68941891719329e-06, + "logits/chosen": -0.5173993110656738, + "logits/rejected": -0.5453042387962341, + "logps/chosen": -1081.798583984375, + "logps/rejected": -1104.855224609375, + "loss": 0.0221, + "num_input_tokens_seen": 99917760, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5507495403289795, + "rewards/margins": 6.346550941467285, + "rewards/rejected": -9.897299766540527, + "step": 304 + }, + { + "epoch": 1.1647115154417045, + "grad_norm": 0.5675946849084924, + "learning_rate": 7.629763940056328e-06, + "logits/chosen": -0.5439468026161194, + "logits/rejected": -0.570183277130127, + "logps/chosen": -1086.3389892578125, + "logps/rejected": -1149.052490234375, + "loss": 0.0426, + "num_input_tokens_seen": 100242176, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.8348593711853027, + "rewards/margins": 6.398078441619873, + "rewards/rejected": -10.23293685913086, + "step": 305 + }, + { + "epoch": 1.168542015800814, + "grad_norm": 0.19660080953332104, + "learning_rate": 7.570198200967363e-06, + "logits/chosen": -0.49614018201828003, + "logits/rejected": -0.5242593884468079, + "logps/chosen": -1069.2412109375, + "logps/rejected": -1132.6884765625, + "loss": 0.0208, + "num_input_tokens_seen": 100560000, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3553149700164795, + "rewards/margins": 6.5504536628723145, + "rewards/rejected": -9.905769348144531, + "step": 306 + }, + { + "epoch": 1.1723725161599234, + "grad_norm": 0.3793200413670357, + "learning_rate": 7.510723942542799e-06, + "logits/chosen": -0.522719144821167, + "logits/rejected": -0.5086554288864136, + "logps/chosen": -1119.348876953125, + "logps/rejected": -1183.35498046875, + "loss": 0.0362, + "num_input_tokens_seen": 100902592, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.677518606185913, + "rewards/margins": 5.818599700927734, + "rewards/rejected": -9.496118545532227, + "step": 307 + }, + { + "epoch": 1.176203016519033, + "grad_norm": 0.22942103249743084, + "learning_rate": 7.451343403954856e-06, + "logits/chosen": -0.5533791780471802, + "logits/rejected": -0.5773422122001648, + "logps/chosen": -1120.6015625, + "logps/rejected": -1146.826416015625, + "loss": 0.0168, + "num_input_tokens_seen": 101239840, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5555758476257324, + "rewards/margins": 6.347783088684082, + "rewards/rejected": -9.903359413146973, + "step": 308 + }, + { + "epoch": 1.1800335168781422, + "grad_norm": 0.3625229477451333, + "learning_rate": 7.392058820847246e-06, + "logits/chosen": -0.5555598735809326, + "logits/rejected": -0.5726979970932007, + "logps/chosen": -1095.2576904296875, + "logps/rejected": -1131.7054443359375, + "loss": 0.0324, + "num_input_tokens_seen": 101563744, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8269643783569336, + "rewards/margins": 6.424999713897705, + "rewards/rejected": -10.251964569091797, + "step": 309 + }, + { + "epoch": 1.1838640172372517, + "grad_norm": 0.3128989934358838, + "learning_rate": 7.332872425251017e-06, + "logits/chosen": -0.5529404282569885, + "logits/rejected": -0.5656006932258606, + "logps/chosen": -1085.046630859375, + "logps/rejected": -1147.440673828125, + "loss": 0.0216, + "num_input_tokens_seen": 101898880, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.018230438232422, + "rewards/margins": 6.624449729919434, + "rewards/rejected": -10.642681121826172, + "step": 310 + }, + { + "epoch": 1.187694517596361, + "grad_norm": 0.27325576759858283, + "learning_rate": 7.273786445500512e-06, + "logits/chosen": -0.5299850702285767, + "logits/rejected": -0.551963210105896, + "logps/chosen": -1090.983154296875, + "logps/rejected": -1134.7171630859375, + "loss": 0.0226, + "num_input_tokens_seen": 102231360, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.383548259735107, + "rewards/margins": 6.203757286071777, + "rewards/rejected": -10.587305068969727, + "step": 311 + }, + { + "epoch": 1.1915250179554704, + "grad_norm": 0.38258066929722134, + "learning_rate": 7.214803106149471e-06, + "logits/chosen": -0.5501708984375, + "logits/rejected": -0.5534524917602539, + "logps/chosen": -1128.267578125, + "logps/rejected": -1187.3466796875, + "loss": 0.0399, + "num_input_tokens_seen": 102574176, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.6865224838256836, + "rewards/margins": 6.469578266143799, + "rewards/rejected": -10.15610122680664, + "step": 312 + }, + { + "epoch": 1.1953555183145799, + "grad_norm": 0.188733919158956, + "learning_rate": 7.155924627887283e-06, + "logits/chosen": -0.6095337867736816, + "logits/rejected": -0.6194601655006409, + "logps/chosen": -1107.542236328125, + "logps/rejected": -1182.0018310546875, + "loss": 0.0139, + "num_input_tokens_seen": 102903424, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.175518035888672, + "rewards/margins": 6.902989864349365, + "rewards/rejected": -11.078507423400879, + "step": 313 + }, + { + "epoch": 1.1991860186736893, + "grad_norm": 0.20514833830137172, + "learning_rate": 7.097153227455379e-06, + "logits/chosen": -0.6539708375930786, + "logits/rejected": -0.6667028069496155, + "logps/chosen": -1130.8035888671875, + "logps/rejected": -1183.167724609375, + "loss": 0.016, + "num_input_tokens_seen": 103240992, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.308477401733398, + "rewards/margins": 6.659805774688721, + "rewards/rejected": -10.968282699584961, + "step": 314 + }, + { + "epoch": 1.2030165190327986, + "grad_norm": 0.3856491170276071, + "learning_rate": 7.038491117563762e-06, + "logits/chosen": -0.5728814601898193, + "logits/rejected": -0.5498224496841431, + "logps/chosen": -1097.45458984375, + "logps/rejected": -1142.01123046875, + "loss": 0.0328, + "num_input_tokens_seen": 103578240, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -3.8182337284088135, + "rewards/margins": 6.599686622619629, + "rewards/rejected": -10.417920112609863, + "step": 315 + }, + { + "epoch": 1.206847019391908, + "grad_norm": 0.4454558194024016, + "learning_rate": 6.97994050680772e-06, + "logits/chosen": -0.6171661615371704, + "logits/rejected": -0.6259925365447998, + "logps/chosen": -1108.2666015625, + "logps/rejected": -1152.365234375, + "loss": 0.0371, + "num_input_tokens_seen": 103907424, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -4.086334228515625, + "rewards/margins": 6.760931968688965, + "rewards/rejected": -10.847267150878906, + "step": 316 + }, + { + "epoch": 1.2106775197510176, + "grad_norm": 0.4793766631666451, + "learning_rate": 6.9215035995846515e-06, + "logits/chosen": -0.5547460913658142, + "logits/rejected": -0.5681288838386536, + "logps/chosen": -1050.9276123046875, + "logps/rejected": -1093.7308349609375, + "loss": 0.0376, + "num_input_tokens_seen": 104225696, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.542605400085449, + "rewards/margins": 6.1130852699279785, + "rewards/rejected": -10.65569019317627, + "step": 317 + }, + { + "epoch": 1.2145080201101268, + "grad_norm": 0.1840006039017823, + "learning_rate": 6.8631825960110866e-06, + "logits/chosen": -0.5947555899620056, + "logits/rejected": -0.5856679677963257, + "logps/chosen": -1107.364013671875, + "logps/rejected": -1159.6749267578125, + "loss": 0.0131, + "num_input_tokens_seen": 104567264, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.067417621612549, + "rewards/margins": 7.059563636779785, + "rewards/rejected": -11.126980781555176, + "step": 318 + }, + { + "epoch": 1.2183385204692363, + "grad_norm": 0.16649420354654187, + "learning_rate": 6.8049796918398435e-06, + "logits/chosen": -0.5761028528213501, + "logits/rejected": -0.5897977948188782, + "logps/chosen": -1068.379638671875, + "logps/rejected": -1113.53955078125, + "loss": 0.0135, + "num_input_tokens_seen": 104878656, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.008105278015137, + "rewards/margins": 6.588144302368164, + "rewards/rejected": -10.596250534057617, + "step": 319 + }, + { + "epoch": 1.2221690208283458, + "grad_norm": 0.3555100697187636, + "learning_rate": 6.746897078377372e-06, + "logits/chosen": -0.5819359421730042, + "logits/rejected": -0.5746718645095825, + "logps/chosen": -1071.798828125, + "logps/rejected": -1144.3529052734375, + "loss": 0.0273, + "num_input_tokens_seen": 105210048, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.650390625, + "rewards/margins": 6.524094581604004, + "rewards/rejected": -11.174484252929688, + "step": 320 + }, + { + "epoch": 1.225999521187455, + "grad_norm": 0.4712155887331787, + "learning_rate": 6.688936942401237e-06, + "logits/chosen": -0.500744104385376, + "logits/rejected": -0.5284615755081177, + "logps/chosen": -1140.802001953125, + "logps/rejected": -1179.686279296875, + "loss": 0.0329, + "num_input_tokens_seen": 105549728, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.9859378337860107, + "rewards/margins": 6.569973945617676, + "rewards/rejected": -10.555912017822266, + "step": 321 + }, + { + "epoch": 1.2298300215465645, + "grad_norm": 0.2707670264866995, + "learning_rate": 6.631101466077801e-06, + "logits/chosen": -0.5504841208457947, + "logits/rejected": -0.5597809553146362, + "logps/chosen": -1105.7371826171875, + "logps/rejected": -1171.4761962890625, + "loss": 0.0229, + "num_input_tokens_seen": 105897600, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.280068874359131, + "rewards/margins": 6.372776031494141, + "rewards/rejected": -10.652844429016113, + "step": 322 + }, + { + "epoch": 1.233660521905674, + "grad_norm": 0.34914130578163766, + "learning_rate": 6.573392826880058e-06, + "logits/chosen": -0.4954412579536438, + "logits/rejected": -0.5100732445716858, + "logps/chosen": -1101.5142822265625, + "logps/rejected": -1135.6531982421875, + "loss": 0.0304, + "num_input_tokens_seen": 106232672, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.087323188781738, + "rewards/margins": 6.584163665771484, + "rewards/rejected": -10.671486854553223, + "step": 323 + }, + { + "epoch": 1.2374910222647832, + "grad_norm": 0.42381173708935516, + "learning_rate": 6.515813197505656e-06, + "logits/chosen": -0.5550379753112793, + "logits/rejected": -0.5652694702148438, + "logps/chosen": -1048.59033203125, + "logps/rejected": -1113.5057373046875, + "loss": 0.0265, + "num_input_tokens_seen": 106552192, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.278326034545898, + "rewards/margins": 6.429415702819824, + "rewards/rejected": -10.707741737365723, + "step": 324 + }, + { + "epoch": 1.2413215226238927, + "grad_norm": 0.3765680367303938, + "learning_rate": 6.458364745795096e-06, + "logits/chosen": -0.5535226464271545, + "logits/rejected": -0.5408152937889099, + "logps/chosen": -1048.3013916015625, + "logps/rejected": -1103.2373046875, + "loss": 0.0331, + "num_input_tokens_seen": 106865248, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.309267520904541, + "rewards/margins": 6.844829559326172, + "rewards/rejected": -11.154096603393555, + "step": 325 + }, + { + "epoch": 1.2451520229830022, + "grad_norm": 0.21741191573752516, + "learning_rate": 6.401049634650119e-06, + "logits/chosen": -0.5871390104293823, + "logits/rejected": -0.6051796078681946, + "logps/chosen": -1052.487548828125, + "logps/rejected": -1137.62060546875, + "loss": 0.02, + "num_input_tokens_seen": 107189984, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.969822883605957, + "rewards/margins": 6.448199272155762, + "rewards/rejected": -11.418022155761719, + "step": 326 + }, + { + "epoch": 1.2489825233421117, + "grad_norm": 0.4974063127062127, + "learning_rate": 6.343870021952262e-06, + "logits/chosen": -0.6245028376579285, + "logits/rejected": -0.6534483432769775, + "logps/chosen": -1125.843505859375, + "logps/rejected": -1182.444091796875, + "loss": 0.0352, + "num_input_tokens_seen": 107530208, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.24246883392334, + "rewards/margins": 6.96823263168335, + "rewards/rejected": -11.210700988769531, + "step": 327 + }, + { + "epoch": 1.252813023701221, + "grad_norm": 0.326555389406604, + "learning_rate": 6.286828060481626e-06, + "logits/chosen": -0.5852325558662415, + "logits/rejected": -0.5888749361038208, + "logps/chosen": -1131.5435791015625, + "logps/rejected": -1178.925048828125, + "loss": 0.0252, + "num_input_tokens_seen": 107866464, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.159159183502197, + "rewards/margins": 6.757708549499512, + "rewards/rejected": -10.916868209838867, + "step": 328 + }, + { + "epoch": 1.2566435240603304, + "grad_norm": 0.19290278303358596, + "learning_rate": 6.229925897835818e-06, + "logits/chosen": -0.6336714029312134, + "logits/rejected": -0.62428218126297, + "logps/chosen": -1103.2205810546875, + "logps/rejected": -1132.98974609375, + "loss": 0.015, + "num_input_tokens_seen": 108193216, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.525191307067871, + "rewards/margins": 6.738622665405273, + "rewards/rejected": -11.263813018798828, + "step": 329 + }, + { + "epoch": 1.2604740244194397, + "grad_norm": 0.3147949811548437, + "learning_rate": 6.173165676349103e-06, + "logits/chosen": -0.5511728525161743, + "logits/rejected": -0.5503169894218445, + "logps/chosen": -1107.289794921875, + "logps/rejected": -1151.819580078125, + "loss": 0.0222, + "num_input_tokens_seen": 108526592, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.371089935302734, + "rewards/margins": 6.700687408447266, + "rewards/rejected": -11.07177734375, + "step": 330 + }, + { + "epoch": 1.2643045247785492, + "grad_norm": 0.5494013983335133, + "learning_rate": 6.116549533011739e-06, + "logits/chosen": -0.6210289597511292, + "logits/rejected": -0.6124162077903748, + "logps/chosen": -1084.75537109375, + "logps/rejected": -1120.780029296875, + "loss": 0.0544, + "num_input_tokens_seen": 108854496, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.516499996185303, + "rewards/margins": 6.2593994140625, + "rewards/rejected": -10.775899887084961, + "step": 331 + }, + { + "epoch": 1.2681350251376586, + "grad_norm": 0.39959738828974073, + "learning_rate": 6.060079599389521e-06, + "logits/chosen": -0.5532679557800293, + "logits/rejected": -0.5373847484588623, + "logps/chosen": -1111.1160888671875, + "logps/rejected": -1135.7886962890625, + "loss": 0.0311, + "num_input_tokens_seen": 109201216, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.428027153015137, + "rewards/margins": 6.276480674743652, + "rewards/rejected": -10.704507827758789, + "step": 332 + }, + { + "epoch": 1.2719655254967681, + "grad_norm": 0.5978950817141055, + "learning_rate": 6.003758001543534e-06, + "logits/chosen": -0.6529582738876343, + "logits/rejected": -0.6663904786109924, + "logps/chosen": -1119.2666015625, + "logps/rejected": -1186.82470703125, + "loss": 0.0427, + "num_input_tokens_seen": 109544960, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.644221782684326, + "rewards/margins": 6.811516284942627, + "rewards/rejected": -11.455738067626953, + "step": 333 + }, + { + "epoch": 1.2757960258558774, + "grad_norm": 0.4839261985775478, + "learning_rate": 5.947586859950103e-06, + "logits/chosen": -0.5446620583534241, + "logits/rejected": -0.5383833646774292, + "logps/chosen": -1087.72021484375, + "logps/rejected": -1155.60888671875, + "loss": 0.0396, + "num_input_tokens_seen": 109868192, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.938002586364746, + "rewards/margins": 6.395125389099121, + "rewards/rejected": -11.333127975463867, + "step": 334 + }, + { + "epoch": 1.2796265262149868, + "grad_norm": 0.37283061770582415, + "learning_rate": 5.891568289420963e-06, + "logits/chosen": -0.5743653774261475, + "logits/rejected": -0.574337363243103, + "logps/chosen": -1085.6456298828125, + "logps/rejected": -1126.66015625, + "loss": 0.0283, + "num_input_tokens_seen": 110200832, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.409365653991699, + "rewards/margins": 6.233172416687012, + "rewards/rejected": -10.642538070678711, + "step": 335 + }, + { + "epoch": 1.283457026574096, + "grad_norm": 0.38381290506761573, + "learning_rate": 5.835704399023631e-06, + "logits/chosen": -0.6060564517974854, + "logits/rejected": -0.6228507161140442, + "logps/chosen": -1106.0958251953125, + "logps/rejected": -1154.65576171875, + "loss": 0.0329, + "num_input_tokens_seen": 110526496, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.663599014282227, + "rewards/margins": 6.474142074584961, + "rewards/rejected": -11.137741088867188, + "step": 336 + }, + { + "epoch": 1.2872875269332056, + "grad_norm": 0.21346691879616977, + "learning_rate": 5.7799972920020046e-06, + "logits/chosen": -0.5915869474411011, + "logits/rejected": -0.6200781464576721, + "logps/chosen": -1106.6888427734375, + "logps/rejected": -1154.5391845703125, + "loss": 0.0188, + "num_input_tokens_seen": 110851200, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.624930381774902, + "rewards/margins": 6.897384166717529, + "rewards/rejected": -11.52231502532959, + "step": 337 + }, + { + "epoch": 1.291118027292315, + "grad_norm": 0.36174462925624906, + "learning_rate": 5.724449065697182e-06, + "logits/chosen": -0.5750770568847656, + "logits/rejected": -0.5697283744812012, + "logps/chosen": -1096.6201171875, + "logps/rejected": -1141.188720703125, + "loss": 0.0224, + "num_input_tokens_seen": 111182784, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.9200096130371094, + "rewards/margins": 6.814273834228516, + "rewards/rejected": -10.734284400939941, + "step": 338 + }, + { + "epoch": 1.2949485276514245, + "grad_norm": 0.15931922686163774, + "learning_rate": 5.669061811468481e-06, + "logits/chosen": -0.5392000675201416, + "logits/rejected": -0.5657220482826233, + "logps/chosen": -1094.008056640625, + "logps/rejected": -1144.6591796875, + "loss": 0.0143, + "num_input_tokens_seen": 111512768, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4557390213012695, + "rewards/margins": 6.5464911460876465, + "rewards/rejected": -11.002230644226074, + "step": 339 + }, + { + "epoch": 1.2987790280105338, + "grad_norm": 0.3868021317859634, + "learning_rate": 5.613837614614726e-06, + "logits/chosen": -0.5473899245262146, + "logits/rejected": -0.5722646713256836, + "logps/chosen": -1059.2979736328125, + "logps/rejected": -1131.09423828125, + "loss": 0.0265, + "num_input_tokens_seen": 111839008, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.385403156280518, + "rewards/margins": 6.547536849975586, + "rewards/rejected": -10.932939529418945, + "step": 340 + }, + { + "epoch": 1.3026095283696433, + "grad_norm": 0.44111114206002583, + "learning_rate": 5.558778554295709e-06, + "logits/chosen": -0.5305870771408081, + "logits/rejected": -0.5206206440925598, + "logps/chosen": -1129.84521484375, + "logps/rejected": -1179.5059814453125, + "loss": 0.0324, + "num_input_tokens_seen": 112173056, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.060966968536377, + "rewards/margins": 6.388485908508301, + "rewards/rejected": -10.449453353881836, + "step": 341 + }, + { + "epoch": 1.3064400287287528, + "grad_norm": 0.5930639013857211, + "learning_rate": 5.503886703453933e-06, + "logits/chosen": -0.5897170305252075, + "logits/rejected": -0.5736913084983826, + "logps/chosen": -1100.8056640625, + "logps/rejected": -1165.0, + "loss": 0.049, + "num_input_tokens_seen": 112509504, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.497857093811035, + "rewards/margins": 6.745928764343262, + "rewards/rejected": -11.243785858154297, + "step": 342 + }, + { + "epoch": 1.310270529087862, + "grad_norm": 0.24942719994454182, + "learning_rate": 5.4491641287365635e-06, + "logits/chosen": -0.5884048938751221, + "logits/rejected": -0.6012126803398132, + "logps/chosen": -1143.5953369140625, + "logps/rejected": -1184.6181640625, + "loss": 0.0202, + "num_input_tokens_seen": 112856896, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.079145908355713, + "rewards/margins": 6.797875881195068, + "rewards/rejected": -10.877021789550781, + "step": 343 + }, + { + "epoch": 1.3141010294469715, + "grad_norm": 0.3868482216284717, + "learning_rate": 5.3946128904176e-06, + "logits/chosen": -0.5780808925628662, + "logits/rejected": -0.5911041498184204, + "logps/chosen": -1123.4735107421875, + "logps/rejected": -1178.480224609375, + "loss": 0.0358, + "num_input_tokens_seen": 113191488, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.205249309539795, + "rewards/margins": 6.335290431976318, + "rewards/rejected": -10.54054069519043, + "step": 344 + }, + { + "epoch": 1.317931529806081, + "grad_norm": 0.16473193019442023, + "learning_rate": 5.340235042320341e-06, + "logits/chosen": -0.5547666549682617, + "logits/rejected": -0.5500614047050476, + "logps/chosen": -1053.4296875, + "logps/rejected": -1105.7684326171875, + "loss": 0.0168, + "num_input_tokens_seen": 113514048, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.6787109375, + "rewards/margins": 6.394067287445068, + "rewards/rejected": -11.072778701782227, + "step": 345 + }, + { + "epoch": 1.3217620301651904, + "grad_norm": 0.30687936827733076, + "learning_rate": 5.286032631740023e-06, + "logits/chosen": -0.5915262699127197, + "logits/rejected": -0.6005914211273193, + "logps/chosen": -1100.7861328125, + "logps/rejected": -1160.5439453125, + "loss": 0.017, + "num_input_tokens_seen": 113838464, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.592052459716797, + "rewards/margins": 7.277642250061035, + "rewards/rejected": -11.869693756103516, + "step": 346 + }, + { + "epoch": 1.3255925305242997, + "grad_norm": 0.3896436248319578, + "learning_rate": 5.2320076993667815e-06, + "logits/chosen": -0.562433123588562, + "logits/rejected": -0.5513118505477905, + "logps/chosen": -1107.2877197265625, + "logps/rejected": -1183.2586669921875, + "loss": 0.0326, + "num_input_tokens_seen": 114176736, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.118655681610107, + "rewards/margins": 6.4052276611328125, + "rewards/rejected": -10.523883819580078, + "step": 347 + }, + { + "epoch": 1.3294230308834092, + "grad_norm": 0.5173744103936653, + "learning_rate": 5.178162279208774e-06, + "logits/chosen": -0.5649431943893433, + "logits/rejected": -0.5746188163757324, + "logps/chosen": -1138.1510009765625, + "logps/rejected": -1181.3857421875, + "loss": 0.0427, + "num_input_tokens_seen": 114506016, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.620360851287842, + "rewards/margins": 6.304009437561035, + "rewards/rejected": -10.924370765686035, + "step": 348 + }, + { + "epoch": 1.3332535312425184, + "grad_norm": 0.35804072584847196, + "learning_rate": 5.1244983985156425e-06, + "logits/chosen": -0.5786940455436707, + "logits/rejected": -0.567001461982727, + "logps/chosen": -1081.60791015625, + "logps/rejected": -1145.8780517578125, + "loss": 0.0328, + "num_input_tokens_seen": 114830528, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.430755615234375, + "rewards/margins": 6.210395812988281, + "rewards/rejected": -10.641151428222656, + "step": 349 + }, + { + "epoch": 1.337084031601628, + "grad_norm": 0.37074427874743676, + "learning_rate": 5.071018077702161e-06, + "logits/chosen": -0.581089973449707, + "logits/rejected": -0.6101672649383545, + "logps/chosen": -1125.49169921875, + "logps/rejected": -1162.82080078125, + "loss": 0.0293, + "num_input_tokens_seen": 115154272, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.509761810302734, + "rewards/margins": 6.601551055908203, + "rewards/rejected": -11.111312866210938, + "step": 350 + }, + { + "epoch": 1.3409145319607374, + "grad_norm": 0.18501290986651892, + "learning_rate": 5.017723330272184e-06, + "logits/chosen": -0.5150108933448792, + "logits/rejected": -0.5240092277526855, + "logps/chosen": -1111.302734375, + "logps/rejected": -1170.109375, + "loss": 0.0162, + "num_input_tokens_seen": 115482144, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.068356513977051, + "rewards/margins": 6.618928909301758, + "rewards/rejected": -10.687283515930176, + "step": 351 + }, + { + "epoch": 1.3447450323198469, + "grad_norm": 0.3909418200034741, + "learning_rate": 4.964616162742826e-06, + "logits/chosen": -0.5665695071220398, + "logits/rejected": -0.5610139966011047, + "logps/chosen": -1146.863037109375, + "logps/rejected": -1204.87548828125, + "loss": 0.0235, + "num_input_tokens_seen": 115822272, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.42722749710083, + "rewards/margins": 7.129010200500488, + "rewards/rejected": -11.55623722076416, + "step": 352 + }, + { + "epoch": 1.3485755326789561, + "grad_norm": 0.1886565016850838, + "learning_rate": 4.911698574568929e-06, + "logits/chosen": -0.6004152297973633, + "logits/rejected": -0.6080197691917419, + "logps/chosen": -1106.9736328125, + "logps/rejected": -1145.805908203125, + "loss": 0.0171, + "num_input_tokens_seen": 116145504, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.354930400848389, + "rewards/margins": 6.6411895751953125, + "rewards/rejected": -11.996119499206543, + "step": 353 + }, + { + "epoch": 1.3524060330380656, + "grad_norm": 0.38428653453540895, + "learning_rate": 4.858972558067784e-06, + "logits/chosen": -0.6217834949493408, + "logits/rejected": -0.6399535536766052, + "logps/chosen": -1074.2374267578125, + "logps/rejected": -1131.6954345703125, + "loss": 0.0231, + "num_input_tokens_seen": 116474016, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.7779765129089355, + "rewards/margins": 6.978175163269043, + "rewards/rejected": -11.756153106689453, + "step": 354 + }, + { + "epoch": 1.3562365333971749, + "grad_norm": 0.34943288135543504, + "learning_rate": 4.8064400983441036e-06, + "logits/chosen": -0.5850189924240112, + "logits/rejected": -0.5880131721496582, + "logps/chosen": -1072.56494140625, + "logps/rejected": -1132.036376953125, + "loss": 0.026, + "num_input_tokens_seen": 116797696, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.445429801940918, + "rewards/margins": 6.344244956970215, + "rewards/rejected": -10.789674758911133, + "step": 355 + }, + { + "epoch": 1.3600670337562843, + "grad_norm": 0.4127013748440228, + "learning_rate": 4.754103173215313e-06, + "logits/chosen": -0.5461283922195435, + "logits/rejected": -0.5725980401039124, + "logps/chosen": -1122.2904052734375, + "logps/rejected": -1141.981689453125, + "loss": 0.032, + "num_input_tokens_seen": 117143072, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.860044479370117, + "rewards/margins": 6.201094150543213, + "rewards/rejected": -11.061139106750488, + "step": 356 + }, + { + "epoch": 1.3638975341153938, + "grad_norm": 0.47871272718146696, + "learning_rate": 4.7019637531370535e-06, + "logits/chosen": -0.5488605499267578, + "logits/rejected": -0.5651224851608276, + "logps/chosen": -1132.060546875, + "logps/rejected": -1203.186767578125, + "loss": 0.0435, + "num_input_tokens_seen": 117491008, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.803321838378906, + "rewards/margins": 6.855459213256836, + "rewards/rejected": -11.658782958984375, + "step": 357 + }, + { + "epoch": 1.3677280344745033, + "grad_norm": 0.3628525281162017, + "learning_rate": 4.65002380112903e-06, + "logits/chosen": -0.590827465057373, + "logits/rejected": -0.6208339333534241, + "logps/chosen": -1104.7464599609375, + "logps/rejected": -1139.046630859375, + "loss": 0.0273, + "num_input_tokens_seen": 117813952, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.016109466552734, + "rewards/margins": 6.412625789642334, + "rewards/rejected": -11.428735733032227, + "step": 358 + }, + { + "epoch": 1.3715585348336126, + "grad_norm": 0.24401877116287474, + "learning_rate": 4.598285272701072e-06, + "logits/chosen": -0.6212185621261597, + "logits/rejected": -0.6139695644378662, + "logps/chosen": -1114.401611328125, + "logps/rejected": -1172.537841796875, + "loss": 0.0153, + "num_input_tokens_seen": 118146592, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.432703495025635, + "rewards/margins": 7.041750431060791, + "rewards/rejected": -11.474453926086426, + "step": 359 + }, + { + "epoch": 1.375389035192722, + "grad_norm": 0.5204596578334273, + "learning_rate": 4.546750115779538e-06, + "logits/chosen": -0.5524659752845764, + "logits/rejected": -0.5470616221427917, + "logps/chosen": -1102.4576416015625, + "logps/rejected": -1144.021240234375, + "loss": 0.04, + "num_input_tokens_seen": 118470272, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.070345878601074, + "rewards/margins": 6.052586555480957, + "rewards/rejected": -11.122932434082031, + "step": 360 + }, + { + "epoch": 1.3792195355518315, + "grad_norm": 0.45972330804956246, + "learning_rate": 4.495420270633954e-06, + "logits/chosen": -0.5367764830589294, + "logits/rejected": -0.5309403538703918, + "logps/chosen": -1121.531005859375, + "logps/rejected": -1185.4189453125, + "loss": 0.0323, + "num_input_tokens_seen": 118812064, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.972258567810059, + "rewards/margins": 6.482407569885254, + "rewards/rejected": -11.454666137695312, + "step": 361 + }, + { + "epoch": 1.3830500359109408, + "grad_norm": 0.6058684189172656, + "learning_rate": 4.444297669803981e-06, + "logits/chosen": -0.5466238856315613, + "logits/rejected": -0.5569957494735718, + "logps/chosen": -1097.993408203125, + "logps/rejected": -1152.7718505859375, + "loss": 0.0598, + "num_input_tokens_seen": 119137728, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -5.033304691314697, + "rewards/margins": 6.1568708419799805, + "rewards/rejected": -11.190176010131836, + "step": 362 + }, + { + "epoch": 1.3868805362700503, + "grad_norm": 0.4268383524480481, + "learning_rate": 4.393384238026641e-06, + "logits/chosen": -0.6116652488708496, + "logits/rejected": -0.6289969682693481, + "logps/chosen": -1146.99609375, + "logps/rejected": -1191.328369140625, + "loss": 0.0313, + "num_input_tokens_seen": 119481760, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.32231330871582, + "rewards/margins": 6.607464790344238, + "rewards/rejected": -11.929778099060059, + "step": 363 + }, + { + "epoch": 1.3907110366291597, + "grad_norm": 0.24595911433864964, + "learning_rate": 4.342681892163868e-06, + "logits/chosen": -0.578654944896698, + "logits/rejected": -0.5655863285064697, + "logps/chosen": -1088.029052734375, + "logps/rejected": -1147.778564453125, + "loss": 0.0212, + "num_input_tokens_seen": 119816544, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.033302307128906, + "rewards/margins": 6.770483493804932, + "rewards/rejected": -11.80378532409668, + "step": 364 + }, + { + "epoch": 1.3945415369882692, + "grad_norm": 0.21739895876335888, + "learning_rate": 4.292192541130329e-06, + "logits/chosen": -0.548336386680603, + "logits/rejected": -0.5593569278717041, + "logps/chosen": -1080.714599609375, + "logps/rejected": -1128.7335205078125, + "loss": 0.0147, + "num_input_tokens_seen": 120142976, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.019723892211914, + "rewards/margins": 6.675695419311523, + "rewards/rejected": -11.695419311523438, + "step": 365 + }, + { + "epoch": 1.3983720373473785, + "grad_norm": 0.20760225487207917, + "learning_rate": 4.241918085821547e-06, + "logits/chosen": -0.48104262351989746, + "logits/rejected": -0.4730389714241028, + "logps/chosen": -1136.6636962890625, + "logps/rejected": -1212.406005859375, + "loss": 0.0191, + "num_input_tokens_seen": 120496384, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.647626876831055, + "rewards/margins": 6.411789894104004, + "rewards/rejected": -11.059416770935059, + "step": 366 + }, + { + "epoch": 1.402202537706488, + "grad_norm": 0.3275019863623418, + "learning_rate": 4.191860419042356e-06, + "logits/chosen": -0.613954484462738, + "logits/rejected": -0.6308355927467346, + "logps/chosen": -1079.2032470703125, + "logps/rejected": -1143.3275146484375, + "loss": 0.0294, + "num_input_tokens_seen": 120825632, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.207267761230469, + "rewards/margins": 6.184564590454102, + "rewards/rejected": -11.39183235168457, + "step": 367 + }, + { + "epoch": 1.4060330380655972, + "grad_norm": 0.3245158266614854, + "learning_rate": 4.142021425435612e-06, + "logits/chosen": -0.590624213218689, + "logits/rejected": -0.5906351804733276, + "logps/chosen": -1137.808349609375, + "logps/rejected": -1194.8358154296875, + "loss": 0.0185, + "num_input_tokens_seen": 121165152, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1931681632995605, + "rewards/margins": 6.706537246704102, + "rewards/rejected": -11.89970588684082, + "step": 368 + }, + { + "epoch": 1.4098635384247067, + "grad_norm": 0.6209565546304465, + "learning_rate": 4.09240298141126e-06, + "logits/chosen": -0.5298579931259155, + "logits/rejected": -0.5269304513931274, + "logps/chosen": -1076.9927978515625, + "logps/rejected": -1145.547607421875, + "loss": 0.062, + "num_input_tokens_seen": 121494080, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -5.376431465148926, + "rewards/margins": 6.431936264038086, + "rewards/rejected": -11.808367729187012, + "step": 369 + }, + { + "epoch": 1.4136940387838162, + "grad_norm": 0.26802772150040766, + "learning_rate": 4.043006955075667e-06, + "logits/chosen": -0.46784165501594543, + "logits/rejected": -0.4804871678352356, + "logps/chosen": -1086.525390625, + "logps/rejected": -1170.804931640625, + "loss": 0.0261, + "num_input_tokens_seen": 121829184, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.1012139320373535, + "rewards/margins": 6.533043384552002, + "rewards/rejected": -11.634257316589355, + "step": 370 + }, + { + "epoch": 1.4175245391429256, + "grad_norm": 0.5510156432728248, + "learning_rate": 3.993835206161313e-06, + "logits/chosen": -0.6157731413841248, + "logits/rejected": -0.5948693752288818, + "logps/chosen": -1141.612548828125, + "logps/rejected": -1191.060546875, + "loss": 0.0431, + "num_input_tokens_seen": 122165280, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -5.091546058654785, + "rewards/margins": 6.532773017883301, + "rewards/rejected": -11.62431812286377, + "step": 371 + }, + { + "epoch": 1.421355039502035, + "grad_norm": 0.6484195555995633, + "learning_rate": 3.944889585956746e-06, + "logits/chosen": -0.5589182376861572, + "logits/rejected": -0.5542987585067749, + "logps/chosen": -1120.92333984375, + "logps/rejected": -1192.95166015625, + "loss": 0.068, + "num_input_tokens_seen": 122499680, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -4.534707069396973, + "rewards/margins": 6.616757869720459, + "rewards/rejected": -11.151464462280273, + "step": 372 + }, + { + "epoch": 1.4251855398611444, + "grad_norm": 0.38016174294258065, + "learning_rate": 3.896171937236904e-06, + "logits/chosen": -0.548100471496582, + "logits/rejected": -0.5266211032867432, + "logps/chosen": -1100.239990234375, + "logps/rejected": -1139.40478515625, + "loss": 0.0305, + "num_input_tokens_seen": 122841312, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.725222587585449, + "rewards/margins": 6.122917652130127, + "rewards/rejected": -10.848140716552734, + "step": 373 + }, + { + "epoch": 1.4290160402202536, + "grad_norm": 0.3190714752266467, + "learning_rate": 3.847684094193733e-06, + "logits/chosen": -0.46835410594940186, + "logits/rejected": -0.4880065321922302, + "logps/chosen": -1084.8726806640625, + "logps/rejected": -1147.6541748046875, + "loss": 0.0275, + "num_input_tokens_seen": 123169600, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.551667213439941, + "rewards/margins": 6.469115734100342, + "rewards/rejected": -11.020783424377441, + "step": 374 + }, + { + "epoch": 1.4328465405793631, + "grad_norm": 0.2743677538074191, + "learning_rate": 3.7994278823671084e-06, + "logits/chosen": -0.5531451106071472, + "logits/rejected": -0.5557854771614075, + "logps/chosen": -1074.174560546875, + "logps/rejected": -1129.6207275390625, + "loss": 0.0235, + "num_input_tokens_seen": 123486976, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.940919876098633, + "rewards/margins": 6.677453517913818, + "rewards/rejected": -11.61837387084961, + "step": 375 + }, + { + "epoch": 1.4366770409384726, + "grad_norm": 0.303294252212312, + "learning_rate": 3.751405118576138e-06, + "logits/chosen": -0.5192709565162659, + "logits/rejected": -0.5413182973861694, + "logps/chosen": -1105.858154296875, + "logps/rejected": -1142.076904296875, + "loss": 0.0275, + "num_input_tokens_seen": 123817504, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.097794532775879, + "rewards/margins": 6.259578704833984, + "rewards/rejected": -11.357373237609863, + "step": 376 + }, + { + "epoch": 1.440507541297582, + "grad_norm": 0.18409593459315973, + "learning_rate": 3.7036176108507295e-06, + "logits/chosen": -0.4597388505935669, + "logits/rejected": -0.47885021567344666, + "logps/chosen": -1147.1044921875, + "logps/rejected": -1196.7781982421875, + "loss": 0.0202, + "num_input_tokens_seen": 124161760, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.250223636627197, + "rewards/margins": 6.479463577270508, + "rewards/rejected": -10.729687690734863, + "step": 377 + }, + { + "epoch": 1.4443380416566913, + "grad_norm": 0.39189843385974144, + "learning_rate": 3.6560671583635467e-06, + "logits/chosen": -0.5146514177322388, + "logits/rejected": -0.5417618751525879, + "logps/chosen": -1075.363037109375, + "logps/rejected": -1146.649658203125, + "loss": 0.029, + "num_input_tokens_seen": 124495872, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.696915626525879, + "rewards/margins": 6.73520565032959, + "rewards/rejected": -11.432121276855469, + "step": 378 + }, + { + "epoch": 1.4481685420158008, + "grad_norm": 0.3998531026802172, + "learning_rate": 3.608755551362243e-06, + "logits/chosen": -0.5792596340179443, + "logits/rejected": -0.5624982118606567, + "logps/chosen": -1060.078125, + "logps/rejected": -1112.68310546875, + "loss": 0.031, + "num_input_tokens_seen": 124816192, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.957347393035889, + "rewards/margins": 6.82720947265625, + "rewards/rejected": -11.784557342529297, + "step": 379 + }, + { + "epoch": 1.4519990423749103, + "grad_norm": 0.17911486856466677, + "learning_rate": 3.5616845711020876e-06, + "logits/chosen": -0.5854830741882324, + "logits/rejected": -0.5884844064712524, + "logps/chosen": -1100.80078125, + "logps/rejected": -1166.805908203125, + "loss": 0.0138, + "num_input_tokens_seen": 125151904, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.580303192138672, + "rewards/margins": 7.175595760345459, + "rewards/rejected": -11.755899429321289, + "step": 380 + }, + { + "epoch": 1.4558295427340195, + "grad_norm": 0.22631360906965453, + "learning_rate": 3.514855989778876e-06, + "logits/chosen": -0.5366742610931396, + "logits/rejected": -0.5388984680175781, + "logps/chosen": -1131.1658935546875, + "logps/rejected": -1205.607666015625, + "loss": 0.0158, + "num_input_tokens_seen": 125488480, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.635239124298096, + "rewards/margins": 6.933506965637207, + "rewards/rejected": -11.568745613098145, + "step": 381 + }, + { + "epoch": 1.459660043093129, + "grad_norm": 0.218678206816463, + "learning_rate": 3.468271570462235e-06, + "logits/chosen": -0.6020558476448059, + "logits/rejected": -0.6324237585067749, + "logps/chosen": -1125.540771484375, + "logps/rejected": -1194.388916015625, + "loss": 0.0133, + "num_input_tokens_seen": 125830016, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.89791202545166, + "rewards/margins": 7.153135776519775, + "rewards/rejected": -12.051048278808594, + "step": 382 + }, + { + "epoch": 1.4634905434522385, + "grad_norm": 0.27157692143094103, + "learning_rate": 3.4219330670292137e-06, + "logits/chosen": -0.5100884437561035, + "logits/rejected": -0.5211885571479797, + "logps/chosen": -1073.5001220703125, + "logps/rejected": -1122.3511962890625, + "loss": 0.0229, + "num_input_tokens_seen": 126146784, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.999073028564453, + "rewards/margins": 6.166782379150391, + "rewards/rejected": -11.16585636138916, + "step": 383 + }, + { + "epoch": 1.467321043811348, + "grad_norm": 0.25366254348824563, + "learning_rate": 3.3758422240982814e-06, + "logits/chosen": -0.47184768319129944, + "logits/rejected": -0.47867125272750854, + "logps/chosen": -1085.977783203125, + "logps/rejected": -1125.033447265625, + "loss": 0.0217, + "num_input_tokens_seen": 126478176, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4813313484191895, + "rewards/margins": 6.321496486663818, + "rewards/rejected": -10.802828788757324, + "step": 384 + }, + { + "epoch": 1.4711515441704572, + "grad_norm": 0.26135394230818676, + "learning_rate": 3.3300007769636268e-06, + "logits/chosen": -0.5334382057189941, + "logits/rejected": -0.5414412617683411, + "logps/chosen": -1069.052001953125, + "logps/rejected": -1125.650146484375, + "loss": 0.0208, + "num_input_tokens_seen": 126804992, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.102548599243164, + "rewards/margins": 6.515319347381592, + "rewards/rejected": -11.617868423461914, + "step": 385 + }, + { + "epoch": 1.4749820445295667, + "grad_norm": 0.3742529933567009, + "learning_rate": 3.284410451529816e-06, + "logits/chosen": -0.570806086063385, + "logits/rejected": -0.5752460956573486, + "logps/chosen": -1097.5703125, + "logps/rejected": -1141.503662109375, + "loss": 0.0294, + "num_input_tokens_seen": 127127552, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.973342418670654, + "rewards/margins": 6.87398624420166, + "rewards/rejected": -11.847328186035156, + "step": 386 + }, + { + "epoch": 1.478812544888676, + "grad_norm": 0.5393702465685286, + "learning_rate": 3.239072964246842e-06, + "logits/chosen": -0.5785092115402222, + "logits/rejected": -0.5873132348060608, + "logps/chosen": -1123.692138671875, + "logps/rejected": -1174.1474609375, + "loss": 0.0379, + "num_input_tokens_seen": 127454624, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -4.580497741699219, + "rewards/margins": 7.4736175537109375, + "rewards/rejected": -12.054115295410156, + "step": 387 + }, + { + "epoch": 1.4826430452477855, + "grad_norm": 0.1515373367542113, + "learning_rate": 3.19399002204547e-06, + "logits/chosen": -0.5319044589996338, + "logits/rejected": -0.5347077250480652, + "logps/chosen": -1082.864013671875, + "logps/rejected": -1143.107177734375, + "loss": 0.012, + "num_input_tokens_seen": 127777664, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.843909740447998, + "rewards/margins": 6.903135299682617, + "rewards/rejected": -11.74704360961914, + "step": 388 + }, + { + "epoch": 1.486473545606895, + "grad_norm": 0.5325301863346185, + "learning_rate": 3.149163322272998e-06, + "logits/chosen": -0.5967922806739807, + "logits/rejected": -0.625117301940918, + "logps/chosen": -1121.570556640625, + "logps/rejected": -1174.13818359375, + "loss": 0.0492, + "num_input_tokens_seen": 128114400, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -4.88701868057251, + "rewards/margins": 7.042975425720215, + "rewards/rejected": -11.929994583129883, + "step": 389 + }, + { + "epoch": 1.4903040459660044, + "grad_norm": 0.5215677947359276, + "learning_rate": 3.1045945526293307e-06, + "logits/chosen": -0.5355928540229797, + "logits/rejected": -0.5453197360038757, + "logps/chosen": -1127.862548828125, + "logps/rejected": -1174.4453125, + "loss": 0.05, + "num_input_tokens_seen": 128449920, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.660383224487305, + "rewards/margins": 6.527249336242676, + "rewards/rejected": -11.18763256072998, + "step": 390 + }, + { + "epoch": 1.4941345463251137, + "grad_norm": 0.34422332561157504, + "learning_rate": 3.0602853911034623e-06, + "logits/chosen": -0.5744361877441406, + "logits/rejected": -0.5820431709289551, + "logps/chosen": -1076.0799560546875, + "logps/rejected": -1123.80517578125, + "loss": 0.029, + "num_input_tokens_seen": 128773984, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.747982978820801, + "rewards/margins": 6.769466400146484, + "rewards/rejected": -11.517449378967285, + "step": 391 + }, + { + "epoch": 1.4979650466842231, + "grad_norm": 0.2654262528468635, + "learning_rate": 3.016237505910272e-06, + "logits/chosen": -0.5949314832687378, + "logits/rejected": -0.6056872010231018, + "logps/chosen": -1096.0980224609375, + "logps/rejected": -1159.4710693359375, + "loss": 0.0183, + "num_input_tokens_seen": 129096640, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.150325775146484, + "rewards/margins": 6.922772407531738, + "rewards/rejected": -12.073097229003906, + "step": 392 + }, + { + "epoch": 1.5017955470433324, + "grad_norm": 0.40499643869828833, + "learning_rate": 2.9724525554277494e-06, + "logits/chosen": -0.5234485268592834, + "logits/rejected": -0.5431954860687256, + "logps/chosen": -1106.44677734375, + "logps/rejected": -1142.529541015625, + "loss": 0.0362, + "num_input_tokens_seen": 129422848, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.012673854827881, + "rewards/margins": 6.529794692993164, + "rewards/rejected": -11.542468070983887, + "step": 393 + }, + { + "epoch": 1.5056260474024419, + "grad_norm": 0.1408062990072125, + "learning_rate": 2.9289321881345257e-06, + "logits/chosen": -0.5465223789215088, + "logits/rejected": -0.5410748720169067, + "logps/chosen": -1110.9638671875, + "logps/rejected": -1131.211181640625, + "loss": 0.0095, + "num_input_tokens_seen": 129759776, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.259383201599121, + "rewards/margins": 7.167115211486816, + "rewards/rejected": -11.426498413085938, + "step": 394 + }, + { + "epoch": 1.5094565477615514, + "grad_norm": 0.6464203064171667, + "learning_rate": 2.8856780425478346e-06, + "logits/chosen": -0.5989202857017517, + "logits/rejected": -0.6086163520812988, + "logps/chosen": -1111.2257080078125, + "logps/rejected": -1164.850830078125, + "loss": 0.0573, + "num_input_tokens_seen": 130088320, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -4.739720344543457, + "rewards/margins": 6.607801914215088, + "rewards/rejected": -11.347522735595703, + "step": 395 + }, + { + "epoch": 1.5132870481206608, + "grad_norm": 0.1595461544117147, + "learning_rate": 2.8426917471618144e-06, + "logits/chosen": -0.5773784518241882, + "logits/rejected": -0.578612208366394, + "logps/chosen": -1066.836669921875, + "logps/rejected": -1132.3533935546875, + "loss": 0.0123, + "num_input_tokens_seen": 130413568, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.440546035766602, + "rewards/margins": 7.347493648529053, + "rewards/rejected": -12.78803825378418, + "step": 396 + }, + { + "epoch": 1.5171175484797703, + "grad_norm": 0.5231413230947265, + "learning_rate": 2.799974920386184e-06, + "logits/chosen": -0.5126875638961792, + "logits/rejected": -0.5231764316558838, + "logps/chosen": -1129.4285888671875, + "logps/rejected": -1165.776611328125, + "loss": 0.0404, + "num_input_tokens_seen": 130750496, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.934070587158203, + "rewards/margins": 6.931783676147461, + "rewards/rejected": -10.865854263305664, + "step": 397 + }, + { + "epoch": 1.5209480488388796, + "grad_norm": 0.34007075877949855, + "learning_rate": 2.7575291704853325e-06, + "logits/chosen": -0.5731176137924194, + "logits/rejected": -0.5790687203407288, + "logps/chosen": -1122.06689453125, + "logps/rejected": -1163.4764404296875, + "loss": 0.0225, + "num_input_tokens_seen": 131082080, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.540215492248535, + "rewards/margins": 7.143531799316406, + "rewards/rejected": -11.683747291564941, + "step": 398 + }, + { + "epoch": 1.5247785491979888, + "grad_norm": 0.25528506999812345, + "learning_rate": 2.7153560955177484e-06, + "logits/chosen": -0.5958976745605469, + "logits/rejected": -0.5888385772705078, + "logps/chosen": -1097.54052734375, + "logps/rejected": -1142.61181640625, + "loss": 0.0198, + "num_input_tokens_seen": 131404768, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.749542236328125, + "rewards/margins": 6.860627174377441, + "rewards/rejected": -11.61016845703125, + "step": 399 + }, + { + "epoch": 1.5286090495570983, + "grad_norm": 0.2288268890289359, + "learning_rate": 2.673457283275873e-06, + "logits/chosen": -0.5699641704559326, + "logits/rejected": -0.5615939497947693, + "logps/chosen": -1106.812255859375, + "logps/rejected": -1151.32275390625, + "loss": 0.0149, + "num_input_tokens_seen": 131727936, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.680929183959961, + "rewards/margins": 7.113523006439209, + "rewards/rejected": -11.794452667236328, + "step": 400 + }, + { + "epoch": 1.5324395499162078, + "grad_norm": 0.41095209817336625, + "learning_rate": 2.6318343112263014e-06, + "logits/chosen": -0.5620801448822021, + "logits/rejected": -0.5593371987342834, + "logps/chosen": -1107.0919189453125, + "logps/rejected": -1155.540283203125, + "loss": 0.0318, + "num_input_tokens_seen": 132056608, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.892297744750977, + "rewards/margins": 6.485471725463867, + "rewards/rejected": -11.377769470214844, + "step": 401 + }, + { + "epoch": 1.5362700502753173, + "grad_norm": 0.2297318095404972, + "learning_rate": 2.5904887464504115e-06, + "logits/chosen": -0.5357888340950012, + "logits/rejected": -0.5443666577339172, + "logps/chosen": -1075.8251953125, + "logps/rejected": -1139.0931396484375, + "loss": 0.0165, + "num_input_tokens_seen": 132391296, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.498277187347412, + "rewards/margins": 6.888402938842773, + "rewards/rejected": -11.386680603027344, + "step": 402 + }, + { + "epoch": 1.5401005506344267, + "grad_norm": 0.5162892369789231, + "learning_rate": 2.5494221455853407e-06, + "logits/chosen": -0.5672221779823303, + "logits/rejected": -0.5816081762313843, + "logps/chosen": -1111.1781005859375, + "logps/rejected": -1180.8359375, + "loss": 0.0513, + "num_input_tokens_seen": 132735296, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.703047275543213, + "rewards/margins": 6.575906753540039, + "rewards/rejected": -11.278953552246094, + "step": 403 + }, + { + "epoch": 1.543931050993536, + "grad_norm": 0.4723693953324068, + "learning_rate": 2.5086360547654088e-06, + "logits/chosen": -0.545316755771637, + "logits/rejected": -0.5594050288200378, + "logps/chosen": -1087.4837646484375, + "logps/rejected": -1137.5257568359375, + "loss": 0.0338, + "num_input_tokens_seen": 133069568, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.7968244552612305, + "rewards/margins": 6.531520843505859, + "rewards/rejected": -11.32834529876709, + "step": 404 + }, + { + "epoch": 1.5477615513526455, + "grad_norm": 0.2943995914064969, + "learning_rate": 2.4681320095638763e-06, + "logits/chosen": -0.5688447952270508, + "logits/rejected": -0.5940778255462646, + "logps/chosen": -1121.266845703125, + "logps/rejected": -1207.34619140625, + "loss": 0.0227, + "num_input_tokens_seen": 133409952, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2713775634765625, + "rewards/margins": 6.521623611450195, + "rewards/rejected": -11.793001174926758, + "step": 405 + }, + { + "epoch": 1.5515920517117547, + "grad_norm": 0.5573224426617283, + "learning_rate": 2.4279115349351546e-06, + "logits/chosen": -0.5556387901306152, + "logits/rejected": -0.5656312704086304, + "logps/chosen": -1149.8804931640625, + "logps/rejected": -1203.7596435546875, + "loss": 0.0471, + "num_input_tokens_seen": 133752672, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -4.28227424621582, + "rewards/margins": 6.8423614501953125, + "rewards/rejected": -11.124635696411133, + "step": 406 + }, + { + "epoch": 1.5554225520708642, + "grad_norm": 0.21070364330821775, + "learning_rate": 2.3879761451573834e-06, + "logits/chosen": -0.5693101286888123, + "logits/rejected": -0.5738952159881592, + "logps/chosen": -1105.377685546875, + "logps/rejected": -1163.494384765625, + "loss": 0.0154, + "num_input_tokens_seen": 134090848, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.968280792236328, + "rewards/margins": 6.8309783935546875, + "rewards/rejected": -11.799259185791016, + "step": 407 + }, + { + "epoch": 1.5592530524299737, + "grad_norm": 0.27589950723853224, + "learning_rate": 2.3483273437754106e-06, + "logits/chosen": -0.5726186633110046, + "logits/rejected": -0.5971205830574036, + "logps/chosen": -1088.2452392578125, + "logps/rejected": -1124.00537109375, + "loss": 0.0217, + "num_input_tokens_seen": 134410240, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.741251468658447, + "rewards/margins": 7.028359413146973, + "rewards/rejected": -11.769611358642578, + "step": 408 + }, + { + "epoch": 1.5630835527890832, + "grad_norm": 0.4935329198543234, + "learning_rate": 2.3089666235442055e-06, + "logits/chosen": -0.5436748266220093, + "logits/rejected": -0.5720666646957397, + "logps/chosen": -1154.5625, + "logps/rejected": -1199.4609375, + "loss": 0.0414, + "num_input_tokens_seen": 134747712, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.399138450622559, + "rewards/margins": 6.799582481384277, + "rewards/rejected": -11.198720932006836, + "step": 409 + }, + { + "epoch": 1.5669140531481924, + "grad_norm": 0.4729996547988146, + "learning_rate": 2.26989546637263e-06, + "logits/chosen": -0.5649195313453674, + "logits/rejected": -0.5647789835929871, + "logps/chosen": -1104.062255859375, + "logps/rejected": -1150.8553466796875, + "loss": 0.0374, + "num_input_tokens_seen": 135069120, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.846754550933838, + "rewards/margins": 6.685153007507324, + "rewards/rejected": -11.53190803527832, + "step": 410 + }, + { + "epoch": 1.570744553507302, + "grad_norm": 0.5185036903972016, + "learning_rate": 2.231115343267677e-06, + "logits/chosen": -0.5803587436676025, + "logits/rejected": -0.5868566036224365, + "logps/chosen": -1096.200927734375, + "logps/rejected": -1170.959716796875, + "loss": 0.0452, + "num_input_tokens_seen": 135390720, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -4.98620080947876, + "rewards/margins": 7.045351505279541, + "rewards/rejected": -12.0315523147583, + "step": 411 + }, + { + "epoch": 1.5745750538664112, + "grad_norm": 0.4777477123225856, + "learning_rate": 2.1926277142790554e-06, + "logits/chosen": -0.5112602710723877, + "logits/rejected": -0.501746416091919, + "logps/chosen": -1100.30810546875, + "logps/rejected": -1144.878662109375, + "loss": 0.0385, + "num_input_tokens_seen": 135723456, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.7598042488098145, + "rewards/margins": 6.656907081604004, + "rewards/rejected": -11.416711807250977, + "step": 412 + }, + { + "epoch": 1.5784055542255206, + "grad_norm": 0.16736441206484248, + "learning_rate": 2.15443402844425e-06, + "logits/chosen": -0.6111674308776855, + "logits/rejected": -0.613297700881958, + "logps/chosen": -1091.212646484375, + "logps/rejected": -1140.66064453125, + "loss": 0.0118, + "num_input_tokens_seen": 136044576, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.286986351013184, + "rewards/margins": 7.031262397766113, + "rewards/rejected": -11.318248748779297, + "step": 413 + }, + { + "epoch": 1.5822360545846301, + "grad_norm": 0.34500832991598307, + "learning_rate": 2.116535723733938e-06, + "logits/chosen": -0.5881718993186951, + "logits/rejected": -0.5924442410469055, + "logps/chosen": -1069.9156494140625, + "logps/rejected": -1149.0899658203125, + "loss": 0.0238, + "num_input_tokens_seen": 136378048, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.9878106117248535, + "rewards/margins": 6.723343372344971, + "rewards/rejected": -11.71115493774414, + "step": 414 + }, + { + "epoch": 1.5860665549437396, + "grad_norm": 0.29475724264355657, + "learning_rate": 2.0789342269978785e-06, + "logits/chosen": -0.48956170678138733, + "logits/rejected": -0.501071572303772, + "logps/chosen": -1032.8370361328125, + "logps/rejected": -1097.5018310546875, + "loss": 0.0221, + "num_input_tokens_seen": 136698144, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.923164367675781, + "rewards/margins": 6.276226043701172, + "rewards/rejected": -11.199390411376953, + "step": 415 + }, + { + "epoch": 1.589897055302849, + "grad_norm": 0.21208533529199725, + "learning_rate": 2.0416309539111656e-06, + "logits/chosen": -0.5635477304458618, + "logits/rejected": -0.5700153112411499, + "logps/chosen": -1124.43798828125, + "logps/rejected": -1188.4461669921875, + "loss": 0.0165, + "num_input_tokens_seen": 137033856, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.591968536376953, + "rewards/margins": 7.096621036529541, + "rewards/rejected": -11.688591003417969, + "step": 416 + }, + { + "epoch": 1.5937275556619583, + "grad_norm": 0.4963017066849073, + "learning_rate": 2.004627308920949e-06, + "logits/chosen": -0.5891237258911133, + "logits/rejected": -0.5888999700546265, + "logps/chosen": -1117.8046875, + "logps/rejected": -1150.2198486328125, + "loss": 0.0306, + "num_input_tokens_seen": 137373344, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.079355239868164, + "rewards/margins": 6.654707431793213, + "rewards/rejected": -11.734062194824219, + "step": 417 + }, + { + "epoch": 1.5975580560210676, + "grad_norm": 0.3496768956205511, + "learning_rate": 1.967924685193552e-06, + "logits/chosen": -0.5620278120040894, + "logits/rejected": -0.5710635185241699, + "logps/chosen": -1097.093505859375, + "logps/rejected": -1144.1990966796875, + "loss": 0.0298, + "num_input_tokens_seen": 137703808, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.881750106811523, + "rewards/margins": 6.306491851806641, + "rewards/rejected": -11.188241004943848, + "step": 418 + }, + { + "epoch": 1.601388556380177, + "grad_norm": 0.5348010473626033, + "learning_rate": 1.9315244645620066e-06, + "logits/chosen": -0.5030189752578735, + "logits/rejected": -0.5106398463249207, + "logps/chosen": -1091.509521484375, + "logps/rejected": -1153.1097412109375, + "loss": 0.051, + "num_input_tokens_seen": 138037472, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.06821870803833, + "rewards/margins": 6.238846778869629, + "rewards/rejected": -11.307065963745117, + "step": 419 + }, + { + "epoch": 1.6052190567392866, + "grad_norm": 0.30907427909420404, + "learning_rate": 1.8954280174740536e-06, + "logits/chosen": -0.5922611951828003, + "logits/rejected": -0.5921366810798645, + "logps/chosen": -1081.569580078125, + "logps/rejected": -1183.20361328125, + "loss": 0.0216, + "num_input_tokens_seen": 138371904, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9829421043396, + "rewards/margins": 7.039853572845459, + "rewards/rejected": -12.022794723510742, + "step": 420 + }, + { + "epoch": 1.609049557098396, + "grad_norm": 0.42281715060043973, + "learning_rate": 1.859636702940516e-06, + "logits/chosen": -0.5663852095603943, + "logits/rejected": -0.5629314184188843, + "logps/chosen": -1080.8397216796875, + "logps/rejected": -1164.36328125, + "loss": 0.033, + "num_input_tokens_seen": 138686880, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -5.151627063751221, + "rewards/margins": 6.807131767272949, + "rewards/rejected": -11.958758354187012, + "step": 421 + }, + { + "epoch": 1.6128800574575055, + "grad_norm": 0.31405588397889617, + "learning_rate": 1.8241518684841642e-06, + "logits/chosen": -0.4854164719581604, + "logits/rejected": -0.5098618268966675, + "logps/chosen": -1089.078125, + "logps/rejected": -1130.2281494140625, + "loss": 0.0167, + "num_input_tokens_seen": 139010656, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.288321495056152, + "rewards/margins": 6.394042015075684, + "rewards/rejected": -11.682364463806152, + "step": 422 + }, + { + "epoch": 1.6167105578166148, + "grad_norm": 0.14836751406960216, + "learning_rate": 1.7889748500889537e-06, + "logits/chosen": -0.558589518070221, + "logits/rejected": -0.5706835985183716, + "logps/chosen": -1139.810302734375, + "logps/rejected": -1158.46240234375, + "loss": 0.0122, + "num_input_tokens_seen": 139338848, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.443422317504883, + "rewards/margins": 6.693099498748779, + "rewards/rejected": -11.13652229309082, + "step": 423 + }, + { + "epoch": 1.6205410581757242, + "grad_norm": 0.23666567844188544, + "learning_rate": 1.7541069721497494e-06, + "logits/chosen": -0.6211961507797241, + "logits/rejected": -0.6183690428733826, + "logps/chosen": -1102.187255859375, + "logps/rejected": -1165.70068359375, + "loss": 0.0177, + "num_input_tokens_seen": 139674496, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9707794189453125, + "rewards/margins": 6.808759689331055, + "rewards/rejected": -11.779539108276367, + "step": 424 + }, + { + "epoch": 1.6243715585348335, + "grad_norm": 0.20376530719357738, + "learning_rate": 1.7195495474224433e-06, + "logits/chosen": -0.5395438075065613, + "logits/rejected": -0.5555013418197632, + "logps/chosen": -1046.8074951171875, + "logps/rejected": -1111.6490478515625, + "loss": 0.0169, + "num_input_tokens_seen": 139998272, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.743521690368652, + "rewards/margins": 6.67349100112915, + "rewards/rejected": -11.417012214660645, + "step": 425 + }, + { + "epoch": 1.628202058893943, + "grad_norm": 0.41239614206585445, + "learning_rate": 1.6853038769745466e-06, + "logits/chosen": -0.5868422389030457, + "logits/rejected": -0.5923633575439453, + "logps/chosen": -1106.877197265625, + "logps/rejected": -1158.01708984375, + "loss": 0.0388, + "num_input_tokens_seen": 140330400, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.1108245849609375, + "rewards/margins": 6.456360816955566, + "rewards/rejected": -11.56718635559082, + "step": 426 + }, + { + "epoch": 1.6320325592530525, + "grad_norm": 0.4698122927488403, + "learning_rate": 1.6513712501362e-06, + "logits/chosen": -0.5429418087005615, + "logits/rejected": -0.5484691858291626, + "logps/chosen": -1114.943115234375, + "logps/rejected": -1146.5546875, + "loss": 0.0308, + "num_input_tokens_seen": 140656416, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.050082206726074, + "rewards/margins": 6.422872543334961, + "rewards/rejected": -11.472955703735352, + "step": 427 + }, + { + "epoch": 1.635863059612162, + "grad_norm": 0.47819554120337177, + "learning_rate": 1.6177529444516193e-06, + "logits/chosen": -0.556840717792511, + "logits/rejected": -0.5600153207778931, + "logps/chosen": -1086.291259765625, + "logps/rejected": -1128.2412109375, + "loss": 0.0426, + "num_input_tokens_seen": 140977472, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.079714298248291, + "rewards/margins": 6.417318344116211, + "rewards/rejected": -11.497032165527344, + "step": 428 + }, + { + "epoch": 1.6396935599712712, + "grad_norm": 0.171324256383831, + "learning_rate": 1.5844502256310169e-06, + "logits/chosen": -0.6063379049301147, + "logits/rejected": -0.620380163192749, + "logps/chosen": -1135.731689453125, + "logps/rejected": -1150.0008544921875, + "loss": 0.0115, + "num_input_tokens_seen": 141300224, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.947619438171387, + "rewards/margins": 7.059733867645264, + "rewards/rejected": -12.007354736328125, + "step": 429 + }, + { + "epoch": 1.6435240603303807, + "grad_norm": 0.40520089913306, + "learning_rate": 1.551464347502929e-06, + "logits/chosen": -0.5612608790397644, + "logits/rejected": -0.5548451542854309, + "logps/chosen": -1067.74853515625, + "logps/rejected": -1116.236572265625, + "loss": 0.0276, + "num_input_tokens_seen": 141631616, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.120044708251953, + "rewards/margins": 6.634326934814453, + "rewards/rejected": -11.754371643066406, + "step": 430 + }, + { + "epoch": 1.64735456068949, + "grad_norm": 0.3419479162919705, + "learning_rate": 1.518796551967029e-06, + "logits/chosen": -0.5352628231048584, + "logits/rejected": -0.5662381649017334, + "logps/chosen": -1070.580322265625, + "logps/rejected": -1119.461669921875, + "loss": 0.0277, + "num_input_tokens_seen": 141948576, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.697789192199707, + "rewards/margins": 6.997978210449219, + "rewards/rejected": -11.695768356323242, + "step": 431 + }, + { + "epoch": 1.6511850610485994, + "grad_norm": 0.5228002169618429, + "learning_rate": 1.486448068947348e-06, + "logits/chosen": -0.5613080263137817, + "logits/rejected": -0.5504803657531738, + "logps/chosen": -1105.53466796875, + "logps/rejected": -1171.393798828125, + "loss": 0.0403, + "num_input_tokens_seen": 142276928, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.9683122634887695, + "rewards/margins": 6.584293842315674, + "rewards/rejected": -11.552606582641602, + "step": 432 + }, + { + "epoch": 1.655015561407709, + "grad_norm": 0.4475902406351702, + "learning_rate": 1.454420116345996e-06, + "logits/chosen": -0.6034342646598816, + "logits/rejected": -0.607345461845398, + "logps/chosen": -1109.489501953125, + "logps/rejected": -1168.3642578125, + "loss": 0.0323, + "num_input_tokens_seen": 142609120, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.851648807525635, + "rewards/margins": 6.416128158569336, + "rewards/rejected": -11.267777442932129, + "step": 433 + }, + { + "epoch": 1.6588460617668184, + "grad_norm": 0.1450957888471208, + "learning_rate": 1.4227138999972801e-06, + "logits/chosen": -0.5670690536499023, + "logits/rejected": -0.5877550840377808, + "logps/chosen": -1075.284912109375, + "logps/rejected": -1156.928466796875, + "loss": 0.0116, + "num_input_tokens_seen": 142934048, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.162036418914795, + "rewards/margins": 6.789186477661133, + "rewards/rejected": -11.951223373413086, + "step": 434 + }, + { + "epoch": 1.6626765621259278, + "grad_norm": 0.4263540209378264, + "learning_rate": 1.3913306136223292e-06, + "logits/chosen": -0.541340708732605, + "logits/rejected": -0.5401898622512817, + "logps/chosen": -1097.098388671875, + "logps/rejected": -1160.0435791015625, + "loss": 0.0277, + "num_input_tokens_seen": 143271200, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.8226423263549805, + "rewards/margins": 6.708310127258301, + "rewards/rejected": -11.530952453613281, + "step": 435 + }, + { + "epoch": 1.666507062485037, + "grad_norm": 0.2297703900517814, + "learning_rate": 1.3602714387841332e-06, + "logits/chosen": -0.5570739507675171, + "logits/rejected": -0.5700039863586426, + "logps/chosen": -1105.8616943359375, + "logps/rejected": -1186.8131103515625, + "loss": 0.0194, + "num_input_tokens_seen": 143614752, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.141989707946777, + "rewards/margins": 6.4582133293151855, + "rewards/rejected": -11.600203514099121, + "step": 436 + }, + { + "epoch": 1.6703375628441464, + "grad_norm": 0.29164881842452245, + "learning_rate": 1.3295375448430726e-06, + "logits/chosen": -0.5834655165672302, + "logits/rejected": -0.5806407928466797, + "logps/chosen": -1140.192138671875, + "logps/rejected": -1210.254638671875, + "loss": 0.0195, + "num_input_tokens_seen": 143946976, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.04089879989624, + "rewards/margins": 6.979633331298828, + "rewards/rejected": -12.020532608032227, + "step": 437 + }, + { + "epoch": 1.6741680632032558, + "grad_norm": 0.2917815103292045, + "learning_rate": 1.2991300889128867e-06, + "logits/chosen": -0.5652339458465576, + "logits/rejected": -0.5765770673751831, + "logps/chosen": -1062.804443359375, + "logps/rejected": -1141.0390625, + "loss": 0.0204, + "num_input_tokens_seen": 144278144, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.920722484588623, + "rewards/margins": 6.914840221405029, + "rewards/rejected": -11.835561752319336, + "step": 438 + }, + { + "epoch": 1.6779985635623653, + "grad_norm": 0.39907853763876344, + "learning_rate": 1.2690502158170992e-06, + "logits/chosen": -0.5544623732566833, + "logits/rejected": -0.5672987699508667, + "logps/chosen": -1113.116455078125, + "logps/rejected": -1172.4766845703125, + "loss": 0.0351, + "num_input_tokens_seen": 144604384, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.644880771636963, + "rewards/margins": 6.867105484008789, + "rewards/rejected": -11.51198673248291, + "step": 439 + }, + { + "epoch": 1.6818290639214748, + "grad_norm": 0.3143940954387101, + "learning_rate": 1.2392990580459351e-06, + "logits/chosen": -0.5268656015396118, + "logits/rejected": -0.526757538318634, + "logps/chosen": -1032.41552734375, + "logps/rejected": -1121.500732421875, + "loss": 0.0152, + "num_input_tokens_seen": 144917728, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.03676700592041, + "rewards/margins": 6.832172393798828, + "rewards/rejected": -11.868940353393555, + "step": 440 + }, + { + "epoch": 1.6856595642805843, + "grad_norm": 0.27596129915697015, + "learning_rate": 1.209877735713665e-06, + "logits/chosen": -0.6310405135154724, + "logits/rejected": -0.6449214220046997, + "logps/chosen": -1089.6024169921875, + "logps/rejected": -1156.683349609375, + "loss": 0.0194, + "num_input_tokens_seen": 145238176, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.199817657470703, + "rewards/margins": 7.061276912689209, + "rewards/rejected": -12.26109504699707, + "step": 441 + }, + { + "epoch": 1.6894900646396935, + "grad_norm": 0.3881727189658637, + "learning_rate": 1.1807873565164507e-06, + "logits/chosen": -0.5954270362854004, + "logits/rejected": -0.6095632314682007, + "logps/chosen": -1103.2989501953125, + "logps/rejected": -1159.8409423828125, + "loss": 0.0194, + "num_input_tokens_seen": 145564160, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.6872758865356445, + "rewards/margins": 6.9014997482299805, + "rewards/rejected": -11.588775634765625, + "step": 442 + }, + { + "epoch": 1.693320564998803, + "grad_norm": 0.4489866984100321, + "learning_rate": 1.1520290156906222e-06, + "logits/chosen": -0.6500638723373413, + "logits/rejected": -0.6626325845718384, + "logps/chosen": -1111.8648681640625, + "logps/rejected": -1176.58642578125, + "loss": 0.0283, + "num_input_tokens_seen": 145891296, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.687326431274414, + "rewards/margins": 7.18675422668457, + "rewards/rejected": -11.874079704284668, + "step": 443 + }, + { + "epoch": 1.6971510653579123, + "grad_norm": 0.48706347525594146, + "learning_rate": 1.123603795971462e-06, + "logits/chosen": -0.5890306830406189, + "logits/rejected": -0.6106859445571899, + "logps/chosen": -1108.9781494140625, + "logps/rejected": -1160.221923828125, + "loss": 0.0309, + "num_input_tokens_seen": 146218944, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.804515838623047, + "rewards/margins": 6.966469764709473, + "rewards/rejected": -11.770986557006836, + "step": 444 + }, + { + "epoch": 1.7009815657170217, + "grad_norm": 0.42774342380278474, + "learning_rate": 1.0955127675524213e-06, + "logits/chosen": -0.5819805860519409, + "logits/rejected": -0.5939491987228394, + "logps/chosen": -1146.3912353515625, + "logps/rejected": -1198.3311767578125, + "loss": 0.0357, + "num_input_tokens_seen": 146564320, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.862114429473877, + "rewards/margins": 6.866021156311035, + "rewards/rejected": -11.728135108947754, + "step": 445 + }, + { + "epoch": 1.7048120660761312, + "grad_norm": 0.24244566550307134, + "learning_rate": 1.0677569880448479e-06, + "logits/chosen": -0.5822247266769409, + "logits/rejected": -0.5774258971214294, + "logps/chosen": -1138.51025390625, + "logps/rejected": -1197.170654296875, + "loss": 0.0204, + "num_input_tokens_seen": 146901120, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.078122615814209, + "rewards/margins": 6.692413330078125, + "rewards/rejected": -11.770536422729492, + "step": 446 + }, + { + "epoch": 1.7086425664352407, + "grad_norm": 0.1750887064126772, + "learning_rate": 1.040337502438149e-06, + "logits/chosen": -0.5720530152320862, + "logits/rejected": -0.5788049697875977, + "logps/chosen": -1109.321044921875, + "logps/rejected": -1181.944091796875, + "loss": 0.0188, + "num_input_tokens_seen": 147228736, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1328229904174805, + "rewards/margins": 6.916455268859863, + "rewards/rejected": -12.049278259277344, + "step": 447 + }, + { + "epoch": 1.71247306679435, + "grad_norm": 0.35034852259785776, + "learning_rate": 1.0132553430604608e-06, + "logits/chosen": -0.6263474225997925, + "logits/rejected": -0.624517023563385, + "logps/chosen": -1067.3369140625, + "logps/rejected": -1133.1463623046875, + "loss": 0.0211, + "num_input_tokens_seen": 147550528, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.75190544128418, + "rewards/margins": 6.651278972625732, + "rewards/rejected": -12.403183937072754, + "step": 448 + }, + { + "epoch": 1.7163035671534594, + "grad_norm": 0.6205877398995029, + "learning_rate": 9.865115295397808e-07, + "logits/chosen": -0.5426154732704163, + "logits/rejected": -0.5526891946792603, + "logps/chosen": -1110.41064453125, + "logps/rejected": -1164.3221435546875, + "loss": 0.0488, + "num_input_tokens_seen": 147885792, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.80400276184082, + "rewards/margins": 6.908002853393555, + "rewards/rejected": -11.712005615234375, + "step": 449 + }, + { + "epoch": 1.7201340675125687, + "grad_norm": 0.4135036670549277, + "learning_rate": 9.601070687655667e-07, + "logits/chosen": -0.5779565572738647, + "logits/rejected": -0.584420919418335, + "logps/chosen": -1108.6175537109375, + "logps/rejected": -1146.436767578125, + "loss": 0.0306, + "num_input_tokens_seen": 148214848, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.858382225036621, + "rewards/margins": 7.102333068847656, + "rewards/rejected": -11.960715293884277, + "step": 450 + }, + { + "epoch": 1.7239645678716782, + "grad_norm": 0.3289261014324889, + "learning_rate": 9.340429548508468e-07, + "logits/chosen": -0.5381314754486084, + "logits/rejected": -0.5400239825248718, + "logps/chosen": -1080.6904296875, + "logps/rejected": -1149.926513671875, + "loss": 0.0279, + "num_input_tokens_seen": 148543616, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.57551908493042, + "rewards/margins": 6.6705322265625, + "rewards/rejected": -12.246050834655762, + "step": 451 + }, + { + "epoch": 1.7277950682307877, + "grad_norm": 0.31848460444544546, + "learning_rate": 9.083201690947763e-07, + "logits/chosen": -0.5438213348388672, + "logits/rejected": -0.5525491237640381, + "logps/chosen": -1103.7099609375, + "logps/rejected": -1158.974609375, + "loss": 0.0249, + "num_input_tokens_seen": 148872672, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.007929801940918, + "rewards/margins": 6.430359363555908, + "rewards/rejected": -11.438289642333984, + "step": 452 + }, + { + "epoch": 1.7316255685898971, + "grad_norm": 0.297104853418433, + "learning_rate": 8.829396799457024e-07, + "logits/chosen": -0.5764440298080444, + "logits/rejected": -0.5888148546218872, + "logps/chosen": -1119.0758056640625, + "logps/rejected": -1184.151123046875, + "loss": 0.0223, + "num_input_tokens_seen": 149199488, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.064773082733154, + "rewards/margins": 6.842432022094727, + "rewards/rejected": -11.907205581665039, + "step": 453 + }, + { + "epoch": 1.7354560689490066, + "grad_norm": 0.41739692260648553, + "learning_rate": 8.579024429646932e-07, + "logits/chosen": -0.6111152768135071, + "logits/rejected": -0.6118967533111572, + "logps/chosen": -1121.3194580078125, + "logps/rejected": -1185.575439453125, + "loss": 0.0259, + "num_input_tokens_seen": 149535552, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.145281791687012, + "rewards/margins": 7.020589351654053, + "rewards/rejected": -12.165870666503906, + "step": 454 + }, + { + "epoch": 1.7392865693081159, + "grad_norm": 0.4289303700785929, + "learning_rate": 8.332094007895742e-07, + "logits/chosen": -0.5309807062149048, + "logits/rejected": -0.5524784326553345, + "logps/chosen": -1105.6787109375, + "logps/rejected": -1175.20263671875, + "loss": 0.0268, + "num_input_tokens_seen": 149863552, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.314176082611084, + "rewards/margins": 6.844779968261719, + "rewards/rejected": -12.158956527709961, + "step": 455 + }, + { + "epoch": 1.7431170696672251, + "grad_norm": 0.25544728519648785, + "learning_rate": 8.088614830994223e-07, + "logits/chosen": -0.5460496544837952, + "logits/rejected": -0.5635803937911987, + "logps/chosen": -1098.06787109375, + "logps/rejected": -1151.13525390625, + "loss": 0.0178, + "num_input_tokens_seen": 150189216, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.955458164215088, + "rewards/margins": 7.0177226066589355, + "rewards/rejected": -11.973180770874023, + "step": 456 + }, + { + "epoch": 1.7469475700263346, + "grad_norm": 0.27646682544036993, + "learning_rate": 7.848596065795822e-07, + "logits/chosen": -0.5718632936477661, + "logits/rejected": -0.5840321183204651, + "logps/chosen": -1076.5703125, + "logps/rejected": -1142.8958740234375, + "loss": 0.019, + "num_input_tokens_seen": 150520128, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.268486499786377, + "rewards/margins": 6.9232096672058105, + "rewards/rejected": -12.191696166992188, + "step": 457 + }, + { + "epoch": 1.750778070385444, + "grad_norm": 0.31026005509319365, + "learning_rate": 7.612046748871327e-07, + "logits/chosen": -0.612336277961731, + "logits/rejected": -0.6164854764938354, + "logps/chosen": -1071.437744140625, + "logps/rejected": -1139.8594970703125, + "loss": 0.0262, + "num_input_tokens_seen": 150842400, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.363009452819824, + "rewards/margins": 6.673300266265869, + "rewards/rejected": -12.036310195922852, + "step": 458 + }, + { + "epoch": 1.7546085707445536, + "grad_norm": 0.1761303793923608, + "learning_rate": 7.378975786168862e-07, + "logits/chosen": -0.5669342279434204, + "logits/rejected": -0.5744572281837463, + "logps/chosen": -1049.119873046875, + "logps/rejected": -1118.1866455078125, + "loss": 0.0136, + "num_input_tokens_seen": 151153632, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.768707275390625, + "rewards/margins": 7.130823135375977, + "rewards/rejected": -11.899530410766602, + "step": 459 + }, + { + "epoch": 1.758439071103663, + "grad_norm": 0.2807412754052514, + "learning_rate": 7.149391952678453e-07, + "logits/chosen": -0.5908502340316772, + "logits/rejected": -0.591762900352478, + "logps/chosen": -1092.927734375, + "logps/rejected": -1128.2021484375, + "loss": 0.0177, + "num_input_tokens_seen": 151481120, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.969484329223633, + "rewards/margins": 6.945247650146484, + "rewards/rejected": -11.914731979370117, + "step": 460 + }, + { + "epoch": 1.7622695714627723, + "grad_norm": 0.3702438361901646, + "learning_rate": 6.923303892101629e-07, + "logits/chosen": -0.5917842388153076, + "logits/rejected": -0.5703705549240112, + "logps/chosen": -1091.193603515625, + "logps/rejected": -1158.2886962890625, + "loss": 0.0293, + "num_input_tokens_seen": 151812512, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.422772407531738, + "rewards/margins": 6.562808990478516, + "rewards/rejected": -11.985581398010254, + "step": 461 + }, + { + "epoch": 1.7661000718218818, + "grad_norm": 0.4512780176460302, + "learning_rate": 6.700720116526116e-07, + "logits/chosen": -0.5994899868965149, + "logits/rejected": -0.615473210811615, + "logps/chosen": -1124.9935302734375, + "logps/rejected": -1175.539794921875, + "loss": 0.0318, + "num_input_tokens_seen": 152140000, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.803532600402832, + "rewards/margins": 6.8885345458984375, + "rewards/rejected": -11.692068099975586, + "step": 462 + }, + { + "epoch": 1.769930572180991, + "grad_norm": 0.31355089845953826, + "learning_rate": 6.481649006105239e-07, + "logits/chosen": -0.6010457873344421, + "logits/rejected": -0.594322681427002, + "logps/chosen": -1090.3466796875, + "logps/rejected": -1160.83837890625, + "loss": 0.0206, + "num_input_tokens_seen": 152463584, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.109030246734619, + "rewards/margins": 6.9955925941467285, + "rewards/rejected": -12.104622840881348, + "step": 463 + }, + { + "epoch": 1.7737610725401005, + "grad_norm": 0.3465421963894818, + "learning_rate": 6.266098808742515e-07, + "logits/chosen": -0.5488612055778503, + "logits/rejected": -0.5536898970603943, + "logps/chosen": -1109.115966796875, + "logps/rejected": -1146.42431640625, + "loss": 0.0256, + "num_input_tokens_seen": 152791584, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.001533031463623, + "rewards/margins": 6.720609664916992, + "rewards/rejected": -11.722143173217773, + "step": 464 + }, + { + "epoch": 1.77759157289921, + "grad_norm": 0.28145428546234696, + "learning_rate": 6.054077639781009e-07, + "logits/chosen": -0.622146725654602, + "logits/rejected": -0.6340442895889282, + "logps/chosen": -1065.704345703125, + "logps/rejected": -1132.8978271484375, + "loss": 0.0178, + "num_input_tokens_seen": 153119264, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.24699592590332, + "rewards/margins": 6.64593505859375, + "rewards/rejected": -11.89293098449707, + "step": 465 + }, + { + "epoch": 1.7814220732583195, + "grad_norm": 0.5850918893598807, + "learning_rate": 5.845593481697931e-07, + "logits/chosen": -0.5338524580001831, + "logits/rejected": -0.5375256538391113, + "logps/chosen": -1048.61376953125, + "logps/rejected": -1096.6982421875, + "loss": 0.046, + "num_input_tokens_seen": 153430272, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -5.032464027404785, + "rewards/margins": 6.751783847808838, + "rewards/rejected": -11.784248352050781, + "step": 466 + }, + { + "epoch": 1.7852525736174287, + "grad_norm": 0.21964540335272656, + "learning_rate": 5.640654183803962e-07, + "logits/chosen": -0.5285056233406067, + "logits/rejected": -0.562720000743866, + "logps/chosen": -1070.579345703125, + "logps/rejected": -1099.959716796875, + "loss": 0.0186, + "num_input_tokens_seen": 153752128, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.171965599060059, + "rewards/margins": 6.8383355140686035, + "rewards/rejected": -12.01030158996582, + "step": 467 + }, + { + "epoch": 1.7890830739765382, + "grad_norm": 0.21029322341019102, + "learning_rate": 5.439267461947884e-07, + "logits/chosen": -0.5642714500427246, + "logits/rejected": -0.5721608996391296, + "logps/chosen": -1112.114501953125, + "logps/rejected": -1173.99462890625, + "loss": 0.0117, + "num_input_tokens_seen": 154088384, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.179225444793701, + "rewards/margins": 6.717039585113525, + "rewards/rejected": -11.896265029907227, + "step": 468 + }, + { + "epoch": 1.7929135743356475, + "grad_norm": 0.2618768617225922, + "learning_rate": 5.241440898225891e-07, + "logits/chosen": -0.6088806986808777, + "logits/rejected": -0.6048654317855835, + "logps/chosen": -1101.7752685546875, + "logps/rejected": -1162.05029296875, + "loss": 0.0198, + "num_input_tokens_seen": 154422560, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.108433723449707, + "rewards/margins": 6.6637773513793945, + "rewards/rejected": -11.772211074829102, + "step": 469 + }, + { + "epoch": 1.796744074694757, + "grad_norm": 0.5004405771873324, + "learning_rate": 5.047181940696333e-07, + "logits/chosen": -0.6143144369125366, + "logits/rejected": -0.642931342124939, + "logps/chosen": -1118.310791015625, + "logps/rejected": -1164.6956787109375, + "loss": 0.0242, + "num_input_tokens_seen": 154766336, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.942951679229736, + "rewards/margins": 7.233571529388428, + "rewards/rejected": -12.176523208618164, + "step": 470 + }, + { + "epoch": 1.8005745750538664, + "grad_norm": 0.2804489284198292, + "learning_rate": 4.856497903099167e-07, + "logits/chosen": -0.5101760625839233, + "logits/rejected": -0.5303949117660522, + "logps/chosen": -1078.23876953125, + "logps/rejected": -1165.059326171875, + "loss": 0.0205, + "num_input_tokens_seen": 155091648, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.064233303070068, + "rewards/margins": 7.12818717956543, + "rewards/rejected": -12.192420959472656, + "step": 471 + }, + { + "epoch": 1.804405075412976, + "grad_norm": 0.3197776161572269, + "learning_rate": 4.6693959645806143e-07, + "logits/chosen": -0.5787489414215088, + "logits/rejected": -0.5966456532478333, + "logps/chosen": -1075.5628662109375, + "logps/rejected": -1113.416259765625, + "loss": 0.0211, + "num_input_tokens_seen": 155418336, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.318225860595703, + "rewards/margins": 6.6135945320129395, + "rewards/rejected": -11.9318208694458, + "step": 472 + }, + { + "epoch": 1.8082355757720854, + "grad_norm": 0.3189664268268836, + "learning_rate": 4.4858831694229334e-07, + "logits/chosen": -0.5821654796600342, + "logits/rejected": -0.5808714628219604, + "logps/chosen": -1076.416748046875, + "logps/rejected": -1138.97802734375, + "loss": 0.0218, + "num_input_tokens_seen": 155737088, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.915964126586914, + "rewards/margins": 6.923944473266602, + "rewards/rejected": -11.839908599853516, + "step": 473 + }, + { + "epoch": 1.8120660761311946, + "grad_norm": 0.5042828466166709, + "learning_rate": 4.305966426779118e-07, + "logits/chosen": -0.546190619468689, + "logits/rejected": -0.5510247945785522, + "logps/chosen": -1112.429443359375, + "logps/rejected": -1166.0345458984375, + "loss": 0.0385, + "num_input_tokens_seen": 156064512, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.297481060028076, + "rewards/margins": 6.511920928955078, + "rewards/rejected": -11.809402465820312, + "step": 474 + }, + { + "epoch": 1.815896576490304, + "grad_norm": 0.39676766532588814, + "learning_rate": 4.1296525104128513e-07, + "logits/chosen": -0.5063787698745728, + "logits/rejected": -0.5123206377029419, + "logps/chosen": -1094.56103515625, + "logps/rejected": -1154.891845703125, + "loss": 0.058, + "num_input_tokens_seen": 156397152, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.998780250549316, + "rewards/margins": 6.682381629943848, + "rewards/rejected": -11.681161880493164, + "step": 475 + }, + { + "epoch": 1.8197270768494134, + "grad_norm": 0.6500689796049444, + "learning_rate": 3.9569480584434217e-07, + "logits/chosen": -0.6066817045211792, + "logits/rejected": -0.5958479046821594, + "logps/chosen": -1119.053466796875, + "logps/rejected": -1164.248291015625, + "loss": 0.032, + "num_input_tokens_seen": 156730688, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.058269023895264, + "rewards/margins": 6.810879707336426, + "rewards/rejected": -11.869148254394531, + "step": 476 + }, + { + "epoch": 1.8235575772085229, + "grad_norm": 0.4639961638405062, + "learning_rate": 3.787859573095853e-07, + "logits/chosen": -0.5423205494880676, + "logits/rejected": -0.5485752820968628, + "logps/chosen": -1100.522705078125, + "logps/rejected": -1136.4788818359375, + "loss": 0.0376, + "num_input_tokens_seen": 157051776, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.944726943969727, + "rewards/margins": 6.78798770904541, + "rewards/rejected": -11.732715606689453, + "step": 477 + }, + { + "epoch": 1.8273880775676323, + "grad_norm": 0.48304362889946373, + "learning_rate": 3.6223934204560165e-07, + "logits/chosen": -0.5817406177520752, + "logits/rejected": -0.570923388004303, + "logps/chosen": -1119.2373046875, + "logps/rejected": -1202.935302734375, + "loss": 0.0275, + "num_input_tokens_seen": 157388704, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.299049377441406, + "rewards/margins": 6.9755353927612305, + "rewards/rejected": -12.27458381652832, + "step": 478 + }, + { + "epoch": 1.8312185779267418, + "grad_norm": 0.23121164369970276, + "learning_rate": 3.4605558302310715e-07, + "logits/chosen": -0.5674431920051575, + "logits/rejected": -0.566830039024353, + "logps/chosen": -1123.62841796875, + "logps/rejected": -1184.93212890625, + "loss": 0.019, + "num_input_tokens_seen": 157725120, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.323459148406982, + "rewards/margins": 6.570496559143066, + "rewards/rejected": -11.89395523071289, + "step": 479 + }, + { + "epoch": 1.835049078285851, + "grad_norm": 0.507883364805033, + "learning_rate": 3.302352895514793e-07, + "logits/chosen": -0.6002224087715149, + "logits/rejected": -0.5940362215042114, + "logps/chosen": -1082.4443359375, + "logps/rejected": -1150.4942626953125, + "loss": 0.0435, + "num_input_tokens_seen": 158054944, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.020196914672852, + "rewards/margins": 6.936100482940674, + "rewards/rejected": -11.956296920776367, + "step": 480 + }, + { + "epoch": 1.8388795786449605, + "grad_norm": 0.3061052270101948, + "learning_rate": 3.147790572558262e-07, + "logits/chosen": -0.5784134864807129, + "logits/rejected": -0.5945654511451721, + "logps/chosen": -1103.856201171875, + "logps/rejected": -1182.0810546875, + "loss": 0.0168, + "num_input_tokens_seen": 158379936, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.698309421539307, + "rewards/margins": 7.306283950805664, + "rewards/rejected": -12.004592895507812, + "step": 481 + }, + { + "epoch": 1.8427100790040698, + "grad_norm": 0.2750625777788953, + "learning_rate": 2.996874680545603e-07, + "logits/chosen": -0.5575003623962402, + "logits/rejected": -0.5689373016357422, + "logps/chosen": -1084.29150390625, + "logps/rejected": -1110.9722900390625, + "loss": 0.0216, + "num_input_tokens_seen": 158694752, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.452197551727295, + "rewards/margins": 6.236988067626953, + "rewards/rejected": -11.68918514251709, + "step": 482 + }, + { + "epoch": 1.8465405793631793, + "grad_norm": 0.18294170648072644, + "learning_rate": 2.849610901374822e-07, + "logits/chosen": -0.5478576421737671, + "logits/rejected": -0.5681541562080383, + "logps/chosen": -1098.531982421875, + "logps/rejected": -1154.928466796875, + "loss": 0.0136, + "num_input_tokens_seen": 159033856, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.846682071685791, + "rewards/margins": 7.121401786804199, + "rewards/rejected": -11.968084335327148, + "step": 483 + }, + { + "epoch": 1.8503710797222888, + "grad_norm": 0.2064769067380975, + "learning_rate": 2.7060047794439937e-07, + "logits/chosen": -0.5644625425338745, + "logits/rejected": -0.5791463255882263, + "logps/chosen": -1081.317626953125, + "logps/rejected": -1141.6392822265625, + "loss": 0.0147, + "num_input_tokens_seen": 159362944, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.960305213928223, + "rewards/margins": 6.766103744506836, + "rewards/rejected": -11.726408958435059, + "step": 484 + }, + { + "epoch": 1.8542015800813982, + "grad_norm": 0.34607653141918, + "learning_rate": 2.5660617214424146e-07, + "logits/chosen": -0.5761258602142334, + "logits/rejected": -0.5884686708450317, + "logps/chosen": -1113.8516845703125, + "logps/rejected": -1164.2545166015625, + "loss": 0.0225, + "num_input_tokens_seen": 159694496, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.735446929931641, + "rewards/margins": 6.753626823425293, + "rewards/rejected": -11.489073753356934, + "step": 485 + }, + { + "epoch": 1.8580320804405075, + "grad_norm": 0.5377871355379038, + "learning_rate": 2.4297869961471544e-07, + "logits/chosen": -0.4664217233657837, + "logits/rejected": -0.49878525733947754, + "logps/chosen": -1080.5186767578125, + "logps/rejected": -1142.665283203125, + "loss": 0.039, + "num_input_tokens_seen": 160038496, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.523893356323242, + "rewards/margins": 6.439659118652344, + "rewards/rejected": -11.963552474975586, + "step": 486 + }, + { + "epoch": 1.861862580799617, + "grad_norm": 0.44727583236303325, + "learning_rate": 2.2971857342245607e-07, + "logits/chosen": -0.5878583788871765, + "logits/rejected": -0.6006830334663391, + "logps/chosen": -1130.0882568359375, + "logps/rejected": -1185.45166015625, + "loss": 0.0283, + "num_input_tokens_seen": 160378176, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.976678371429443, + "rewards/margins": 7.090885162353516, + "rewards/rejected": -12.067564010620117, + "step": 487 + }, + { + "epoch": 1.8656930811587262, + "grad_norm": 0.3258026092752773, + "learning_rate": 2.168262928037246e-07, + "logits/chosen": -0.6091374158859253, + "logits/rejected": -0.6080597639083862, + "logps/chosen": -1093.76513671875, + "logps/rejected": -1152.98095703125, + "loss": 0.0196, + "num_input_tokens_seen": 160710144, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.350541114807129, + "rewards/margins": 6.890563011169434, + "rewards/rejected": -12.241104125976562, + "step": 488 + }, + { + "epoch": 1.8695235815178357, + "grad_norm": 0.3195992529249187, + "learning_rate": 2.0430234314559482e-07, + "logits/chosen": -0.5493265390396118, + "logits/rejected": -0.5563054084777832, + "logps/chosen": -1104.7052001953125, + "logps/rejected": -1167.94970703125, + "loss": 0.0245, + "num_input_tokens_seen": 161039680, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.804253578186035, + "rewards/margins": 6.66768741607666, + "rewards/rejected": -11.471940994262695, + "step": 489 + }, + { + "epoch": 1.8733540818769452, + "grad_norm": 0.5492168142028014, + "learning_rate": 1.921471959676957e-07, + "logits/chosen": -0.5748569965362549, + "logits/rejected": -0.588194727897644, + "logps/chosen": -1106.531494140625, + "logps/rejected": -1168.142333984375, + "loss": 0.0382, + "num_input_tokens_seen": 161366240, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.08213996887207, + "rewards/margins": 6.665508270263672, + "rewards/rejected": -11.747648239135742, + "step": 490 + }, + { + "epoch": 1.8771845822360547, + "grad_norm": 0.3716796121969407, + "learning_rate": 1.8036130890444758e-07, + "logits/chosen": -0.5888176560401917, + "logits/rejected": -0.6164166927337646, + "logps/chosen": -1079.544921875, + "logps/rejected": -1150.6966552734375, + "loss": 0.0211, + "num_input_tokens_seen": 161682880, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.064846038818359, + "rewards/margins": 7.294391632080078, + "rewards/rejected": -12.359237670898438, + "step": 491 + }, + { + "epoch": 1.8810150825951641, + "grad_norm": 0.22240240172785558, + "learning_rate": 1.6894512568783717e-07, + "logits/chosen": -0.5541859269142151, + "logits/rejected": -0.5638035535812378, + "logps/chosen": -1077.4581298828125, + "logps/rejected": -1124.5787353515625, + "loss": 0.0162, + "num_input_tokens_seen": 162003520, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.330525875091553, + "rewards/margins": 6.368184566497803, + "rewards/rejected": -11.698710441589355, + "step": 492 + }, + { + "epoch": 1.8848455829542734, + "grad_norm": 0.594545837894161, + "learning_rate": 1.5789907613070977e-07, + "logits/chosen": -0.5701717138290405, + "logits/rejected": -0.561404824256897, + "logps/chosen": -1086.4013671875, + "logps/rejected": -1153.95458984375, + "loss": 0.0506, + "num_input_tokens_seen": 162340448, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.965949058532715, + "rewards/margins": 6.700102806091309, + "rewards/rejected": -11.666050910949707, + "step": 493 + }, + { + "epoch": 1.8886760833133827, + "grad_norm": 0.1996995910906416, + "learning_rate": 1.472235761105878e-07, + "logits/chosen": -0.566815972328186, + "logits/rejected": -0.5813367962837219, + "logps/chosen": -1062.567626953125, + "logps/rejected": -1133.778564453125, + "loss": 0.0138, + "num_input_tokens_seen": 162661088, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.35923957824707, + "rewards/margins": 6.970911026000977, + "rewards/rejected": -12.330151557922363, + "step": 494 + }, + { + "epoch": 1.8925065836724921, + "grad_norm": 0.4014631934817975, + "learning_rate": 1.3691902755401442e-07, + "logits/chosen": -0.5628057718276978, + "logits/rejected": -0.5921974182128906, + "logps/chosen": -1109.745361328125, + "logps/rejected": -1170.15478515625, + "loss": 0.0203, + "num_input_tokens_seen": 162992512, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.222186088562012, + "rewards/margins": 6.9820876121521, + "rewards/rejected": -12.204275131225586, + "step": 495 + }, + { + "epoch": 1.8963370840316016, + "grad_norm": 0.3066776616309968, + "learning_rate": 1.2698581842141567e-07, + "logits/chosen": -0.5842748880386353, + "logits/rejected": -0.6082962155342102, + "logps/chosen": -1132.0692138671875, + "logps/rejected": -1208.8475341796875, + "loss": 0.0212, + "num_input_tokens_seen": 163324864, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.08241081237793, + "rewards/margins": 6.98268461227417, + "rewards/rejected": -12.065094947814941, + "step": 496 + }, + { + "epoch": 1.900167584390711, + "grad_norm": 0.29309526459768737, + "learning_rate": 1.1742432269250536e-07, + "logits/chosen": -0.5320190191268921, + "logits/rejected": -0.5167162418365479, + "logps/chosen": -1086.14453125, + "logps/rejected": -1164.0419921875, + "loss": 0.018, + "num_input_tokens_seen": 163662144, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.776819229125977, + "rewards/margins": 6.601822853088379, + "rewards/rejected": -11.378641128540039, + "step": 497 + }, + { + "epoch": 1.9039980847498206, + "grad_norm": 0.19015482420877416, + "learning_rate": 1.0823490035218986e-07, + "logits/chosen": -0.544445812702179, + "logits/rejected": -0.5502886772155762, + "logps/chosen": -1091.1800537109375, + "logps/rejected": -1188.2353515625, + "loss": 0.0114, + "num_input_tokens_seen": 163988064, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.649686813354492, + "rewards/margins": 7.545851230621338, + "rewards/rejected": -12.195537567138672, + "step": 498 + }, + { + "epoch": 1.9078285851089298, + "grad_norm": 0.26330546626714024, + "learning_rate": 9.94178973770299e-08, + "logits/chosen": -0.573994517326355, + "logits/rejected": -0.6186249256134033, + "logps/chosen": -1106.7874755859375, + "logps/rejected": -1160.6258544921875, + "loss": 0.0214, + "num_input_tokens_seen": 164331552, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.751733779907227, + "rewards/margins": 6.745034694671631, + "rewards/rejected": -11.496768951416016, + "step": 499 + }, + { + "epoch": 1.9116590854680393, + "grad_norm": 0.446090819996274, + "learning_rate": 9.09736457221999e-08, + "logits/chosen": -0.5141263008117676, + "logits/rejected": -0.5171107649803162, + "logps/chosen": -1088.882080078125, + "logps/rejected": -1149.7489013671875, + "loss": 0.0408, + "num_input_tokens_seen": 164657472, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.317117214202881, + "rewards/margins": 6.2619829177856445, + "rewards/rejected": -11.579099655151367, + "step": 500 + }, + { + "epoch": 1.9154895858271486, + "grad_norm": 0.49820098330715146, + "learning_rate": 8.290246330900475e-08, + "logits/chosen": -0.5549517869949341, + "logits/rejected": -0.5542320013046265, + "logps/chosen": -1052.072021484375, + "logps/rejected": -1117.225830078125, + "loss": 0.0288, + "num_input_tokens_seen": 164976896, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.412148952484131, + "rewards/margins": 6.537993431091309, + "rewards/rejected": -11.950142860412598, + "step": 501 + }, + { + "epoch": 1.919320086186258, + "grad_norm": 0.4798305203257399, + "learning_rate": 7.520465401290033e-08, + "logits/chosen": -0.5905622243881226, + "logits/rejected": -0.5992444753646851, + "logps/chosen": -1112.00732421875, + "logps/rejected": -1172.260986328125, + "loss": 0.0381, + "num_input_tokens_seen": 165307264, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.97149658203125, + "rewards/margins": 6.620386123657227, + "rewards/rejected": -11.591882705688477, + "step": 502 + }, + { + "epoch": 1.9231505865453675, + "grad_norm": 0.2290608346599347, + "learning_rate": 6.788050765205501e-08, + "logits/chosen": -0.509782612323761, + "logits/rejected": -0.5324023962020874, + "logps/chosen": -1137.250244140625, + "logps/rejected": -1196.82421875, + "loss": 0.0199, + "num_input_tokens_seen": 165644416, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.699871063232422, + "rewards/margins": 6.657888412475586, + "rewards/rejected": -11.357759475708008, + "step": 503 + }, + { + "epoch": 1.926981086904477, + "grad_norm": 0.4865026527688004, + "learning_rate": 6.09302999764394e-08, + "logits/chosen": -0.6649181246757507, + "logits/rejected": -0.6522095203399658, + "logps/chosen": -1137.3560791015625, + "logps/rejected": -1193.30712890625, + "loss": 0.0407, + "num_input_tokens_seen": 165977312, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -5.3496928215026855, + "rewards/margins": 7.056132793426514, + "rewards/rejected": -12.4058256149292, + "step": 504 + }, + { + "epoch": 1.9308115872635863, + "grad_norm": 0.2647650262687232, + "learning_rate": 5.435429265744585e-08, + "logits/chosen": -0.5825361609458923, + "logits/rejected": -0.6007415056228638, + "logps/chosen": -1144.543212890625, + "logps/rejected": -1193.9190673828125, + "loss": 0.013, + "num_input_tokens_seen": 166309056, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.880622863769531, + "rewards/margins": 6.4802045822143555, + "rewards/rejected": -11.360828399658203, + "step": 505 + }, + { + "epoch": 1.9346420876226957, + "grad_norm": 0.46778074261715, + "learning_rate": 4.815273327803183e-08, + "logits/chosen": -0.522498369216919, + "logits/rejected": -0.5168355703353882, + "logps/chosen": -1086.7999267578125, + "logps/rejected": -1135.597900390625, + "loss": 0.0373, + "num_input_tokens_seen": 166639520, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.356372833251953, + "rewards/margins": 6.602511405944824, + "rewards/rejected": -11.958884239196777, + "step": 506 + }, + { + "epoch": 1.938472587981805, + "grad_norm": 0.2555736439687232, + "learning_rate": 4.232585532340183e-08, + "logits/chosen": -0.5908759832382202, + "logits/rejected": -0.607272744178772, + "logps/chosen": -1123.119140625, + "logps/rejected": -1181.783203125, + "loss": 0.0196, + "num_input_tokens_seen": 166975296, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.191551208496094, + "rewards/margins": 6.777198314666748, + "rewards/rejected": -11.96875, + "step": 507 + }, + { + "epoch": 1.9423030883409145, + "grad_norm": 0.5550711238198239, + "learning_rate": 3.687387817221999e-08, + "logits/chosen": -0.5406380891799927, + "logits/rejected": -0.5444807410240173, + "logps/chosen": -1098.705810546875, + "logps/rejected": -1139.139892578125, + "loss": 0.0556, + "num_input_tokens_seen": 167301664, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.962944984436035, + "rewards/margins": 6.445636749267578, + "rewards/rejected": -11.40858268737793, + "step": 508 + }, + { + "epoch": 1.946133588700024, + "grad_norm": 0.3942935603792195, + "learning_rate": 3.179700708834332e-08, + "logits/chosen": -0.5868821740150452, + "logits/rejected": -0.5931185483932495, + "logps/chosen": -1135.7606201171875, + "logps/rejected": -1189.2559814453125, + "loss": 0.0157, + "num_input_tokens_seen": 167634208, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.011362075805664, + "rewards/margins": 6.834611415863037, + "rewards/rejected": -11.84597396850586, + "step": 509 + }, + { + "epoch": 1.9499640890591334, + "grad_norm": 0.24633091568599824, + "learning_rate": 2.7095433213097933e-08, + "logits/chosen": -0.5268473625183105, + "logits/rejected": -0.5497424602508545, + "logps/chosen": -1133.066162109375, + "logps/rejected": -1197.1282958984375, + "loss": 0.0187, + "num_input_tokens_seen": 167967168, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.6928277015686035, + "rewards/margins": 6.889833450317383, + "rewards/rejected": -11.582660675048828, + "step": 510 + }, + { + "epoch": 1.953794589418243, + "grad_norm": 0.4087397977247375, + "learning_rate": 2.276933355808364e-08, + "logits/chosen": -0.5248457789421082, + "logits/rejected": -0.5135776400566101, + "logps/chosen": -1078.596435546875, + "logps/rejected": -1106.1142578125, + "loss": 0.0442, + "num_input_tokens_seen": 168302656, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.285521507263184, + "rewards/margins": 6.527587890625, + "rewards/rejected": -11.8131103515625, + "step": 511 + }, + { + "epoch": 1.9576250897773522, + "grad_norm": 0.5280995569362179, + "learning_rate": 1.881887099850821e-08, + "logits/chosen": -0.4920487701892853, + "logits/rejected": -0.49276888370513916, + "logps/chosen": -1070.3243408203125, + "logps/rejected": -1131.8331298828125, + "loss": 0.0336, + "num_input_tokens_seen": 168639648, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -5.160616874694824, + "rewards/margins": 6.538956642150879, + "rewards/rejected": -11.69957447052002, + "step": 512 + }, + { + "epoch": 1.9614555901364614, + "grad_norm": 0.2699567692819676, + "learning_rate": 1.524419426705226e-08, + "logits/chosen": -0.5664230585098267, + "logits/rejected": -0.6074328422546387, + "logps/chosen": -1103.9208984375, + "logps/rejected": -1132.8095703125, + "loss": 0.0215, + "num_input_tokens_seen": 168967520, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.087352752685547, + "rewards/margins": 6.490187168121338, + "rewards/rejected": -11.577539443969727, + "step": 513 + }, + { + "epoch": 1.965286090495571, + "grad_norm": 0.480735500363299, + "learning_rate": 1.2045437948275952e-08, + "logits/chosen": -0.5805327892303467, + "logits/rejected": -0.5914294719696045, + "logps/chosen": -1101.74169921875, + "logps/rejected": -1167.2808837890625, + "loss": 0.0333, + "num_input_tokens_seen": 169297856, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.391973495483398, + "rewards/margins": 6.908017158508301, + "rewards/rejected": -12.2999906539917, + "step": 514 + }, + { + "epoch": 1.9691165908546804, + "grad_norm": 0.5287270712935227, + "learning_rate": 9.222722473546385e-09, + "logits/chosen": -0.5746007561683655, + "logits/rejected": -0.6011930704116821, + "logps/chosen": -1058.7333984375, + "logps/rejected": -1110.0943603515625, + "loss": 0.0483, + "num_input_tokens_seen": 169618464, + "rewards/accuracies": 0.984375, + "rewards/chosen": -5.310314178466797, + "rewards/margins": 6.451066970825195, + "rewards/rejected": -11.761381149291992, + "step": 515 + }, + { + "epoch": 1.9729470912137899, + "grad_norm": 0.5260099676865022, + "learning_rate": 6.776154116504563e-09, + "logits/chosen": -0.605218768119812, + "logits/rejected": -0.5927131175994873, + "logps/chosen": -1122.588623046875, + "logps/rejected": -1180.889892578125, + "loss": 0.0467, + "num_input_tokens_seen": 169949120, + "rewards/accuracies": 0.984375, + "rewards/chosen": -4.61450719833374, + "rewards/margins": 7.21722412109375, + "rewards/rejected": -11.831730842590332, + "step": 516 + }, + { + "epoch": 1.9767775915728993, + "grad_norm": 0.34293149540401135, + "learning_rate": 4.705824989068575e-09, + "logits/chosen": -0.5874618291854858, + "logits/rejected": -0.6010653972625732, + "logps/chosen": -1132.280029296875, + "logps/rejected": -1205.7244873046875, + "loss": 0.0224, + "num_input_tokens_seen": 170304320, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.322883605957031, + "rewards/margins": 7.377194404602051, + "rewards/rejected": -11.700077056884766, + "step": 517 + }, + { + "epoch": 1.9806080919320086, + "grad_norm": 0.45671463514409116, + "learning_rate": 3.0118130379575005e-09, + "logits/chosen": -0.5985814929008484, + "logits/rejected": -0.5846527218818665, + "logps/chosen": -1130.593017578125, + "logps/rejected": -1187.200439453125, + "loss": 0.0367, + "num_input_tokens_seen": 170634080, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -5.007251739501953, + "rewards/margins": 6.881740093231201, + "rewards/rejected": -11.888991355895996, + "step": 518 + }, + { + "epoch": 1.984438592291118, + "grad_norm": 0.19452903539752295, + "learning_rate": 1.6941820417659682e-09, + "logits/chosen": -0.564356803894043, + "logits/rejected": -0.5611583590507507, + "logps/chosen": -1111.96142578125, + "logps/rejected": -1154.638671875, + "loss": 0.0147, + "num_input_tokens_seen": 170959328, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.935775279998779, + "rewards/margins": 6.6817450523376465, + "rewards/rejected": -11.617520332336426, + "step": 519 + }, + { + "epoch": 1.9882690926502273, + "grad_norm": 0.3712582623259577, + "learning_rate": 7.529816085549701e-10, + "logits/chosen": -0.5205713510513306, + "logits/rejected": -0.5405097603797913, + "logps/chosen": -1116.733642578125, + "logps/rejected": -1164.6763916015625, + "loss": 0.025, + "num_input_tokens_seen": 171291360, + "rewards/accuracies": 0.9921875, + "rewards/chosen": -4.830142021179199, + "rewards/margins": 6.903097152709961, + "rewards/rejected": -11.733240127563477, + "step": 520 + }, + { + "epoch": 1.9920995930093368, + "grad_norm": 0.20428212615806787, + "learning_rate": 1.8824717398890912e-10, + "logits/chosen": -0.5101537704467773, + "logits/rejected": -0.4972701966762543, + "logps/chosen": -1036.364013671875, + "logps/rejected": -1105.681884765625, + "loss": 0.0177, + "num_input_tokens_seen": 171601152, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.316509246826172, + "rewards/margins": 6.576091289520264, + "rewards/rejected": -11.892601013183594, + "step": 521 + }, + { + "epoch": 1.9959300933684463, + "grad_norm": 0.3030289067974063, + "learning_rate": 0.0, + "logits/chosen": -0.5760252475738525, + "logits/rejected": -0.5729879140853882, + "logps/chosen": -1107.64453125, + "logps/rejected": -1175.9635009765625, + "loss": 0.0211, + "num_input_tokens_seen": 171931200, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.959815979003906, + "rewards/margins": 6.926852226257324, + "rewards/rejected": -11.88666820526123, + "step": 522 + }, + { + "epoch": 1.9959300933684463, + "num_input_tokens_seen": 171931200, + "step": 522, + "total_flos": 140045856276480.0, + "train_loss": 0.10187807008338615, + "train_runtime": 26262.2511, + "train_samples_per_second": 2.544, + "train_steps_per_second": 0.02 + } + ], + "logging_steps": 1.0, + "max_steps": 522, + "num_input_tokens_seen": 171931200, + "num_train_epochs": 2, + "save_steps": 260, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 140045856276480.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}