{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9880609304240429, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013174145738987238, "grad_norm": 0.5080101490020752, "learning_rate": 1.25e-07, "logits/chosen": 10.32492733001709, "logits/rejected": 10.282785415649414, "logps/chosen": -1.0583709478378296, "logps/rejected": -1.1253927946090698, "loss": 1.3887, "rewards/accuracies": 0.5390625, "rewards/chosen": -2.116741895675659, "rewards/margins": 0.1340436041355133, "rewards/rejected": -2.2507855892181396, "step": 1 }, { "epoch": 0.026348291477974475, "grad_norm": 0.5300341844558716, "learning_rate": 2.5e-07, "logits/chosen": 10.530074119567871, "logits/rejected": 10.672085762023926, "logps/chosen": -1.1093882322311401, "logps/rejected": -1.2041704654693604, "loss": 1.3086, "rewards/accuracies": 0.6171875, "rewards/chosen": -2.2187764644622803, "rewards/margins": 0.18956437706947327, "rewards/rejected": -2.4083409309387207, "step": 2 }, { "epoch": 0.03952243721696171, "grad_norm": 1.1035377979278564, "learning_rate": 3.75e-07, "logits/chosen": 10.240287780761719, "logits/rejected": 10.40180492401123, "logps/chosen": -1.1902998685836792, "logps/rejected": -1.311903715133667, "loss": 1.3245, "rewards/accuracies": 0.6484375, "rewards/chosen": -2.3805997371673584, "rewards/margins": 0.24320749938488007, "rewards/rejected": -2.623807430267334, "step": 3 }, { "epoch": 0.05269658295594895, "grad_norm": 1.8411834239959717, "learning_rate": 5e-07, "logits/chosen": 10.358713150024414, "logits/rejected": 10.405998229980469, "logps/chosen": -1.0821034908294678, "logps/rejected": -1.1429979801177979, "loss": 1.3768, "rewards/accuracies": 0.578125, "rewards/chosen": -2.1642069816589355, "rewards/margins": 0.12178920209407806, "rewards/rejected": -2.2859959602355957, "step": 4 }, { "epoch": 0.06587072869493618, "grad_norm": 0.5591928362846375, "learning_rate": 6.249999999999999e-07, "logits/chosen": 10.748247146606445, "logits/rejected": 10.783531188964844, "logps/chosen": -1.1161627769470215, "logps/rejected": -1.1165733337402344, "loss": 1.4573, "rewards/accuracies": 0.5859375, "rewards/chosen": -2.232325553894043, "rewards/margins": 0.0008210651576519012, "rewards/rejected": -2.2331466674804688, "step": 5 }, { "epoch": 0.07904487443392343, "grad_norm": 1.0748578310012817, "learning_rate": 7.5e-07, "logits/chosen": 10.895101547241211, "logits/rejected": 10.935002326965332, "logps/chosen": -1.051721453666687, "logps/rejected": -1.1617565155029297, "loss": 1.3341, "rewards/accuracies": 0.609375, "rewards/chosen": -2.103442907333374, "rewards/margins": 0.22007031738758087, "rewards/rejected": -2.3235130310058594, "step": 6 }, { "epoch": 0.09221902017291066, "grad_norm": 1.2474150657653809, "learning_rate": 8.75e-07, "logits/chosen": 10.060023307800293, "logits/rejected": 10.135562896728516, "logps/chosen": -1.0141205787658691, "logps/rejected": -1.1586568355560303, "loss": 1.3246, "rewards/accuracies": 0.6328125, "rewards/chosen": -2.0282411575317383, "rewards/margins": 0.28907278180122375, "rewards/rejected": -2.3173136711120605, "step": 7 }, { "epoch": 0.1053931659118979, "grad_norm": 1.7001811265945435, "learning_rate": 1e-06, "logits/chosen": 10.837862014770508, "logits/rejected": 10.750317573547363, "logps/chosen": -1.0156888961791992, "logps/rejected": -1.1156163215637207, "loss": 1.314, "rewards/accuracies": 0.671875, "rewards/chosen": -2.0313777923583984, "rewards/margins": 0.19985482096672058, "rewards/rejected": -2.2312326431274414, "step": 8 }, { "epoch": 0.11856731165088513, "grad_norm": 1.0888780355453491, "learning_rate": 9.994504457428556e-07, "logits/chosen": 10.434420585632324, "logits/rejected": 10.604791641235352, "logps/chosen": -1.1994280815124512, "logps/rejected": -1.1771385669708252, "loss": 1.4926, "rewards/accuracies": 0.5234375, "rewards/chosen": -2.3988561630249023, "rewards/margins": -0.04457877576351166, "rewards/rejected": -2.3542771339416504, "step": 9 }, { "epoch": 0.13174145738987236, "grad_norm": 0.9426828622817993, "learning_rate": 9.97802991010949e-07, "logits/chosen": 10.330013275146484, "logits/rejected": 10.423635482788086, "logps/chosen": -1.1276183128356934, "logps/rejected": -1.1535909175872803, "loss": 1.4741, "rewards/accuracies": 0.5625, "rewards/chosen": -2.2552366256713867, "rewards/margins": 0.05194506794214249, "rewards/rejected": -2.3071818351745605, "step": 10 }, { "epoch": 0.14491560312885962, "grad_norm": 1.174481987953186, "learning_rate": 9.950612572673255e-07, "logits/chosen": 10.557825088500977, "logits/rejected": 10.500927925109863, "logps/chosen": -1.2195600271224976, "logps/rejected": -1.3650047779083252, "loss": 1.3192, "rewards/accuracies": 0.640625, "rewards/chosen": -2.439120054244995, "rewards/margins": 0.2908894717693329, "rewards/rejected": -2.7300095558166504, "step": 11 }, { "epoch": 0.15808974886784685, "grad_norm": 0.9938077330589294, "learning_rate": 9.912312714377879e-07, "logits/chosen": 10.221735000610352, "logits/rejected": 10.339948654174805, "logps/chosen": -1.072077989578247, "logps/rejected": -1.114700198173523, "loss": 1.3846, "rewards/accuracies": 0.6015625, "rewards/chosen": -2.144155979156494, "rewards/margins": 0.08524461090564728, "rewards/rejected": -2.229400396347046, "step": 12 }, { "epoch": 0.17126389460683408, "grad_norm": 0.9858765006065369, "learning_rate": 9.863214526624063e-07, "logits/chosen": 10.290239334106445, "logits/rejected": 10.385334014892578, "logps/chosen": -1.1751586198806763, "logps/rejected": -1.3674362897872925, "loss": 1.2879, "rewards/accuracies": 0.6015625, "rewards/chosen": -2.3503172397613525, "rewards/margins": 0.38455531001091003, "rewards/rejected": -2.734872579574585, "step": 13 }, { "epoch": 0.1844380403458213, "grad_norm": 1.6941860914230347, "learning_rate": 9.8034259378842e-07, "logits/chosen": 10.756081581115723, "logits/rejected": 10.947944641113281, "logps/chosen": -1.0922513008117676, "logps/rejected": -1.1827645301818848, "loss": 1.3465, "rewards/accuracies": 0.5859375, "rewards/chosen": -2.184502601623535, "rewards/margins": 0.18102645874023438, "rewards/rejected": -2.3655290603637695, "step": 14 }, { "epoch": 0.19761218608480857, "grad_norm": 0.33364373445510864, "learning_rate": 9.73307837645217e-07, "logits/chosen": 10.299134254455566, "logits/rejected": 10.412195205688477, "logps/chosen": -1.1260679960250854, "logps/rejected": -1.2360444068908691, "loss": 1.3164, "rewards/accuracies": 0.609375, "rewards/chosen": -2.252135992050171, "rewards/margins": 0.219952791929245, "rewards/rejected": -2.4720888137817383, "step": 15 }, { "epoch": 0.2107863318237958, "grad_norm": 0.7868668437004089, "learning_rate": 9.652326481535433e-07, "logits/chosen": 10.79407024383545, "logits/rejected": 10.986239433288574, "logps/chosen": -1.0521718263626099, "logps/rejected": -1.0945489406585693, "loss": 1.415, "rewards/accuracies": 0.53125, "rewards/chosen": -2.1043436527252197, "rewards/margins": 0.08475431054830551, "rewards/rejected": -2.1890978813171387, "step": 16 }, { "epoch": 0.22396047756278303, "grad_norm": 0.6708123683929443, "learning_rate": 9.561347763324483e-07, "logits/chosen": 10.472009658813477, "logits/rejected": 10.457694053649902, "logps/chosen": -1.0896780490875244, "logps/rejected": -1.1595150232315063, "loss": 1.3736, "rewards/accuracies": 0.5625, "rewards/chosen": -2.179356098175049, "rewards/margins": 0.13967394828796387, "rewards/rejected": -2.3190300464630127, "step": 17 }, { "epoch": 0.23713462330177026, "grad_norm": 0.9609116911888123, "learning_rate": 9.460342212786932e-07, "logits/chosen": 10.532232284545898, "logits/rejected": 10.626241683959961, "logps/chosen": -1.1494747400283813, "logps/rejected": -1.1549882888793945, "loss": 1.4623, "rewards/accuracies": 0.6171875, "rewards/chosen": -2.2989494800567627, "rewards/margins": 0.011027364060282707, "rewards/rejected": -2.309976577758789, "step": 18 }, { "epoch": 0.2503087690407575, "grad_norm": 0.49065420031547546, "learning_rate": 9.349531862043951e-07, "logits/chosen": 10.552773475646973, "logits/rejected": 10.44497013092041, "logps/chosen": -1.123961329460144, "logps/rejected": -1.2145094871520996, "loss": 1.3068, "rewards/accuracies": 0.671875, "rewards/chosen": -2.247922658920288, "rewards/margins": 0.18109644949436188, "rewards/rejected": -2.429018974304199, "step": 19 }, { "epoch": 0.2634829147797447, "grad_norm": 1.4662220478057861, "learning_rate": 9.229160296295487e-07, "logits/chosen": 10.48397445678711, "logits/rejected": 10.557746887207031, "logps/chosen": -1.0979560613632202, "logps/rejected": -1.1902183294296265, "loss": 1.355, "rewards/accuracies": 0.625, "rewards/chosen": -2.1959121227264404, "rewards/margins": 0.1845243275165558, "rewards/rejected": -2.380436658859253, "step": 20 }, { "epoch": 0.276657060518732, "grad_norm": 1.0163906812667847, "learning_rate": 9.099492118367122e-07, "logits/chosen": 10.452715873718262, "logits/rejected": 10.512972831726074, "logps/chosen": -1.0967323780059814, "logps/rejected": -1.2304078340530396, "loss": 1.2888, "rewards/accuracies": 0.625, "rewards/chosen": -2.193464756011963, "rewards/margins": 0.26735079288482666, "rewards/rejected": -2.460815668106079, "step": 21 }, { "epoch": 0.28983120625771924, "grad_norm": 3.908339738845825, "learning_rate": 8.960812367055646e-07, "logits/chosen": 10.526410102844238, "logits/rejected": 10.603084564208984, "logps/chosen": -1.224048376083374, "logps/rejected": -1.2536935806274414, "loss": 1.4348, "rewards/accuracies": 0.578125, "rewards/chosen": -2.448096752166748, "rewards/margins": 0.0592900887131691, "rewards/rejected": -2.507387161254883, "step": 22 }, { "epoch": 0.3030053519967065, "grad_norm": 0.31968942284584045, "learning_rate": 8.813425890551909e-07, "logits/chosen": 10.193231582641602, "logits/rejected": 10.474884986877441, "logps/chosen": -1.1644542217254639, "logps/rejected": -1.1897979974746704, "loss": 1.4224, "rewards/accuracies": 0.59375, "rewards/chosen": -2.3289084434509277, "rewards/margins": 0.05068742483854294, "rewards/rejected": -2.379595994949341, "step": 23 }, { "epoch": 0.3161794977356937, "grad_norm": 1.0304734706878662, "learning_rate": 8.657656676318345e-07, "logits/chosen": 10.281815528869629, "logits/rejected": 10.21823501586914, "logps/chosen": -1.0919454097747803, "logps/rejected": -1.210888385772705, "loss": 1.325, "rewards/accuracies": 0.5703125, "rewards/chosen": -2.1838908195495605, "rewards/margins": 0.23788578808307648, "rewards/rejected": -2.42177677154541, "step": 24 }, { "epoch": 0.32935364347468093, "grad_norm": 2.0577356815338135, "learning_rate": 8.493847138894208e-07, "logits/chosen": 10.638153076171875, "logits/rejected": 10.773796081542969, "logps/chosen": -1.0134081840515137, "logps/rejected": -1.0593600273132324, "loss": 1.372, "rewards/accuracies": 0.5703125, "rewards/chosen": -2.0268163681030273, "rewards/margins": 0.09190365672111511, "rewards/rejected": -2.118720054626465, "step": 25 }, { "epoch": 0.34252778921366817, "grad_norm": 0.6051430106163025, "learning_rate": 8.322357367194108e-07, "logits/chosen": 10.498373031616211, "logits/rejected": 10.679786682128906, "logps/chosen": -1.0168992280960083, "logps/rejected": -1.0872042179107666, "loss": 1.3499, "rewards/accuracies": 0.578125, "rewards/chosen": -2.0337984561920166, "rewards/margins": 0.14060987532138824, "rewards/rejected": -2.174408435821533, "step": 26 }, { "epoch": 0.3557019349526554, "grad_norm": 1.769278645515442, "learning_rate": 8.143564332954425e-07, "logits/chosen": 10.561027526855469, "logits/rejected": 10.846573829650879, "logps/chosen": -1.114155650138855, "logps/rejected": -1.1514461040496826, "loss": 1.399, "rewards/accuracies": 0.5703125, "rewards/chosen": -2.22831130027771, "rewards/margins": 0.07458095252513885, "rewards/rejected": -2.3028922080993652, "step": 27 }, { "epoch": 0.3688760806916426, "grad_norm": 0.6603056192398071, "learning_rate": 7.957861062067612e-07, "logits/chosen": 10.477982521057129, "logits/rejected": 10.683720588684082, "logps/chosen": -1.0256245136260986, "logps/rejected": -1.1278630495071411, "loss": 1.3446, "rewards/accuracies": 0.578125, "rewards/chosen": -2.0512490272521973, "rewards/margins": 0.20447733998298645, "rewards/rejected": -2.2557260990142822, "step": 28 }, { "epoch": 0.3820502264306299, "grad_norm": 1.5685954093933105, "learning_rate": 7.765655770625996e-07, "logits/chosen": 10.337364196777344, "logits/rejected": 10.332595825195312, "logps/chosen": -1.043882966041565, "logps/rejected": -1.0915210247039795, "loss": 1.3965, "rewards/accuracies": 0.6796875, "rewards/chosen": -2.08776593208313, "rewards/margins": 0.09527605772018433, "rewards/rejected": -2.183042049407959, "step": 29 }, { "epoch": 0.39522437216961714, "grad_norm": 0.5102595686912537, "learning_rate": 7.567370967574209e-07, "logits/chosen": 10.61121654510498, "logits/rejected": 10.84305191040039, "logps/chosen": -1.0729877948760986, "logps/rejected": -1.1098031997680664, "loss": 1.4371, "rewards/accuracies": 0.578125, "rewards/chosen": -2.1459755897521973, "rewards/margins": 0.07363072782754898, "rewards/rejected": -2.219606399536133, "step": 30 }, { "epoch": 0.4083985179086044, "grad_norm": 0.8290932774543762, "learning_rate": 7.363442525942826e-07, "logits/chosen": 10.494096755981445, "logits/rejected": 10.498456954956055, "logps/chosen": -0.9797660708427429, "logps/rejected": -1.0840959548950195, "loss": 1.3044, "rewards/accuracies": 0.65625, "rewards/chosen": -1.9595321416854858, "rewards/margins": 0.2086598128080368, "rewards/rejected": -2.168191909790039, "step": 31 }, { "epoch": 0.4215726636475916, "grad_norm": 0.36728209257125854, "learning_rate": 7.154318724704851e-07, "logits/chosen": 10.502325057983398, "logits/rejected": 10.551267623901367, "logps/chosen": -1.040996789932251, "logps/rejected": -1.0938708782196045, "loss": 1.3946, "rewards/accuracies": 0.5703125, "rewards/chosen": -2.081993579864502, "rewards/margins": 0.10574813187122345, "rewards/rejected": -2.187741756439209, "step": 32 }, { "epoch": 0.43474680938657884, "grad_norm": 0.5727468132972717, "learning_rate": 6.940459263361248e-07, "logits/chosen": 10.425085067749023, "logits/rejected": 10.45986270904541, "logps/chosen": -1.0259983539581299, "logps/rejected": -1.064608097076416, "loss": 1.4179, "rewards/accuracies": 0.6015625, "rewards/chosen": -2.0519967079162598, "rewards/margins": 0.07721954584121704, "rewards/rejected": -2.129216194152832, "step": 33 }, { "epoch": 0.44792095512556607, "grad_norm": 0.8594540953636169, "learning_rate": 6.722334251421664e-07, "logits/chosen": 10.195260047912598, "logits/rejected": 10.263075828552246, "logps/chosen": -1.0975958108901978, "logps/rejected": -1.1462163925170898, "loss": 1.4471, "rewards/accuracies": 0.5703125, "rewards/chosen": -2.1951916217803955, "rewards/margins": 0.09724146127700806, "rewards/rejected": -2.2924327850341797, "step": 34 }, { "epoch": 0.4610951008645533, "grad_norm": 2.0125956535339355, "learning_rate": 6.500423175001703e-07, "logits/chosen": 10.623847007751465, "logits/rejected": 10.824085235595703, "logps/chosen": -1.1299512386322021, "logps/rejected": -1.1615726947784424, "loss": 1.4721, "rewards/accuracies": 0.578125, "rewards/chosen": -2.2599024772644043, "rewards/margins": 0.06324289739131927, "rewards/rejected": -2.3231453895568848, "step": 35 }, { "epoch": 0.47426924660354053, "grad_norm": 0.9820675253868103, "learning_rate": 6.275213842808382e-07, "logits/chosen": 10.475167274475098, "logits/rejected": 10.816570281982422, "logps/chosen": -1.0255820751190186, "logps/rejected": -1.0681825876235962, "loss": 1.3712, "rewards/accuracies": 0.578125, "rewards/chosen": -2.051164150238037, "rewards/margins": 0.08520102500915527, "rewards/rejected": -2.1363651752471924, "step": 36 }, { "epoch": 0.4874433923425278, "grad_norm": 0.5090928077697754, "learning_rate": 6.047201313830723e-07, "logits/chosen": 10.42083740234375, "logits/rejected": 10.372156143188477, "logps/chosen": -1.0987904071807861, "logps/rejected": -1.2581254243850708, "loss": 1.2585, "rewards/accuracies": 0.6640625, "rewards/chosen": -2.1975808143615723, "rewards/margins": 0.3186701834201813, "rewards/rejected": -2.5162508487701416, "step": 37 }, { "epoch": 0.500617538081515, "grad_norm": 2.139826774597168, "learning_rate": 5.816886809092651e-07, "logits/chosen": 10.55873966217041, "logits/rejected": 10.591476440429688, "logps/chosen": -1.1373264789581299, "logps/rejected": -1.1765114068984985, "loss": 1.3998, "rewards/accuracies": 0.6484375, "rewards/chosen": -2.2746529579162598, "rewards/margins": 0.07836979627609253, "rewards/rejected": -2.353022813796997, "step": 38 }, { "epoch": 0.5137916838205022, "grad_norm": 2.3105835914611816, "learning_rate": 5.584776609860413e-07, "logits/chosen": 10.445201873779297, "logits/rejected": 10.462349891662598, "logps/chosen": -1.0283745527267456, "logps/rejected": -1.0441796779632568, "loss": 1.4224, "rewards/accuracies": 0.5625, "rewards/chosen": -2.056749105453491, "rewards/margins": 0.03160998225212097, "rewards/rejected": -2.0883593559265137, "step": 39 }, { "epoch": 0.5269658295594895, "grad_norm": 0.9087686538696289, "learning_rate": 5.351380944726465e-07, "logits/chosen": 10.493326187133789, "logits/rejected": 10.636161804199219, "logps/chosen": -1.0796146392822266, "logps/rejected": -1.1792174577713013, "loss": 1.3171, "rewards/accuracies": 0.6875, "rewards/chosen": -2.159229278564453, "rewards/margins": 0.1992054581642151, "rewards/rejected": -2.3584349155426025, "step": 40 }, { "epoch": 0.5401399752984768, "grad_norm": 4.677896022796631, "learning_rate": 5.117212868016303e-07, "logits/chosen": 10.75143814086914, "logits/rejected": 10.868351936340332, "logps/chosen": -1.0814591646194458, "logps/rejected": -1.154773235321045, "loss": 1.3634, "rewards/accuracies": 0.546875, "rewards/chosen": -2.1629183292388916, "rewards/margins": 0.14662815630435944, "rewards/rejected": -2.30954647064209, "step": 41 }, { "epoch": 0.553314121037464, "grad_norm": 0.9839391112327576, "learning_rate": 4.882787131983697e-07, "logits/chosen": 10.169832229614258, "logits/rejected": 10.324451446533203, "logps/chosen": -0.9578548669815063, "logps/rejected": -1.03363835811615, "loss": 1.3441, "rewards/accuracies": 0.5859375, "rewards/chosen": -1.9157097339630127, "rewards/margins": 0.15156704187393188, "rewards/rejected": -2.0672767162323, "step": 42 }, { "epoch": 0.5664882667764513, "grad_norm": 1.3352464437484741, "learning_rate": 4.648619055273537e-07, "logits/chosen": 9.958123207092285, "logits/rejected": 10.148454666137695, "logps/chosen": -0.9660685062408447, "logps/rejected": -1.1172298192977905, "loss": 1.2631, "rewards/accuracies": 0.703125, "rewards/chosen": -1.9321370124816895, "rewards/margins": 0.3023225665092468, "rewards/rejected": -2.234459638595581, "step": 43 }, { "epoch": 0.5796624125154385, "grad_norm": 0.859604001045227, "learning_rate": 4.4152233901395875e-07, "logits/chosen": 10.425186157226562, "logits/rejected": 10.529376983642578, "logps/chosen": -1.035997986793518, "logps/rejected": -1.11883544921875, "loss": 1.3453, "rewards/accuracies": 0.6171875, "rewards/chosen": -2.071995973587036, "rewards/margins": 0.16567496955394745, "rewards/rejected": -2.2376708984375, "step": 44 }, { "epoch": 0.5928365582544257, "grad_norm": 0.5243343710899353, "learning_rate": 4.183113190907348e-07, "logits/chosen": 10.306164741516113, "logits/rejected": 10.420225143432617, "logps/chosen": -1.0819859504699707, "logps/rejected": -1.1435920000076294, "loss": 1.3655, "rewards/accuracies": 0.6328125, "rewards/chosen": -2.1639719009399414, "rewards/margins": 0.12321220338344574, "rewards/rejected": -2.287184000015259, "step": 45 }, { "epoch": 0.606010703993413, "grad_norm": 4.204680919647217, "learning_rate": 3.9527986861692785e-07, "logits/chosen": 10.315364837646484, "logits/rejected": 10.468154907226562, "logps/chosen": -1.0733493566513062, "logps/rejected": -1.1826094388961792, "loss": 1.3462, "rewards/accuracies": 0.609375, "rewards/chosen": -2.1466987133026123, "rewards/margins": 0.21852055191993713, "rewards/rejected": -2.3652188777923584, "step": 46 }, { "epoch": 0.6191848497324002, "grad_norm": 0.4108594059944153, "learning_rate": 3.724786157191618e-07, "logits/chosen": 10.772510528564453, "logits/rejected": 10.858367919921875, "logps/chosen": -1.147858738899231, "logps/rejected": -1.1819148063659668, "loss": 1.4181, "rewards/accuracies": 0.5625, "rewards/chosen": -2.295717477798462, "rewards/margins": 0.06811191886663437, "rewards/rejected": -2.3638296127319336, "step": 47 }, { "epoch": 0.6323589954713874, "grad_norm": 0.5696843266487122, "learning_rate": 3.499576824998297e-07, "logits/chosen": 10.961053848266602, "logits/rejected": 10.980447769165039, "logps/chosen": -1.0650501251220703, "logps/rejected": -1.1953485012054443, "loss": 1.3094, "rewards/accuracies": 0.625, "rewards/chosen": -2.1301002502441406, "rewards/margins": 0.2605968713760376, "rewards/rejected": -2.3906970024108887, "step": 48 }, { "epoch": 0.6455331412103746, "grad_norm": 0.44700711965560913, "learning_rate": 3.2776657485783356e-07, "logits/chosen": 10.55281925201416, "logits/rejected": 10.7970552444458, "logps/chosen": -1.0387682914733887, "logps/rejected": -1.1529128551483154, "loss": 1.3126, "rewards/accuracies": 0.5859375, "rewards/chosen": -2.0775365829467773, "rewards/margins": 0.22828897833824158, "rewards/rejected": -2.305825710296631, "step": 49 }, { "epoch": 0.6587072869493619, "grad_norm": 4.293384075164795, "learning_rate": 3.0595407366387506e-07, "logits/chosen": 10.384468078613281, "logits/rejected": 10.416984558105469, "logps/chosen": -0.9318161010742188, "logps/rejected": -0.9768213033676147, "loss": 1.3795, "rewards/accuracies": 0.5546875, "rewards/chosen": -1.8636322021484375, "rewards/margins": 0.09001035988330841, "rewards/rejected": -1.9536426067352295, "step": 50 }, { "epoch": 0.6718814326883491, "grad_norm": 0.33170759677886963, "learning_rate": 2.845681275295148e-07, "logits/chosen": 10.023512840270996, "logits/rejected": 10.245420455932617, "logps/chosen": -1.018381953239441, "logps/rejected": -1.0732663869857788, "loss": 1.3705, "rewards/accuracies": 0.5546875, "rewards/chosen": -2.036763906478882, "rewards/margins": 0.10976862907409668, "rewards/rejected": -2.1465327739715576, "step": 51 }, { "epoch": 0.6850555784273363, "grad_norm": 0.38991814851760864, "learning_rate": 2.636557474057173e-07, "logits/chosen": 10.319741249084473, "logits/rejected": 10.445501327514648, "logps/chosen": -1.0599719285964966, "logps/rejected": -1.1591590642929077, "loss": 1.3299, "rewards/accuracies": 0.5234375, "rewards/chosen": -2.119943857192993, "rewards/margins": 0.19837446510791779, "rewards/rejected": -2.3183181285858154, "step": 52 }, { "epoch": 0.6982297241663236, "grad_norm": 0.19526095688343048, "learning_rate": 2.432629032425789e-07, "logits/chosen": 10.324767112731934, "logits/rejected": 10.408279418945312, "logps/chosen": -0.9632182717323303, "logps/rejected": -1.0930753946304321, "loss": 1.2703, "rewards/accuracies": 0.625, "rewards/chosen": -1.9264365434646606, "rewards/margins": 0.259714275598526, "rewards/rejected": -2.1861507892608643, "step": 53 }, { "epoch": 0.7114038699053108, "grad_norm": 2.3096110820770264, "learning_rate": 2.2343442293740028e-07, "logits/chosen": 10.577717781066895, "logits/rejected": 10.743069648742676, "logps/chosen": -1.1589571237564087, "logps/rejected": -1.2371091842651367, "loss": 1.3911, "rewards/accuracies": 0.59375, "rewards/chosen": -2.3179142475128174, "rewards/margins": 0.1563042253255844, "rewards/rejected": -2.4742183685302734, "step": 54 }, { "epoch": 0.724578015644298, "grad_norm": 1.9873132705688477, "learning_rate": 2.0421389379323877e-07, "logits/chosen": 10.299473762512207, "logits/rejected": 10.592483520507812, "logps/chosen": -1.1511658430099487, "logps/rejected": -1.2319570779800415, "loss": 1.3872, "rewards/accuracies": 0.5234375, "rewards/chosen": -2.3023316860198975, "rewards/margins": 0.16158249974250793, "rewards/rejected": -2.463914155960083, "step": 55 }, { "epoch": 0.7377521613832853, "grad_norm": 0.9143347144126892, "learning_rate": 1.8564356670455767e-07, "logits/chosen": 10.18372917175293, "logits/rejected": 10.334182739257812, "logps/chosen": -1.0275030136108398, "logps/rejected": -1.1190022230148315, "loss": 1.3208, "rewards/accuracies": 0.6328125, "rewards/chosen": -2.0550060272216797, "rewards/margins": 0.18299821019172668, "rewards/rejected": -2.238004446029663, "step": 56 }, { "epoch": 0.7509263071222725, "grad_norm": 0.7662608027458191, "learning_rate": 1.6776426328058919e-07, "logits/chosen": 10.509971618652344, "logits/rejected": 10.623988151550293, "logps/chosen": -1.0604355335235596, "logps/rejected": -1.1462385654449463, "loss": 1.37, "rewards/accuracies": 0.609375, "rewards/chosen": -2.120871067047119, "rewards/margins": 0.17160603404045105, "rewards/rejected": -2.2924771308898926, "step": 57 }, { "epoch": 0.7641004528612598, "grad_norm": 0.6004998087882996, "learning_rate": 1.5061528611057915e-07, "logits/chosen": 10.32964038848877, "logits/rejected": 10.610329627990723, "logps/chosen": -1.0015301704406738, "logps/rejected": -1.170923113822937, "loss": 1.2979, "rewards/accuracies": 0.6171875, "rewards/chosen": -2.0030603408813477, "rewards/margins": 0.33878597617149353, "rewards/rejected": -2.341846227645874, "step": 58 }, { "epoch": 0.7772745986002471, "grad_norm": 0.5703756213188171, "learning_rate": 1.3423433236816562e-07, "logits/chosen": 10.539653778076172, "logits/rejected": 10.416500091552734, "logps/chosen": -1.0045316219329834, "logps/rejected": -1.0751566886901855, "loss": 1.3611, "rewards/accuracies": 0.6015625, "rewards/chosen": -2.009063243865967, "rewards/margins": 0.14124992489814758, "rewards/rejected": -2.150313377380371, "step": 59 }, { "epoch": 0.7904487443392343, "grad_norm": 0.5115736126899719, "learning_rate": 1.1865741094480908e-07, "logits/chosen": 10.369180679321289, "logits/rejected": 10.536138534545898, "logps/chosen": -0.9789251089096069, "logps/rejected": -1.065828800201416, "loss": 1.3283, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9578502178192139, "rewards/margins": 0.17380748689174652, "rewards/rejected": -2.131657600402832, "step": 60 }, { "epoch": 0.8036228900782215, "grad_norm": 1.1319605112075806, "learning_rate": 1.0391876329443533e-07, "logits/chosen": 10.789149284362793, "logits/rejected": 10.966440200805664, "logps/chosen": -1.0893032550811768, "logps/rejected": -1.0708649158477783, "loss": 1.5005, "rewards/accuracies": 0.5, "rewards/chosen": -2.1786065101623535, "rewards/margins": -0.03687664493918419, "rewards/rejected": -2.1417298316955566, "step": 61 }, { "epoch": 0.8167970358172087, "grad_norm": 0.5752662420272827, "learning_rate": 9.00507881632877e-08, "logits/chosen": 10.33529281616211, "logits/rejected": 10.413320541381836, "logps/chosen": -0.9582546949386597, "logps/rejected": -0.9869816303253174, "loss": 1.3962, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9165093898773193, "rewards/margins": 0.05745404213666916, "rewards/rejected": -1.9739632606506348, "step": 62 }, { "epoch": 0.829971181556196, "grad_norm": 0.47591590881347656, "learning_rate": 7.708397037045128e-08, "logits/chosen": 10.545299530029297, "logits/rejected": 10.59538745880127, "logps/chosen": -1.0177710056304932, "logps/rejected": -1.0703089237213135, "loss": 1.3921, "rewards/accuracies": 0.6171875, "rewards/chosen": -2.0355420112609863, "rewards/margins": 0.10507576167583466, "rewards/rejected": -2.140617847442627, "step": 63 }, { "epoch": 0.8431453272951832, "grad_norm": 1.4164929389953613, "learning_rate": 6.504681379560489e-08, "logits/chosen": 10.248059272766113, "logits/rejected": 10.443687438964844, "logps/chosen": -1.0844589471817017, "logps/rejected": -1.1213093996047974, "loss": 1.3948, "rewards/accuracies": 0.6015625, "rewards/chosen": -2.1689178943634033, "rewards/margins": 0.07370080798864365, "rewards/rejected": -2.2426187992095947, "step": 64 }, { "epoch": 0.8563194730341704, "grad_norm": 0.5239718556404114, "learning_rate": 5.396577872130675e-08, "logits/chosen": 10.074962615966797, "logits/rejected": 10.317444801330566, "logps/chosen": -0.9682564735412598, "logps/rejected": -1.0767121315002441, "loss": 1.2955, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9365129470825195, "rewards/margins": 0.2169112116098404, "rewards/rejected": -2.1534242630004883, "step": 65 }, { "epoch": 0.8694936187731577, "grad_norm": 0.7642585635185242, "learning_rate": 4.3865223667551686e-08, "logits/chosen": 10.72495174407959, "logits/rejected": 10.884751319885254, "logps/chosen": -1.002912998199463, "logps/rejected": -1.0845009088516235, "loss": 1.3526, "rewards/accuracies": 0.609375, "rewards/chosen": -2.005825996398926, "rewards/margins": 0.16317592561244965, "rewards/rejected": -2.169001817703247, "step": 66 }, { "epoch": 0.8826677645121449, "grad_norm": 0.7223308086395264, "learning_rate": 3.476735184645674e-08, "logits/chosen": 10.134446144104004, "logits/rejected": 10.361305236816406, "logps/chosen": -0.9966915845870972, "logps/rejected": -1.0558110475540161, "loss": 1.3993, "rewards/accuracies": 0.5859375, "rewards/chosen": -1.9933831691741943, "rewards/margins": 0.11823896318674088, "rewards/rejected": -2.1116220951080322, "step": 67 }, { "epoch": 0.8958419102511321, "grad_norm": 1.389740228652954, "learning_rate": 2.6692162354782943e-08, "logits/chosen": 10.645910263061523, "logits/rejected": 10.837828636169434, "logps/chosen": -1.0475916862487793, "logps/rejected": -1.0700594186782837, "loss": 1.4014, "rewards/accuracies": 0.578125, "rewards/chosen": -2.0951833724975586, "rewards/margins": 0.0449354350566864, "rewards/rejected": -2.1401188373565674, "step": 68 }, { "epoch": 0.9090160559901194, "grad_norm": 0.7170724272727966, "learning_rate": 1.9657406211579962e-08, "logits/chosen": 10.590347290039062, "logits/rejected": 10.748612403869629, "logps/chosen": -1.0553630590438843, "logps/rejected": -1.1445770263671875, "loss": 1.3717, "rewards/accuracies": 0.578125, "rewards/chosen": -2.1107261180877686, "rewards/margins": 0.1784280687570572, "rewards/rejected": -2.289154052734375, "step": 69 }, { "epoch": 0.9221902017291066, "grad_norm": 0.5668903589248657, "learning_rate": 1.3678547337593494e-08, "logits/chosen": 10.05616569519043, "logits/rejected": 10.123579978942871, "logps/chosen": -0.9823417067527771, "logps/rejected": -1.0538458824157715, "loss": 1.3274, "rewards/accuracies": 0.609375, "rewards/chosen": -1.9646834135055542, "rewards/margins": 0.14300836622714996, "rewards/rejected": -2.107691764831543, "step": 70 }, { "epoch": 0.9353643474680938, "grad_norm": 0.7955140471458435, "learning_rate": 8.768728562211946e-09, "logits/chosen": 10.548900604248047, "logits/rejected": 10.70114517211914, "logps/chosen": -1.0273078680038452, "logps/rejected": -1.125605583190918, "loss": 1.3531, "rewards/accuracies": 0.5546875, "rewards/chosen": -2.0546157360076904, "rewards/margins": 0.19659575819969177, "rewards/rejected": -2.251211166381836, "step": 71 }, { "epoch": 0.9485384932070811, "grad_norm": 0.6895307302474976, "learning_rate": 4.938742732674528e-09, "logits/chosen": 10.362960815429688, "logits/rejected": 10.645204544067383, "logps/chosen": -0.9852724075317383, "logps/rejected": -1.013486623764038, "loss": 1.4038, "rewards/accuracies": 0.546875, "rewards/chosen": -1.9705448150634766, "rewards/margins": 0.05642828345298767, "rewards/rejected": -2.026973247528076, "step": 72 }, { "epoch": 0.9617126389460683, "grad_norm": 0.9335980415344238, "learning_rate": 2.1970089890509524e-09, "logits/chosen": 10.450238227844238, "logits/rejected": 10.472136497497559, "logps/chosen": -0.9828729629516602, "logps/rejected": -1.0797333717346191, "loss": 1.306, "rewards/accuracies": 0.5859375, "rewards/chosen": -1.9657459259033203, "rewards/margins": 0.19372045993804932, "rewards/rejected": -2.1594667434692383, "step": 73 }, { "epoch": 0.9748867846850556, "grad_norm": 6.053179740905762, "learning_rate": 5.495542571443135e-10, "logits/chosen": 10.098093032836914, "logits/rejected": 10.248712539672852, "logps/chosen": -0.9912701845169067, "logps/rejected": -1.0857067108154297, "loss": 1.3291, "rewards/accuracies": 0.7109375, "rewards/chosen": -1.9825403690338135, "rewards/margins": 0.1888730227947235, "rewards/rejected": -2.1714134216308594, "step": 74 }, { "epoch": 0.9880609304240429, "grad_norm": 0.5729569792747498, "learning_rate": 0.0, "logits/chosen": 10.522598266601562, "logits/rejected": 10.471258163452148, "logps/chosen": -0.9319721460342407, "logps/rejected": -1.08614981174469, "loss": 1.2844, "rewards/accuracies": 0.6796875, "rewards/chosen": -1.8639442920684814, "rewards/margins": 0.3083552122116089, "rewards/rejected": -2.17229962348938, "step": 75 }, { "epoch": 0.9880609304240429, "step": 75, "total_flos": 0.0, "train_loss": 1.3642690054575601, "train_runtime": 7453.8282, "train_samples_per_second": 1.303, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 12, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }