diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4714 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 72.06516054493869, + "learning_rate": 1.99424e-05, + "logits/chosen": 0.231048583984375, + "logits/rejected": 0.40448302030563354, + "logps/chosen": -263.57501220703125, + "logps/rejected": -220.2624969482422, + "loss": 0.8815, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6032226085662842, + "rewards/margins": 0.45408326387405396, + "rewards/rejected": -2.056042432785034, + "step": 10 + }, + { + "epoch": 0.0064, + "grad_norm": 67.31594654043894, + "learning_rate": 1.9878400000000003e-05, + "logits/chosen": 0.58917236328125, + "logits/rejected": 0.6860595941543579, + "logps/chosen": -325.4750061035156, + "logps/rejected": -271.20001220703125, + "loss": 0.8773, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -7.1884765625, + "rewards/margins": 0.971118152141571, + "rewards/rejected": -8.158594131469727, + "step": 20 + }, + { + "epoch": 0.0096, + "grad_norm": 92.75698380810871, + "learning_rate": 1.98144e-05, + "logits/chosen": 0.5443969964981079, + "logits/rejected": 0.6279541254043579, + "logps/chosen": -360.07501220703125, + "logps/rejected": -327.75, + "loss": 1.0953, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -11.444531440734863, + "rewards/margins": 0.8212890625, + "rewards/rejected": -12.265625, + "step": 30 + }, + { + "epoch": 0.0128, + "grad_norm": 62.32858351999384, + "learning_rate": 1.97504e-05, + "logits/chosen": 0.3295272886753082, + "logits/rejected": 0.38934326171875, + "logps/chosen": -312.5, + "logps/rejected": -276.2749938964844, + "loss": 1.4072, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -8.244531631469727, + "rewards/margins": 0.801513671875, + "rewards/rejected": -9.04296875, + "step": 40 + }, + { + "epoch": 0.016, + "grad_norm": 50.08386638109074, + "learning_rate": 1.9686400000000002e-05, + "logits/chosen": 0.71923828125, + "logits/rejected": 0.766186535358429, + "logps/chosen": -347.57501220703125, + "logps/rejected": -320.4750061035156, + "loss": 1.2033, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.116406440734863, + "rewards/margins": 0.8971191644668579, + "rewards/rejected": -12.012499809265137, + "step": 50 + }, + { + "epoch": 0.0192, + "grad_norm": 94.5795753479825, + "learning_rate": 1.96224e-05, + "logits/chosen": 0.876757800579071, + "logits/rejected": 0.9106689691543579, + "logps/chosen": -431.54998779296875, + "logps/rejected": -398.8999938964844, + "loss": 1.2877, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -19.684375762939453, + "rewards/margins": 1.2566406726837158, + "rewards/rejected": -20.934375762939453, + "step": 60 + }, + { + "epoch": 0.0224, + "grad_norm": 87.40075574771897, + "learning_rate": 1.9558400000000002e-05, + "logits/chosen": 0.45653611421585083, + "logits/rejected": 0.5081542730331421, + "logps/chosen": -383.7250061035156, + "logps/rejected": -348.1000061035156, + "loss": 1.7876, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -14.109375, + "rewards/margins": 0.7588866949081421, + "rewards/rejected": -14.8671875, + "step": 70 + }, + { + "epoch": 0.0256, + "grad_norm": 63.266263022294886, + "learning_rate": 1.94944e-05, + "logits/chosen": 0.560455322265625, + "logits/rejected": 0.558880627155304, + "logps/chosen": -446.3500061035156, + "logps/rejected": -406.8999938964844, + "loss": 1.5243, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -18.8125, + "rewards/margins": 2.1751952171325684, + "rewards/rejected": -20.993749618530273, + "step": 80 + }, + { + "epoch": 0.0288, + "grad_norm": 68.41909447724173, + "learning_rate": 1.9430400000000003e-05, + "logits/chosen": 0.8628174066543579, + "logits/rejected": 0.8074280023574829, + "logps/chosen": -409.6000061035156, + "logps/rejected": -388.79998779296875, + "loss": 1.2459, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -15.942187309265137, + "rewards/margins": 2.4151368141174316, + "rewards/rejected": -18.362499237060547, + "step": 90 + }, + { + "epoch": 0.032, + "grad_norm": 54.65422389140606, + "learning_rate": 1.93664e-05, + "logits/chosen": NaN, + "logits/rejected": 1.095117211341858, + "logps/chosen": -339.8500061035156, + "logps/rejected": -304.875, + "loss": 1.3923, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -10.8828125, + "rewards/margins": 1.1833007335662842, + "rewards/rejected": -12.067187309265137, + "step": 100 + }, + { + "epoch": 0.0352, + "grad_norm": 67.99626476930845, + "learning_rate": 1.93024e-05, + "logits/chosen": 0.854418933391571, + "logits/rejected": 0.8412841558456421, + "logps/chosen": -388.95001220703125, + "logps/rejected": -357.0, + "loss": 1.4211, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -16.135936737060547, + "rewards/margins": 1.308984398841858, + "rewards/rejected": -17.446874618530273, + "step": 110 + }, + { + "epoch": 0.0384, + "grad_norm": 71.03359350317054, + "learning_rate": 1.9238400000000002e-05, + "logits/chosen": 0.7735840082168579, + "logits/rejected": 0.8270019292831421, + "logps/chosen": -400.45001220703125, + "logps/rejected": -384.3500061035156, + "loss": 1.6411, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -17.251562118530273, + "rewards/margins": 1.4036133289337158, + "rewards/rejected": -18.651561737060547, + "step": 120 + }, + { + "epoch": 0.0416, + "grad_norm": 66.85635125220757, + "learning_rate": 1.91744e-05, + "logits/chosen": 1.1936523914337158, + "logits/rejected": 1.174414038658142, + "logps/chosen": -418.70001220703125, + "logps/rejected": -375.42498779296875, + "loss": 1.3968, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -16.337499618530273, + "rewards/margins": 1.5578124523162842, + "rewards/rejected": -17.892187118530273, + "step": 130 + }, + { + "epoch": 0.0448, + "grad_norm": 43.79339456634566, + "learning_rate": 1.9110400000000003e-05, + "logits/chosen": 1.4757812023162842, + "logits/rejected": 1.500390648841858, + "logps/chosen": -474.5, + "logps/rejected": -437.6000061035156, + "loss": 1.1547, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -21.924999237060547, + "rewards/margins": 2.652148485183716, + "rewards/rejected": -24.584375381469727, + "step": 140 + }, + { + "epoch": 0.048, + "grad_norm": 39.19090939652315, + "learning_rate": 1.90464e-05, + "logits/chosen": 1.182226538658142, + "logits/rejected": 1.2678711414337158, + "logps/chosen": -504.54998779296875, + "logps/rejected": -473.20001220703125, + "loss": 1.0756, + "rewards/accuracies": 0.671875, + "rewards/chosen": -27.087499618530273, + "rewards/margins": 2.50390625, + "rewards/rejected": -29.584375381469727, + "step": 150 + }, + { + "epoch": 0.0512, + "grad_norm": 81.33473530148507, + "learning_rate": 1.89824e-05, + "logits/chosen": NaN, + "logits/rejected": 1.238867163658142, + "logps/chosen": -455.0, + "logps/rejected": -433.3500061035156, + "loss": 1.2397, + "rewards/accuracies": 0.65625, + "rewards/chosen": -21.904687881469727, + "rewards/margins": 1.8517577648162842, + "rewards/rejected": -23.756250381469727, + "step": 160 + }, + { + "epoch": 0.0544, + "grad_norm": 34.92625419778924, + "learning_rate": 1.89184e-05, + "logits/chosen": 1.28662109375, + "logits/rejected": 1.349023461341858, + "logps/chosen": -479.45001220703125, + "logps/rejected": -466.8500061035156, + "loss": 1.2286, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -24.075000762939453, + "rewards/margins": 2.2798829078674316, + "rewards/rejected": -26.362499237060547, + "step": 170 + }, + { + "epoch": 0.0576, + "grad_norm": 65.20906568316, + "learning_rate": 1.88544e-05, + "logits/chosen": 1.4373047351837158, + "logits/rejected": 1.4529297351837158, + "logps/chosen": -493.0, + "logps/rejected": -460.54998779296875, + "loss": 1.4699, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -24.090625762939453, + "rewards/margins": 2.7845702171325684, + "rewards/rejected": -26.881250381469727, + "step": 180 + }, + { + "epoch": 0.0608, + "grad_norm": 68.96107516764462, + "learning_rate": 1.8790400000000002e-05, + "logits/chosen": 1.8361327648162842, + "logits/rejected": 1.876562476158142, + "logps/chosen": -427.25, + "logps/rejected": -413.6499938964844, + "loss": 1.5985, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -17.9140625, + "rewards/margins": 2.70458984375, + "rewards/rejected": -20.620311737060547, + "step": 190 + }, + { + "epoch": 0.064, + "grad_norm": 24.34257611593426, + "learning_rate": 1.87264e-05, + "logits/chosen": 1.532812476158142, + "logits/rejected": 1.641992211341858, + "logps/chosen": -545.7000122070312, + "logps/rejected": -510.20001220703125, + "loss": 1.3766, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -28.649999618530273, + "rewards/margins": 2.236523389816284, + "rewards/rejected": -30.881250381469727, + "step": 200 + }, + { + "epoch": 0.0672, + "grad_norm": 56.274263875182086, + "learning_rate": 1.86624e-05, + "logits/chosen": 1.5291016101837158, + "logits/rejected": 1.6222655773162842, + "logps/chosen": -519.9000244140625, + "logps/rejected": -483.04998779296875, + "loss": 1.5423, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -27.978124618530273, + "rewards/margins": 2.147265672683716, + "rewards/rejected": -30.134374618530273, + "step": 210 + }, + { + "epoch": 0.0704, + "grad_norm": 43.12330507290986, + "learning_rate": 1.85984e-05, + "logits/chosen": 1.486914038658142, + "logits/rejected": 1.56640625, + "logps/chosen": -515.4000244140625, + "logps/rejected": -502.6000061035156, + "loss": 1.4434, + "rewards/accuracies": 0.65625, + "rewards/chosen": -26.762500762939453, + "rewards/margins": 3.0335936546325684, + "rewards/rejected": -29.787500381469727, + "step": 220 + }, + { + "epoch": 0.0736, + "grad_norm": 104.38928515088328, + "learning_rate": 1.85344e-05, + "logits/chosen": 1.5703125, + "logits/rejected": 1.5720703601837158, + "logps/chosen": -518.6500244140625, + "logps/rejected": -507.8500061035156, + "loss": 1.5691, + "rewards/accuracies": 0.671875, + "rewards/chosen": -28.631250381469727, + "rewards/margins": 2.7933592796325684, + "rewards/rejected": -31.403125762939453, + "step": 230 + }, + { + "epoch": 0.0768, + "grad_norm": 49.06004160121367, + "learning_rate": 1.8470400000000002e-05, + "logits/chosen": NaN, + "logits/rejected": 1.42626953125, + "logps/chosen": -547.9500122070312, + "logps/rejected": -520.2999877929688, + "loss": 1.6266, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -30.484375, + "rewards/margins": 2.9068360328674316, + "rewards/rejected": -33.375, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 50.08613409082756, + "learning_rate": 1.84064e-05, + "logits/chosen": 1.2625000476837158, + "logits/rejected": 1.298242211341858, + "logps/chosen": -526.0499877929688, + "logps/rejected": -508.1499938964844, + "loss": 1.7452, + "rewards/accuracies": 0.65625, + "rewards/chosen": -29.581249237060547, + "rewards/margins": 1.581640601158142, + "rewards/rejected": -31.178125381469727, + "step": 250 + }, + { + "epoch": 0.0832, + "grad_norm": 68.80666192170693, + "learning_rate": 1.8342400000000002e-05, + "logits/chosen": 1.3650391101837158, + "logits/rejected": 1.4216797351837158, + "logps/chosen": -537.9000244140625, + "logps/rejected": -524.1500244140625, + "loss": 1.9467, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -28.493749618530273, + "rewards/margins": 2.242480516433716, + "rewards/rejected": -30.737499237060547, + "step": 260 + }, + { + "epoch": 0.0864, + "grad_norm": 48.44817365637928, + "learning_rate": 1.82784e-05, + "logits/chosen": 1.2820312976837158, + "logits/rejected": 1.366601586341858, + "logps/chosen": -518.2999877929688, + "logps/rejected": -507.45001220703125, + "loss": 1.5778, + "rewards/accuracies": 0.65625, + "rewards/chosen": -26.75, + "rewards/margins": 3.1568360328674316, + "rewards/rejected": -29.896875381469727, + "step": 270 + }, + { + "epoch": 0.0896, + "grad_norm": 55.2264342326786, + "learning_rate": 1.82144e-05, + "logits/chosen": 1.174218773841858, + "logits/rejected": 1.2262694835662842, + "logps/chosen": -531.7000122070312, + "logps/rejected": -520.0999755859375, + "loss": 2.2168, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -29.125, + "rewards/margins": 3.4775390625, + "rewards/rejected": -32.631248474121094, + "step": 280 + }, + { + "epoch": 0.0928, + "grad_norm": 17.509579683846354, + "learning_rate": 1.81504e-05, + "logits/chosen": 1.038964867591858, + "logits/rejected": 1.112695336341858, + "logps/chosen": -531.1500244140625, + "logps/rejected": -501.45001220703125, + "loss": 1.4645, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -28.693750381469727, + "rewards/margins": 2.20703125, + "rewards/rejected": -30.896875381469727, + "step": 290 + }, + { + "epoch": 0.096, + "grad_norm": 67.44310712108323, + "learning_rate": 1.80864e-05, + "logits/chosen": 1.015234351158142, + "logits/rejected": 1.0890624523162842, + "logps/chosen": -502.3500061035156, + "logps/rejected": -494.20001220703125, + "loss": 1.8911, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -27.140625, + "rewards/margins": 2.791210889816284, + "rewards/rejected": -29.921875, + "step": 300 + }, + { + "epoch": 0.0992, + "grad_norm": 66.76807934268572, + "learning_rate": 1.8022400000000002e-05, + "logits/chosen": 0.9276367425918579, + "logits/rejected": 1.0182616710662842, + "logps/chosen": -493.70001220703125, + "logps/rejected": -474.70001220703125, + "loss": 1.6675, + "rewards/accuracies": 0.59375, + "rewards/chosen": -24.915624618530273, + "rewards/margins": 2.3501954078674316, + "rewards/rejected": -27.265625, + "step": 310 + }, + { + "epoch": 0.1024, + "grad_norm": 38.01975820590516, + "learning_rate": 1.7958400000000004e-05, + "logits/chosen": 0.9652343988418579, + "logits/rejected": 1.0524413585662842, + "logps/chosen": -485.70001220703125, + "logps/rejected": -477.1000061035156, + "loss": 1.6211, + "rewards/accuracies": 0.640625, + "rewards/chosen": -26.571874618530273, + "rewards/margins": 2.431933641433716, + "rewards/rejected": -28.993749618530273, + "step": 320 + }, + { + "epoch": 0.1056, + "grad_norm": 47.41486546942003, + "learning_rate": 1.78944e-05, + "logits/chosen": 1.1033203601837158, + "logits/rejected": 1.233300805091858, + "logps/chosen": -560.7000122070312, + "logps/rejected": -530.6500244140625, + "loss": 1.3983, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -31.793750762939453, + "rewards/margins": 3.658984422683716, + "rewards/rejected": -35.459373474121094, + "step": 330 + }, + { + "epoch": 0.1088, + "grad_norm": 54.10847973092568, + "learning_rate": 1.78304e-05, + "logits/chosen": 1.1881835460662842, + "logits/rejected": 1.345117211341858, + "logps/chosen": -569.7999877929688, + "logps/rejected": -572.7999877929688, + "loss": 1.6905, + "rewards/accuracies": 0.703125, + "rewards/chosen": -32.734375, + "rewards/margins": 4.145117282867432, + "rewards/rejected": -36.890625, + "step": 340 + }, + { + "epoch": 0.112, + "grad_norm": 44.154579996950346, + "learning_rate": 1.77664e-05, + "logits/chosen": 1.252832055091858, + "logits/rejected": 1.418554663658142, + "logps/chosen": -633.3499755859375, + "logps/rejected": -637.9500122070312, + "loss": 1.984, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -38.146873474121094, + "rewards/margins": 3.931640625, + "rewards/rejected": -42.08124923706055, + "step": 350 + }, + { + "epoch": 0.1152, + "grad_norm": 56.29869734936286, + "learning_rate": 1.7702400000000002e-05, + "logits/chosen": 0.9906250238418579, + "logits/rejected": 1.1129882335662842, + "logps/chosen": -557.0999755859375, + "logps/rejected": -543.5999755859375, + "loss": 1.0955, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -32.025001525878906, + "rewards/margins": 3.8828125, + "rewards/rejected": -35.90625, + "step": 360 + }, + { + "epoch": 0.1184, + "grad_norm": 24.075696906183882, + "learning_rate": 1.7638400000000004e-05, + "logits/chosen": 0.8721679449081421, + "logits/rejected": 0.9461914300918579, + "logps/chosen": -580.7000122070312, + "logps/rejected": -578.75, + "loss": 1.3949, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -32.259376525878906, + "rewards/margins": 4.385937690734863, + "rewards/rejected": -36.64374923706055, + "step": 370 + }, + { + "epoch": 0.1216, + "grad_norm": 72.29804215542902, + "learning_rate": 1.75744e-05, + "logits/chosen": 0.9410156011581421, + "logits/rejected": 1.0275390148162842, + "logps/chosen": -564.0, + "logps/rejected": -546.0999755859375, + "loss": 1.4662, + "rewards/accuracies": 0.703125, + "rewards/chosen": -31.365625381469727, + "rewards/margins": 3.5619139671325684, + "rewards/rejected": -34.91875076293945, + "step": 380 + }, + { + "epoch": 0.1248, + "grad_norm": 32.08984891925103, + "learning_rate": 1.75104e-05, + "logits/chosen": 0.79833984375, + "logits/rejected": 1.019921898841858, + "logps/chosen": -573.5, + "logps/rejected": -545.5, + "loss": 1.2478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -34.00312423706055, + "rewards/margins": 3.6591796875, + "rewards/rejected": -37.662498474121094, + "step": 390 + }, + { + "epoch": 0.128, + "grad_norm": 53.03788678986745, + "learning_rate": 1.74464e-05, + "logits/chosen": 0.754638671875, + "logits/rejected": 0.9137207269668579, + "logps/chosen": -633.7000122070312, + "logps/rejected": -602.1500244140625, + "loss": 1.9865, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -37.20624923706055, + "rewards/margins": 2.9712891578674316, + "rewards/rejected": -40.19062423706055, + "step": 400 + }, + { + "epoch": 0.1312, + "grad_norm": 28.774739819689344, + "learning_rate": 1.73824e-05, + "logits/chosen": 0.962207019329071, + "logits/rejected": 1.1422851085662842, + "logps/chosen": -637.7000122070312, + "logps/rejected": -610.0999755859375, + "loss": 1.374, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -38.53125, + "rewards/margins": 3.3929686546325684, + "rewards/rejected": -41.931251525878906, + "step": 410 + }, + { + "epoch": 0.1344, + "grad_norm": 52.18984822269678, + "learning_rate": 1.7318400000000003e-05, + "logits/chosen": 0.9715820550918579, + "logits/rejected": 1.06298828125, + "logps/chosen": -584.4500122070312, + "logps/rejected": -564.0, + "loss": 1.5886, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -33.646873474121094, + "rewards/margins": 3.372851610183716, + "rewards/rejected": -37.01874923706055, + "step": 420 + }, + { + "epoch": 0.1376, + "grad_norm": 50.15500041735182, + "learning_rate": 1.7254400000000002e-05, + "logits/chosen": 0.8271484375, + "logits/rejected": 0.9488281011581421, + "logps/chosen": -600.5499877929688, + "logps/rejected": -582.4500122070312, + "loss": 2.058, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -36.96875, + "rewards/margins": 2.6484375, + "rewards/rejected": -39.618751525878906, + "step": 430 + }, + { + "epoch": 0.1408, + "grad_norm": 63.480323123245526, + "learning_rate": 1.71904e-05, + "logits/chosen": 0.8534179925918579, + "logits/rejected": 0.9185546636581421, + "logps/chosen": -650.9000244140625, + "logps/rejected": -622.9000244140625, + "loss": 1.9219, + "rewards/accuracies": 0.59375, + "rewards/chosen": -40.29999923706055, + "rewards/margins": 1.6640625, + "rewards/rejected": -41.98749923706055, + "step": 440 + }, + { + "epoch": 0.144, + "grad_norm": 83.77496034505918, + "learning_rate": 1.71264e-05, + "logits/chosen": 0.799267590045929, + "logits/rejected": 0.910839855670929, + "logps/chosen": -668.5, + "logps/rejected": -645.5999755859375, + "loss": 1.8358, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -41.368751525878906, + "rewards/margins": 2.6468749046325684, + "rewards/rejected": -44.0, + "step": 450 + }, + { + "epoch": 0.1472, + "grad_norm": 29.813610383691678, + "learning_rate": 1.70624e-05, + "logits/chosen": 0.5895019769668579, + "logits/rejected": 0.6923828125, + "logps/chosen": -625.2999877929688, + "logps/rejected": -621.0499877929688, + "loss": 1.4774, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -39.506248474121094, + "rewards/margins": 2.995312452316284, + "rewards/rejected": -42.5, + "step": 460 + }, + { + "epoch": 0.1504, + "grad_norm": 180.7604639291859, + "learning_rate": 1.6998400000000003e-05, + "logits/chosen": 0.34160155057907104, + "logits/rejected": 0.4342285096645355, + "logps/chosen": -605.0499877929688, + "logps/rejected": -602.5999755859375, + "loss": 1.6735, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -36.631248474121094, + "rewards/margins": 4.542578220367432, + "rewards/rejected": -41.150001525878906, + "step": 470 + }, + { + "epoch": 0.1536, + "grad_norm": 48.63799038212587, + "learning_rate": 1.6934400000000002e-05, + "logits/chosen": 0.3518127501010895, + "logits/rejected": 0.4427856504917145, + "logps/chosen": -612.0999755859375, + "logps/rejected": -590.0999755859375, + "loss": 2.0439, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -37.775001525878906, + "rewards/margins": 2.4281249046325684, + "rewards/rejected": -40.20624923706055, + "step": 480 + }, + { + "epoch": 0.1568, + "grad_norm": 37.41673163568628, + "learning_rate": 1.6870400000000004e-05, + "logits/chosen": 0.335458368062973, + "logits/rejected": 0.43060302734375, + "logps/chosen": -610.2999877929688, + "logps/rejected": -592.8499755859375, + "loss": 1.6041, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -35.609375, + "rewards/margins": 3.303906202316284, + "rewards/rejected": -38.91875076293945, + "step": 490 + }, + { + "epoch": 0.16, + "grad_norm": 26.291102862006582, + "learning_rate": 1.68064e-05, + "logits/chosen": 0.36564940214157104, + "logits/rejected": 0.4826454222202301, + "logps/chosen": -674.7000122070312, + "logps/rejected": -653.4000244140625, + "loss": 1.5988, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -42.32500076293945, + "rewards/margins": 3.46875, + "rewards/rejected": -45.79999923706055, + "step": 500 + }, + { + "epoch": 0.1632, + "grad_norm": 39.95073291435577, + "learning_rate": 1.67424e-05, + "logits/chosen": 0.33665770292282104, + "logits/rejected": 0.4792236387729645, + "logps/chosen": -629.9000244140625, + "logps/rejected": -621.3499755859375, + "loss": 1.3085, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -39.09687423706055, + "rewards/margins": 3.880078077316284, + "rewards/rejected": -42.95000076293945, + "step": 510 + }, + { + "epoch": 0.1664, + "grad_norm": 42.0639206744406, + "learning_rate": 1.6678400000000003e-05, + "logits/chosen": 0.44868165254592896, + "logits/rejected": 0.5293945074081421, + "logps/chosen": -617.1500244140625, + "logps/rejected": -606.7000122070312, + "loss": 1.849, + "rewards/accuracies": 0.640625, + "rewards/chosen": -37.19062423706055, + "rewards/margins": 3.7105469703674316, + "rewards/rejected": -40.875, + "step": 520 + }, + { + "epoch": 0.1696, + "grad_norm": 46.5708491699604, + "learning_rate": 1.66144e-05, + "logits/chosen": 0.6026366949081421, + "logits/rejected": 0.741015613079071, + "logps/chosen": -633.7999877929688, + "logps/rejected": -623.75, + "loss": 1.8425, + "rewards/accuracies": 0.6875, + "rewards/chosen": -39.5625, + "rewards/margins": 2.8218750953674316, + "rewards/rejected": -42.35625076293945, + "step": 530 + }, + { + "epoch": 0.1728, + "grad_norm": 44.08411871984865, + "learning_rate": 1.6550400000000003e-05, + "logits/chosen": 0.532763659954071, + "logits/rejected": 0.61279296875, + "logps/chosen": -601.9500122070312, + "logps/rejected": -597.0499877929688, + "loss": 1.6733, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -34.59375, + "rewards/margins": 3.856640577316284, + "rewards/rejected": -38.44062423706055, + "step": 540 + }, + { + "epoch": 0.176, + "grad_norm": 51.086462511837894, + "learning_rate": 1.6486400000000002e-05, + "logits/chosen": 0.5535888671875, + "logits/rejected": 0.720458984375, + "logps/chosen": -616.75, + "logps/rejected": -600.2999877929688, + "loss": 1.8463, + "rewards/accuracies": 0.671875, + "rewards/chosen": -38.243751525878906, + "rewards/margins": 3.561718702316284, + "rewards/rejected": -41.79999923706055, + "step": 550 + }, + { + "epoch": 0.1792, + "grad_norm": 98.59034152091577, + "learning_rate": 1.64224e-05, + "logits/chosen": 0.24627113342285156, + "logits/rejected": 0.3730407655239105, + "logps/chosen": -631.0999755859375, + "logps/rejected": -630.5, + "loss": 1.427, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -39.525001525878906, + "rewards/margins": 4.168749809265137, + "rewards/rejected": -43.70624923706055, + "step": 560 + }, + { + "epoch": 0.1824, + "grad_norm": 51.10415695723496, + "learning_rate": 1.6358400000000002e-05, + "logits/chosen": 0.026998138055205345, + "logits/rejected": 0.14695052802562714, + "logps/chosen": -660.7000122070312, + "logps/rejected": -647.7999877929688, + "loss": 1.8255, + "rewards/accuracies": 0.703125, + "rewards/chosen": -41.57500076293945, + "rewards/margins": 3.4976563453674316, + "rewards/rejected": -45.10625076293945, + "step": 570 + }, + { + "epoch": 0.1856, + "grad_norm": 47.93213643256416, + "learning_rate": 1.62944e-05, + "logits/chosen": 0.11405792087316513, + "logits/rejected": 0.22451934218406677, + "logps/chosen": -651.5999755859375, + "logps/rejected": -628.0, + "loss": 2.0114, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -39.881248474121094, + "rewards/margins": 3.618359327316284, + "rewards/rejected": -43.493751525878906, + "step": 580 + }, + { + "epoch": 0.1888, + "grad_norm": 83.41007510418964, + "learning_rate": 1.6230400000000003e-05, + "logits/chosen": 0.19649505615234375, + "logits/rejected": 0.3397933840751648, + "logps/chosen": -627.3499755859375, + "logps/rejected": -606.0499877929688, + "loss": 1.463, + "rewards/accuracies": 0.671875, + "rewards/chosen": -37.96875, + "rewards/margins": 2.376953125, + "rewards/rejected": -40.36249923706055, + "step": 590 + }, + { + "epoch": 0.192, + "grad_norm": 28.81403201309904, + "learning_rate": 1.61664e-05, + "logits/chosen": 0.1689552366733551, + "logits/rejected": 0.2696182131767273, + "logps/chosen": -658.0, + "logps/rejected": -635.9500122070312, + "loss": 1.6455, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -41.20624923706055, + "rewards/margins": 3.1421875953674316, + "rewards/rejected": -44.318748474121094, + "step": 600 + }, + { + "epoch": 0.1952, + "grad_norm": 61.34725082837701, + "learning_rate": 1.61024e-05, + "logits/chosen": 0.09075927734375, + "logits/rejected": 0.26406097412109375, + "logps/chosen": -682.7000122070312, + "logps/rejected": -639.0, + "loss": 1.7396, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -41.756248474121094, + "rewards/margins": 3.12890625, + "rewards/rejected": -44.875, + "step": 610 + }, + { + "epoch": 0.1984, + "grad_norm": 42.24429252671063, + "learning_rate": 1.6038400000000002e-05, + "logits/chosen": 0.2577148377895355, + "logits/rejected": 0.35584717988967896, + "logps/chosen": -616.1500244140625, + "logps/rejected": -575.7000122070312, + "loss": 1.3043, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -36.234375, + "rewards/margins": 3.4593749046325684, + "rewards/rejected": -39.67499923706055, + "step": 620 + }, + { + "epoch": 0.2016, + "grad_norm": 32.5713838251784, + "learning_rate": 1.59744e-05, + "logits/chosen": 0.294912725687027, + "logits/rejected": 0.4835205078125, + "logps/chosen": -614.9000244140625, + "logps/rejected": -621.5499877929688, + "loss": 1.4359, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -39.1875, + "rewards/margins": 4.471093654632568, + "rewards/rejected": -43.63750076293945, + "step": 630 + }, + { + "epoch": 0.2048, + "grad_norm": 42.595929409719815, + "learning_rate": 1.5910400000000003e-05, + "logits/chosen": 0.18522796034812927, + "logits/rejected": 0.33216553926467896, + "logps/chosen": -659.5999755859375, + "logps/rejected": -664.9000244140625, + "loss": 1.4863, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -41.0, + "rewards/margins": 4.653906345367432, + "rewards/rejected": -45.650001525878906, + "step": 640 + }, + { + "epoch": 0.208, + "grad_norm": 31.20306145083759, + "learning_rate": 1.58464e-05, + "logits/chosen": 0.2649597227573395, + "logits/rejected": 0.4009460508823395, + "logps/chosen": -678.4000244140625, + "logps/rejected": -680.5999755859375, + "loss": 1.227, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -43.875, + "rewards/margins": 5.211718559265137, + "rewards/rejected": -49.10625076293945, + "step": 650 + }, + { + "epoch": 0.2112, + "grad_norm": 38.78603097715352, + "learning_rate": 1.5782400000000003e-05, + "logits/chosen": 0.11473388969898224, + "logits/rejected": 0.24262085556983948, + "logps/chosen": -662.9500122070312, + "logps/rejected": -672.4000244140625, + "loss": 1.5832, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -41.118751525878906, + "rewards/margins": 4.8125, + "rewards/rejected": -45.9375, + "step": 660 + }, + { + "epoch": 0.2144, + "grad_norm": 57.46675514687721, + "learning_rate": 1.5718400000000002e-05, + "logits/chosen": 0.120941162109375, + "logits/rejected": 0.29290771484375, + "logps/chosen": -641.0999755859375, + "logps/rejected": -655.0999755859375, + "loss": 1.9178, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -40.63750076293945, + "rewards/margins": 3.264453172683716, + "rewards/rejected": -43.92499923706055, + "step": 670 + }, + { + "epoch": 0.2176, + "grad_norm": 43.95416076450901, + "learning_rate": 1.56544e-05, + "logits/chosen": 0.10860595852136612, + "logits/rejected": 0.26765745878219604, + "logps/chosen": -647.0999755859375, + "logps/rejected": -625.7000122070312, + "loss": 1.9465, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -40.79999923706055, + "rewards/margins": 3.3539061546325684, + "rewards/rejected": -44.131248474121094, + "step": 680 + }, + { + "epoch": 0.2208, + "grad_norm": 40.455817117237345, + "learning_rate": 1.5590400000000002e-05, + "logits/chosen": 0.13580170273780823, + "logits/rejected": 0.33785706758499146, + "logps/chosen": -683.7999877929688, + "logps/rejected": -664.2000122070312, + "loss": 1.6045, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -42.881248474121094, + "rewards/margins": 4.641797065734863, + "rewards/rejected": -47.525001525878906, + "step": 690 + }, + { + "epoch": 0.224, + "grad_norm": 43.25947452615599, + "learning_rate": 1.55264e-05, + "logits/chosen": 0.06497345119714737, + "logits/rejected": 0.23727111518383026, + "logps/chosen": -646.2999877929688, + "logps/rejected": -632.7999877929688, + "loss": 1.8122, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -40.912498474121094, + "rewards/margins": 3.6683592796325684, + "rewards/rejected": -44.58124923706055, + "step": 700 + }, + { + "epoch": 0.2272, + "grad_norm": 37.454723687237205, + "learning_rate": 1.5462400000000003e-05, + "logits/chosen": 0.18839111924171448, + "logits/rejected": 0.360678106546402, + "logps/chosen": -704.4000244140625, + "logps/rejected": -686.7999877929688, + "loss": 1.7326, + "rewards/accuracies": 0.671875, + "rewards/chosen": -47.712501525878906, + "rewards/margins": 2.428906202316284, + "rewards/rejected": -50.14374923706055, + "step": 710 + }, + { + "epoch": 0.2304, + "grad_norm": 45.54945982579894, + "learning_rate": 1.53984e-05, + "logits/chosen": 0.1751358062028885, + "logits/rejected": 0.37080687284469604, + "logps/chosen": -681.7999877929688, + "logps/rejected": -666.2999877929688, + "loss": 1.4359, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -44.95624923706055, + "rewards/margins": 3.3218750953674316, + "rewards/rejected": -48.28125, + "step": 720 + }, + { + "epoch": 0.2336, + "grad_norm": 66.48742759140517, + "learning_rate": 1.53344e-05, + "logits/chosen": 0.18069687485694885, + "logits/rejected": 0.3331451416015625, + "logps/chosen": -677.0, + "logps/rejected": -663.7000122070312, + "loss": 2.234, + "rewards/accuracies": 0.671875, + "rewards/chosen": -42.4375, + "rewards/margins": 3.743359327316284, + "rewards/rejected": -46.17499923706055, + "step": 730 + }, + { + "epoch": 0.2368, + "grad_norm": 37.522327209701274, + "learning_rate": 1.5270400000000002e-05, + "logits/chosen": 0.10627365112304688, + "logits/rejected": 0.275918573141098, + "logps/chosen": -695.7999877929688, + "logps/rejected": -710.0, + "loss": 1.3593, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -45.818748474121094, + "rewards/margins": 4.501172065734863, + "rewards/rejected": -50.32500076293945, + "step": 740 + }, + { + "epoch": 0.24, + "grad_norm": 38.214598626293, + "learning_rate": 1.52064e-05, + "logits/chosen": -0.07285461574792862, + "logits/rejected": 0.07509155571460724, + "logps/chosen": -668.2999877929688, + "logps/rejected": -642.2999877929688, + "loss": 1.9895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -41.921875, + "rewards/margins": 3.208203077316284, + "rewards/rejected": -45.131248474121094, + "step": 750 + }, + { + "epoch": 0.2432, + "grad_norm": 18.45868948841473, + "learning_rate": 1.5142400000000001e-05, + "logits/chosen": -0.21304932236671448, + "logits/rejected": -0.12935790419578552, + "logps/chosen": -611.7999877929688, + "logps/rejected": -620.1500244140625, + "loss": 1.1468, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -37.89374923706055, + "rewards/margins": 4.466406345367432, + "rewards/rejected": -42.38750076293945, + "step": 760 + }, + { + "epoch": 0.2464, + "grad_norm": 50.50110374912833, + "learning_rate": 1.5078400000000001e-05, + "logits/chosen": NaN, + "logits/rejected": -0.126597598195076, + "logps/chosen": -665.5499877929688, + "logps/rejected": -741.0999755859375, + "loss": 1.6933, + "rewards/accuracies": 0.71875, + "rewards/chosen": -40.900001525878906, + "rewards/margins": 4.021874904632568, + "rewards/rejected": -44.92499923706055, + "step": 770 + }, + { + "epoch": 0.2496, + "grad_norm": 36.323914322167525, + "learning_rate": 1.5014400000000001e-05, + "logits/chosen": -0.2960983216762543, + "logits/rejected": -0.10639800876379013, + "logps/chosen": -677.0, + "logps/rejected": -660.6500244140625, + "loss": 1.3649, + "rewards/accuracies": 0.6875, + "rewards/chosen": -42.09375, + "rewards/margins": 4.171093940734863, + "rewards/rejected": -46.23749923706055, + "step": 780 + }, + { + "epoch": 0.2528, + "grad_norm": 40.98554240207498, + "learning_rate": 1.49504e-05, + "logits/chosen": -0.396575927734375, + "logits/rejected": -0.2843994200229645, + "logps/chosen": -634.3499755859375, + "logps/rejected": -630.9000244140625, + "loss": 1.2482, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -39.26874923706055, + "rewards/margins": 4.376172065734863, + "rewards/rejected": -43.662498474121094, + "step": 790 + }, + { + "epoch": 0.256, + "grad_norm": 68.44628531453232, + "learning_rate": 1.48864e-05, + "logits/chosen": -0.35917967557907104, + "logits/rejected": -0.19907227158546448, + "logps/chosen": -623.25, + "logps/rejected": -606.4500122070312, + "loss": 1.6582, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -38.868751525878906, + "rewards/margins": 2.953125, + "rewards/rejected": -41.837501525878906, + "step": 800 + }, + { + "epoch": 0.2592, + "grad_norm": 29.704417097607617, + "learning_rate": 1.48224e-05, + "logits/chosen": -0.3877929747104645, + "logits/rejected": -0.211151123046875, + "logps/chosen": -661.5999755859375, + "logps/rejected": -656.0999755859375, + "loss": 1.1269, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -42.39374923706055, + "rewards/margins": 4.458984375, + "rewards/rejected": -46.849998474121094, + "step": 810 + }, + { + "epoch": 0.2624, + "grad_norm": 48.15003119838898, + "learning_rate": 1.4758400000000001e-05, + "logits/chosen": -0.45916748046875, + "logits/rejected": -0.3183044493198395, + "logps/chosen": -699.2000122070312, + "logps/rejected": -697.9000244140625, + "loss": 1.8473, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -45.64374923706055, + "rewards/margins": 3.991406202316284, + "rewards/rejected": -49.63750076293945, + "step": 820 + }, + { + "epoch": 0.2656, + "grad_norm": 44.613017899690675, + "learning_rate": 1.4694400000000003e-05, + "logits/chosen": -0.3657287657260895, + "logits/rejected": NaN, + "logps/chosen": -659.8499755859375, + "logps/rejected": -612.7000122070312, + "loss": 5.7769, + "rewards/accuracies": 0.671875, + "rewards/chosen": -43.118751525878906, + "rewards/margins": -1.235742211341858, + "rewards/rejected": -41.900001525878906, + "step": 830 + }, + { + "epoch": 0.2688, + "grad_norm": 43.75173055134459, + "learning_rate": 1.46304e-05, + "logits/chosen": -0.40928345918655396, + "logits/rejected": -0.3201583921909332, + "logps/chosen": -616.7999877929688, + "logps/rejected": -606.5499877929688, + "loss": 1.6114, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -38.165626525878906, + "rewards/margins": 3.885546922683716, + "rewards/rejected": -42.056251525878906, + "step": 840 + }, + { + "epoch": 0.272, + "grad_norm": 18.530340431083207, + "learning_rate": 1.45664e-05, + "logits/chosen": -0.35566407442092896, + "logits/rejected": -0.286752313375473, + "logps/chosen": -672.0999755859375, + "logps/rejected": -664.2999877929688, + "loss": 1.4289, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -44.837501525878906, + "rewards/margins": 3.283203125, + "rewards/rejected": -48.125, + "step": 850 + }, + { + "epoch": 0.2752, + "grad_norm": 40.76302621743691, + "learning_rate": 1.45024e-05, + "logits/chosen": -0.364227294921875, + "logits/rejected": -0.25963133573532104, + "logps/chosen": -687.2000122070312, + "logps/rejected": -690.2000122070312, + "loss": 1.6557, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -45.20624923706055, + "rewards/margins": 4.821875095367432, + "rewards/rejected": -50.025001525878906, + "step": 860 + }, + { + "epoch": 0.2784, + "grad_norm": 40.8932489788095, + "learning_rate": 1.44384e-05, + "logits/chosen": -0.4478515684604645, + "logits/rejected": NaN, + "logps/chosen": -651.0999755859375, + "logps/rejected": -633.0499877929688, + "loss": 1.5459, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -40.78125, + "rewards/margins": 4.033593654632568, + "rewards/rejected": -44.82500076293945, + "step": 870 + }, + { + "epoch": 0.2816, + "grad_norm": 49.01984548176554, + "learning_rate": 1.4374400000000003e-05, + "logits/chosen": NaN, + "logits/rejected": -0.3775878846645355, + "logps/chosen": -659.9500122070312, + "logps/rejected": -774.4000244140625, + "loss": 1.5128, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -41.37812423706055, + "rewards/margins": 6.240234375, + "rewards/rejected": -47.618751525878906, + "step": 880 + }, + { + "epoch": 0.2848, + "grad_norm": 29.90375315285076, + "learning_rate": 1.4310400000000003e-05, + "logits/chosen": -0.4033203125, + "logits/rejected": -0.25498658418655396, + "logps/chosen": -656.0999755859375, + "logps/rejected": -643.2000122070312, + "loss": 1.2458, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -42.64374923706055, + "rewards/margins": 3.559765577316284, + "rewards/rejected": -46.181251525878906, + "step": 890 + }, + { + "epoch": 0.288, + "grad_norm": 123.75390388742322, + "learning_rate": 1.42464e-05, + "logits/chosen": -0.3350830078125, + "logits/rejected": -0.18463440239429474, + "logps/chosen": -632.7999877929688, + "logps/rejected": -634.0999755859375, + "loss": 1.4703, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -38.181251525878906, + "rewards/margins": 6.01953125, + "rewards/rejected": -44.20000076293945, + "step": 900 + }, + { + "epoch": 0.2912, + "grad_norm": 35.26785703173216, + "learning_rate": 1.41824e-05, + "logits/chosen": 0.01738281175494194, + "logits/rejected": 0.12006988376379013, + "logps/chosen": -690.0, + "logps/rejected": -685.5999755859375, + "loss": 1.1489, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -45.01874923706055, + "rewards/margins": 5.295507907867432, + "rewards/rejected": -50.32500076293945, + "step": 910 + }, + { + "epoch": 0.2944, + "grad_norm": 31.809750540628983, + "learning_rate": 1.41184e-05, + "logits/chosen": -0.13148804008960724, + "logits/rejected": 0.0057846070267260075, + "logps/chosen": -693.7999877929688, + "logps/rejected": -683.7999877929688, + "loss": 1.2265, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -45.66875076293945, + "rewards/margins": 4.670312404632568, + "rewards/rejected": -50.3125, + "step": 920 + }, + { + "epoch": 0.2976, + "grad_norm": 32.30726067405822, + "learning_rate": 1.4054400000000002e-05, + "logits/chosen": -0.40455323457717896, + "logits/rejected": -0.16559448838233948, + "logps/chosen": -719.4000244140625, + "logps/rejected": -727.0, + "loss": 1.1941, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -46.73749923706055, + "rewards/margins": 6.235937595367432, + "rewards/rejected": -52.95624923706055, + "step": 930 + }, + { + "epoch": 0.3008, + "grad_norm": 24.646705893567123, + "learning_rate": 1.3990400000000002e-05, + "logits/chosen": -0.57275390625, + "logits/rejected": -0.449179083108902, + "logps/chosen": -616.8499755859375, + "logps/rejected": -632.7000122070312, + "loss": 1.3855, + "rewards/accuracies": 0.765625, + "rewards/chosen": -37.131248474121094, + "rewards/margins": 4.955859184265137, + "rewards/rejected": -42.099998474121094, + "step": 940 + }, + { + "epoch": 0.304, + "grad_norm": 49.29450819010971, + "learning_rate": 1.3926400000000003e-05, + "logits/chosen": -0.4102416932582855, + "logits/rejected": -0.3392578065395355, + "logps/chosen": -576.9500122070312, + "logps/rejected": -565.4500122070312, + "loss": 1.6633, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -35.00312423706055, + "rewards/margins": 3.369140625, + "rewards/rejected": -38.39374923706055, + "step": 950 + }, + { + "epoch": 0.3072, + "grad_norm": 49.780482808788896, + "learning_rate": 1.38624e-05, + "logits/chosen": -0.36146241426467896, + "logits/rejected": -0.23393554985523224, + "logps/chosen": -617.0999755859375, + "logps/rejected": -634.5, + "loss": 1.4172, + "rewards/accuracies": 0.703125, + "rewards/chosen": -36.743751525878906, + "rewards/margins": 7.173047065734863, + "rewards/rejected": -43.931251525878906, + "step": 960 + }, + { + "epoch": 0.3104, + "grad_norm": 17.189839906819454, + "learning_rate": 1.37984e-05, + "logits/chosen": -0.32120054960250854, + "logits/rejected": -0.22429199516773224, + "logps/chosen": -681.4000244140625, + "logps/rejected": -693.0, + "loss": 1.3687, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -42.79999923706055, + "rewards/margins": 5.156640529632568, + "rewards/rejected": -47.96875, + "step": 970 + }, + { + "epoch": 0.3136, + "grad_norm": 12.304934608565206, + "learning_rate": 1.3734400000000002e-05, + "logits/chosen": -0.3793701231479645, + "logits/rejected": -0.2492828369140625, + "logps/chosen": -693.0, + "logps/rejected": -685.7999877929688, + "loss": 1.319, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -44.01874923706055, + "rewards/margins": 5.4609375, + "rewards/rejected": -49.48749923706055, + "step": 980 + }, + { + "epoch": 0.3168, + "grad_norm": 68.73462530803205, + "learning_rate": 1.3670400000000002e-05, + "logits/chosen": -0.39849853515625, + "logits/rejected": -0.24716797471046448, + "logps/chosen": -638.25, + "logps/rejected": -639.8499755859375, + "loss": 1.6008, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -40.45624923706055, + "rewards/margins": 3.61328125, + "rewards/rejected": -44.04375076293945, + "step": 990 + }, + { + "epoch": 0.32, + "grad_norm": 38.49405717094058, + "learning_rate": 1.3606400000000002e-05, + "logits/chosen": -0.3262786865234375, + "logits/rejected": -0.2204742431640625, + "logps/chosen": -671.7999877929688, + "logps/rejected": -656.5999755859375, + "loss": 1.5795, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -41.775001525878906, + "rewards/margins": 4.087109565734863, + "rewards/rejected": -45.849998474121094, + "step": 1000 + }, + { + "epoch": 0.3232, + "grad_norm": 31.47359965899451, + "learning_rate": 1.3542400000000003e-05, + "logits/chosen": -0.33366698026657104, + "logits/rejected": -0.143086239695549, + "logps/chosen": -687.0999755859375, + "logps/rejected": -677.2999877929688, + "loss": 1.5533, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -43.900001525878906, + "rewards/margins": 4.942968845367432, + "rewards/rejected": -48.849998474121094, + "step": 1010 + }, + { + "epoch": 0.3264, + "grad_norm": 25.202888613549842, + "learning_rate": 1.34784e-05, + "logits/chosen": -0.1284584105014801, + "logits/rejected": 0.03115081787109375, + "logps/chosen": -716.2000122070312, + "logps/rejected": -697.5999755859375, + "loss": 1.4172, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -48.306251525878906, + "rewards/margins": 3.723828077316284, + "rewards/rejected": -52.025001525878906, + "step": 1020 + }, + { + "epoch": 0.3296, + "grad_norm": 27.21888910223227, + "learning_rate": 1.3414400000000002e-05, + "logits/chosen": -0.410501092672348, + "logits/rejected": -0.23717650771141052, + "logps/chosen": -670.2000122070312, + "logps/rejected": -680.7000122070312, + "loss": 1.2433, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -43.03125, + "rewards/margins": 5.464453220367432, + "rewards/rejected": -48.474998474121094, + "step": 1030 + }, + { + "epoch": 0.3328, + "grad_norm": 47.60066919480353, + "learning_rate": 1.3350400000000002e-05, + "logits/chosen": -0.43865966796875, + "logits/rejected": -0.2837890684604645, + "logps/chosen": -679.4000244140625, + "logps/rejected": -667.0999755859375, + "loss": 1.3288, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -43.03125, + "rewards/margins": 4.525000095367432, + "rewards/rejected": -47.568748474121094, + "step": 1040 + }, + { + "epoch": 0.336, + "grad_norm": 30.202395773250057, + "learning_rate": 1.3286400000000002e-05, + "logits/chosen": -0.4092346131801605, + "logits/rejected": NaN, + "logps/chosen": -720.9000244140625, + "logps/rejected": -697.2999877929688, + "loss": 1.9026, + "rewards/accuracies": 0.65625, + "rewards/chosen": -49.01250076293945, + "rewards/margins": 2.942578077316284, + "rewards/rejected": -51.962501525878906, + "step": 1050 + }, + { + "epoch": 0.3392, + "grad_norm": 42.034420148084266, + "learning_rate": 1.3222400000000002e-05, + "logits/chosen": -0.6834716796875, + "logits/rejected": -0.5794311761856079, + "logps/chosen": -678.4000244140625, + "logps/rejected": -675.5999755859375, + "loss": 1.43, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -42.556251525878906, + "rewards/margins": 5.164453029632568, + "rewards/rejected": -47.724998474121094, + "step": 1060 + }, + { + "epoch": 0.3424, + "grad_norm": 49.03860430443391, + "learning_rate": 1.31584e-05, + "logits/chosen": -0.6819823980331421, + "logits/rejected": -0.547760009765625, + "logps/chosen": -664.2999877929688, + "logps/rejected": -675.0999755859375, + "loss": 1.364, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -43.59375, + "rewards/margins": 4.734375, + "rewards/rejected": -48.34375, + "step": 1070 + }, + { + "epoch": 0.3456, + "grad_norm": 31.483761229112318, + "learning_rate": 1.3094400000000001e-05, + "logits/chosen": -0.5381530523300171, + "logits/rejected": -0.42399901151657104, + "logps/chosen": -707.7000122070312, + "logps/rejected": -691.7999877929688, + "loss": 1.6441, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -47.412498474121094, + "rewards/margins": 2.9984374046325684, + "rewards/rejected": -50.41875076293945, + "step": 1080 + }, + { + "epoch": 0.3488, + "grad_norm": 34.48968739279645, + "learning_rate": 1.3030400000000001e-05, + "logits/chosen": -0.594866931438446, + "logits/rejected": -0.39453125, + "logps/chosen": -680.7000122070312, + "logps/rejected": -663.2000122070312, + "loss": 1.6324, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -42.625, + "rewards/margins": 5.530468940734863, + "rewards/rejected": -48.125, + "step": 1090 + }, + { + "epoch": 0.352, + "grad_norm": 43.9423065836892, + "learning_rate": 1.2966400000000002e-05, + "logits/chosen": -0.68505859375, + "logits/rejected": -0.531848132610321, + "logps/chosen": -629.75, + "logps/rejected": -628.9000244140625, + "loss": 1.316, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -38.95000076293945, + "rewards/margins": 4.518359184265137, + "rewards/rejected": -43.46875, + "step": 1100 + }, + { + "epoch": 0.3552, + "grad_norm": 37.14781795255988, + "learning_rate": 1.2902400000000002e-05, + "logits/chosen": -0.70550537109375, + "logits/rejected": -0.4788818359375, + "logps/chosen": -666.0, + "logps/rejected": -652.2000122070312, + "loss": 1.3027, + "rewards/accuracies": 0.71875, + "rewards/chosen": -40.54375076293945, + "rewards/margins": 4.414453029632568, + "rewards/rejected": -44.95624923706055, + "step": 1110 + }, + { + "epoch": 0.3584, + "grad_norm": 36.64866248404492, + "learning_rate": 1.2838400000000002e-05, + "logits/chosen": -0.65966796875, + "logits/rejected": -0.5438476800918579, + "logps/chosen": -680.5999755859375, + "logps/rejected": -677.5999755859375, + "loss": 1.5804, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -45.4375, + "rewards/margins": 4.028124809265137, + "rewards/rejected": -49.45624923706055, + "step": 1120 + }, + { + "epoch": 0.3616, + "grad_norm": 24.444790043115425, + "learning_rate": 1.2774400000000001e-05, + "logits/chosen": -0.607128918170929, + "logits/rejected": -0.4249511659145355, + "logps/chosen": -675.2999877929688, + "logps/rejected": -684.2000122070312, + "loss": 1.198, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -45.17499923706055, + "rewards/margins": 4.401171684265137, + "rewards/rejected": -49.57500076293945, + "step": 1130 + }, + { + "epoch": 0.3648, + "grad_norm": 40.983608565411764, + "learning_rate": 1.2710400000000001e-05, + "logits/chosen": -0.59814453125, + "logits/rejected": -0.4374023377895355, + "logps/chosen": -652.0999755859375, + "logps/rejected": -670.5, + "loss": 2.0744, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -41.962501525878906, + "rewards/margins": 6.257031440734863, + "rewards/rejected": -48.243751525878906, + "step": 1140 + }, + { + "epoch": 0.368, + "grad_norm": 39.26306466929758, + "learning_rate": 1.2646400000000001e-05, + "logits/chosen": -0.580737292766571, + "logits/rejected": -0.4674316346645355, + "logps/chosen": -674.4000244140625, + "logps/rejected": -734.4000244140625, + "loss": 1.113, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -42.375, + "rewards/margins": 9.228124618530273, + "rewards/rejected": -51.618751525878906, + "step": 1150 + }, + { + "epoch": 0.3712, + "grad_norm": 31.72807758004206, + "learning_rate": 1.2582400000000002e-05, + "logits/chosen": -0.6524902582168579, + "logits/rejected": -0.521557629108429, + "logps/chosen": -709.2999877929688, + "logps/rejected": -695.0, + "loss": 2.8163, + "rewards/accuracies": 0.75, + "rewards/chosen": -45.70000076293945, + "rewards/margins": 3.572265625, + "rewards/rejected": -49.243751525878906, + "step": 1160 + }, + { + "epoch": 0.3744, + "grad_norm": 26.864773010834657, + "learning_rate": 1.2518400000000002e-05, + "logits/chosen": -0.5064331293106079, + "logits/rejected": -0.31370848417282104, + "logps/chosen": -675.7000122070312, + "logps/rejected": -677.9000244140625, + "loss": 1.3189, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -44.45000076293945, + "rewards/margins": 3.821484327316284, + "rewards/rejected": -48.26874923706055, + "step": 1170 + }, + { + "epoch": 0.3776, + "grad_norm": 18.536367675480182, + "learning_rate": 1.2454400000000002e-05, + "logits/chosen": NaN, + "logits/rejected": -0.3017578125, + "logps/chosen": -693.0999755859375, + "logps/rejected": -689.5, + "loss": 1.1542, + "rewards/accuracies": 0.734375, + "rewards/chosen": -44.53125, + "rewards/margins": 4.386328220367432, + "rewards/rejected": -48.91875076293945, + "step": 1180 + }, + { + "epoch": 0.3808, + "grad_norm": 83.85775887295496, + "learning_rate": 1.23904e-05, + "logits/chosen": -0.5306640863418579, + "logits/rejected": -0.3149658143520355, + "logps/chosen": -723.5999755859375, + "logps/rejected": -728.9000244140625, + "loss": 1.6014, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -47.837501525878906, + "rewards/margins": 4.866015434265137, + "rewards/rejected": -52.6875, + "step": 1190 + }, + { + "epoch": 0.384, + "grad_norm": 48.56346315080496, + "learning_rate": 1.2326400000000001e-05, + "logits/chosen": -0.591595470905304, + "logits/rejected": -0.4169677793979645, + "logps/chosen": -722.0999755859375, + "logps/rejected": -720.4000244140625, + "loss": 1.6855, + "rewards/accuracies": 0.703125, + "rewards/chosen": -47.493751525878906, + "rewards/margins": 4.980078220367432, + "rewards/rejected": -52.474998474121094, + "step": 1200 + }, + { + "epoch": 0.3872, + "grad_norm": 27.08253908214466, + "learning_rate": 1.2262400000000001e-05, + "logits/chosen": -0.5696045160293579, + "logits/rejected": -0.35125428438186646, + "logps/chosen": -696.5, + "logps/rejected": -704.0, + "loss": 1.266, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -46.98749923706055, + "rewards/margins": 6.761328220367432, + "rewards/rejected": -53.743751525878906, + "step": 1210 + }, + { + "epoch": 0.3904, + "grad_norm": 45.58573367769971, + "learning_rate": 1.2198400000000002e-05, + "logits/chosen": -0.507214367389679, + "logits/rejected": -0.370339959859848, + "logps/chosen": -718.9000244140625, + "logps/rejected": -711.5999755859375, + "loss": 1.8102, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -48.275001525878906, + "rewards/margins": 3.2464842796325684, + "rewards/rejected": -51.53125, + "step": 1220 + }, + { + "epoch": 0.3936, + "grad_norm": 30.442508287653766, + "learning_rate": 1.2134400000000002e-05, + "logits/chosen": -0.6756652593612671, + "logits/rejected": -0.4883056581020355, + "logps/chosen": -722.7999877929688, + "logps/rejected": -696.5, + "loss": 1.3741, + "rewards/accuracies": 0.71875, + "rewards/chosen": -46.625, + "rewards/margins": 4.519921779632568, + "rewards/rejected": -51.14374923706055, + "step": 1230 + }, + { + "epoch": 0.3968, + "grad_norm": 39.62281764482628, + "learning_rate": 1.20704e-05, + "logits/chosen": -0.647021472454071, + "logits/rejected": -0.352630615234375, + "logps/chosen": -695.2000122070312, + "logps/rejected": -701.9000244140625, + "loss": 1.3634, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -45.76250076293945, + "rewards/margins": 5.391797065734863, + "rewards/rejected": -51.1875, + "step": 1240 + }, + { + "epoch": 0.4, + "grad_norm": 56.94249115539426, + "learning_rate": 1.20064e-05, + "logits/chosen": -0.626617431640625, + "logits/rejected": -0.523999035358429, + "logps/chosen": -726.2000122070312, + "logps/rejected": -730.0999755859375, + "loss": 0.9063, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -49.056251525878906, + "rewards/margins": 6.05859375, + "rewards/rejected": -55.13750076293945, + "step": 1250 + }, + { + "epoch": 0.4032, + "grad_norm": 25.301760469749322, + "learning_rate": 1.1942400000000001e-05, + "logits/chosen": -0.647631824016571, + "logits/rejected": -0.5055999755859375, + "logps/chosen": -728.5999755859375, + "logps/rejected": -726.0999755859375, + "loss": 1.4879, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -47.849998474121094, + "rewards/margins": 4.621874809265137, + "rewards/rejected": -52.45000076293945, + "step": 1260 + }, + { + "epoch": 0.4064, + "grad_norm": 45.78075012847253, + "learning_rate": 1.1878400000000001e-05, + "logits/chosen": -0.675854504108429, + "logits/rejected": -0.602569580078125, + "logps/chosen": -671.5, + "logps/rejected": -683.2999877929688, + "loss": 1.4389, + "rewards/accuracies": 0.75, + "rewards/chosen": -42.131248474121094, + "rewards/margins": 4.62890625, + "rewards/rejected": -46.756248474121094, + "step": 1270 + }, + { + "epoch": 0.4096, + "grad_norm": 49.430335961402825, + "learning_rate": 1.1814400000000002e-05, + "logits/chosen": -0.633227527141571, + "logits/rejected": -0.503387451171875, + "logps/chosen": -601.0499877929688, + "logps/rejected": -609.9000244140625, + "loss": 1.5427, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -37.38750076293945, + "rewards/margins": 3.7523436546325684, + "rewards/rejected": -41.131248474121094, + "step": 1280 + }, + { + "epoch": 0.4128, + "grad_norm": 19.900739223194954, + "learning_rate": 1.1750400000000002e-05, + "logits/chosen": -0.8213866949081421, + "logits/rejected": -0.7073730230331421, + "logps/chosen": -611.2999877929688, + "logps/rejected": -596.75, + "loss": 1.0477, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -36.02812576293945, + "rewards/margins": 4.68359375, + "rewards/rejected": -40.70624923706055, + "step": 1290 + }, + { + "epoch": 0.416, + "grad_norm": 38.51755593166137, + "learning_rate": 1.16864e-05, + "logits/chosen": -0.8940185308456421, + "logits/rejected": -0.821337878704071, + "logps/chosen": -654.5999755859375, + "logps/rejected": -653.7999877929688, + "loss": 1.4967, + "rewards/accuracies": 0.703125, + "rewards/chosen": -41.04999923706055, + "rewards/margins": 4.975390434265137, + "rewards/rejected": -46.01250076293945, + "step": 1300 + }, + { + "epoch": 0.4192, + "grad_norm": 32.14775699366813, + "learning_rate": 1.16224e-05, + "logits/chosen": -1.0138671398162842, + "logits/rejected": -0.8365234136581421, + "logps/chosen": -665.7999877929688, + "logps/rejected": -644.2000122070312, + "loss": 1.2318, + "rewards/accuracies": 0.75, + "rewards/chosen": -42.25, + "rewards/margins": 4.791406154632568, + "rewards/rejected": -47.04375076293945, + "step": 1310 + }, + { + "epoch": 0.4224, + "grad_norm": 33.48748552900965, + "learning_rate": 1.1558400000000001e-05, + "logits/chosen": -0.98095703125, + "logits/rejected": -0.8668457269668579, + "logps/chosen": -671.0, + "logps/rejected": -686.5999755859375, + "loss": 2.4339, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -44.556251525878906, + "rewards/margins": 4.135546684265137, + "rewards/rejected": -48.6875, + "step": 1320 + }, + { + "epoch": 0.4256, + "grad_norm": 35.72606394744342, + "learning_rate": 1.1494400000000001e-05, + "logits/chosen": -0.97509765625, + "logits/rejected": -0.821728527545929, + "logps/chosen": -744.7000122070312, + "logps/rejected": -736.7999877929688, + "loss": 1.3735, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -48.375, + "rewards/margins": 4.604687690734863, + "rewards/rejected": -52.96875, + "step": 1330 + }, + { + "epoch": 0.4288, + "grad_norm": 22.738204624159227, + "learning_rate": 1.1430400000000002e-05, + "logits/chosen": -0.850903332233429, + "logits/rejected": -0.683728039264679, + "logps/chosen": -676.2999877929688, + "logps/rejected": -696.7999877929688, + "loss": 1.0104, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -45.70000076293945, + "rewards/margins": 5.3828125, + "rewards/rejected": -51.11249923706055, + "step": 1340 + }, + { + "epoch": 0.432, + "grad_norm": 35.62747483551517, + "learning_rate": 1.1366400000000002e-05, + "logits/chosen": -0.902783215045929, + "logits/rejected": -0.77490234375, + "logps/chosen": -697.2000122070312, + "logps/rejected": -686.5, + "loss": 1.1435, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -45.962501525878906, + "rewards/margins": 4.558203220367432, + "rewards/rejected": -50.53125, + "step": 1350 + }, + { + "epoch": 0.4352, + "grad_norm": 45.562269740805384, + "learning_rate": 1.13024e-05, + "logits/chosen": -0.945019543170929, + "logits/rejected": -0.7890990972518921, + "logps/chosen": -708.9000244140625, + "logps/rejected": -706.2999877929688, + "loss": 1.326, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -45.25, + "rewards/margins": 4.703906059265137, + "rewards/rejected": -49.9375, + "step": 1360 + }, + { + "epoch": 0.4384, + "grad_norm": 19.992416969929188, + "learning_rate": 1.12384e-05, + "logits/chosen": -0.98681640625, + "logits/rejected": -0.740966796875, + "logps/chosen": -705.4000244140625, + "logps/rejected": -697.0, + "loss": 1.3905, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -46.256248474121094, + "rewards/margins": 4.323828220367432, + "rewards/rejected": -50.5625, + "step": 1370 + }, + { + "epoch": 0.4416, + "grad_norm": 44.01446576997527, + "learning_rate": 1.1174400000000001e-05, + "logits/chosen": -0.940478503704071, + "logits/rejected": -0.746734619140625, + "logps/chosen": -696.0, + "logps/rejected": -676.2999877929688, + "loss": 1.1882, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -44.537498474121094, + "rewards/margins": 4.753515720367432, + "rewards/rejected": -49.26874923706055, + "step": 1380 + }, + { + "epoch": 0.4448, + "grad_norm": 19.85792464714508, + "learning_rate": 1.1110400000000001e-05, + "logits/chosen": -0.865795910358429, + "logits/rejected": -0.762744128704071, + "logps/chosen": -704.2999877929688, + "logps/rejected": -711.7000122070312, + "loss": 1.3501, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -46.712501525878906, + "rewards/margins": 4.548437595367432, + "rewards/rejected": -51.23749923706055, + "step": 1390 + }, + { + "epoch": 0.448, + "grad_norm": 32.97163771144295, + "learning_rate": 1.1046400000000002e-05, + "logits/chosen": NaN, + "logits/rejected": -0.7448364496231079, + "logps/chosen": -717.6500244140625, + "logps/rejected": -691.2000122070312, + "loss": 1.529, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -47.087501525878906, + "rewards/margins": 3.608593702316284, + "rewards/rejected": -50.693748474121094, + "step": 1400 + }, + { + "epoch": 0.4512, + "grad_norm": 56.63714193074343, + "learning_rate": 1.0982400000000002e-05, + "logits/chosen": -0.8114258050918579, + "logits/rejected": -0.6397033929824829, + "logps/chosen": -708.4000244140625, + "logps/rejected": -711.2000122070312, + "loss": 1.6799, + "rewards/accuracies": 0.65625, + "rewards/chosen": -46.568748474121094, + "rewards/margins": 3.831249952316284, + "rewards/rejected": -50.39374923706055, + "step": 1410 + }, + { + "epoch": 0.4544, + "grad_norm": 37.82419338196218, + "learning_rate": 1.09184e-05, + "logits/chosen": -0.803271472454071, + "logits/rejected": -0.6448974609375, + "logps/chosen": -694.0999755859375, + "logps/rejected": -694.5, + "loss": 1.6071, + "rewards/accuracies": 0.734375, + "rewards/chosen": -46.03125, + "rewards/margins": 4.0, + "rewards/rejected": -50.01874923706055, + "step": 1420 + }, + { + "epoch": 0.4576, + "grad_norm": 35.2165866750218, + "learning_rate": 1.08544e-05, + "logits/chosen": NaN, + "logits/rejected": -0.5844482183456421, + "logps/chosen": -739.9000244140625, + "logps/rejected": -716.2000122070312, + "loss": 1.6826, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -48.91875076293945, + "rewards/margins": 2.9632811546325684, + "rewards/rejected": -51.881248474121094, + "step": 1430 + }, + { + "epoch": 0.4608, + "grad_norm": 25.10545424336984, + "learning_rate": 1.0790400000000001e-05, + "logits/chosen": -0.7391601800918579, + "logits/rejected": -0.6878417730331421, + "logps/chosen": -710.5, + "logps/rejected": -716.2999877929688, + "loss": 1.1861, + "rewards/accuracies": 0.6875, + "rewards/chosen": -47.818748474121094, + "rewards/margins": 4.393359184265137, + "rewards/rejected": -52.224998474121094, + "step": 1440 + }, + { + "epoch": 0.464, + "grad_norm": 37.403530945832465, + "learning_rate": 1.0726400000000001e-05, + "logits/chosen": -0.74072265625, + "logits/rejected": -0.613391101360321, + "logps/chosen": -725.9000244140625, + "logps/rejected": -707.5999755859375, + "loss": 2.6482, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -49.087501525878906, + "rewards/margins": 2.563671827316284, + "rewards/rejected": -51.650001525878906, + "step": 1450 + }, + { + "epoch": 0.4672, + "grad_norm": 31.894584615960177, + "learning_rate": 1.0662400000000001e-05, + "logits/chosen": -0.694140613079071, + "logits/rejected": -0.55181884765625, + "logps/chosen": -699.5999755859375, + "logps/rejected": -700.9000244140625, + "loss": 1.2498, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -44.775001525878906, + "rewards/margins": 4.860937595367432, + "rewards/rejected": -49.650001525878906, + "step": 1460 + }, + { + "epoch": 0.4704, + "grad_norm": 31.77468983079844, + "learning_rate": 1.05984e-05, + "logits/chosen": -0.676074206829071, + "logits/rejected": -0.46323853731155396, + "logps/chosen": -677.2999877929688, + "logps/rejected": -690.7999877929688, + "loss": 1.0611, + "rewards/accuracies": 0.8031250238418579, + "rewards/chosen": -43.16875076293945, + "rewards/margins": 7.153906345367432, + "rewards/rejected": -50.34375, + "step": 1470 + }, + { + "epoch": 0.4736, + "grad_norm": 701.8720995315323, + "learning_rate": 1.05344e-05, + "logits/chosen": -0.679760754108429, + "logits/rejected": -0.5099121332168579, + "logps/chosen": -725.0, + "logps/rejected": -687.5999755859375, + "loss": 2.1299, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -46.631248474121094, + "rewards/margins": 4.241796970367432, + "rewards/rejected": -50.868751525878906, + "step": 1480 + }, + { + "epoch": 0.4768, + "grad_norm": 21.110634577150943, + "learning_rate": 1.04704e-05, + "logits/chosen": -0.6755126714706421, + "logits/rejected": -0.540844738483429, + "logps/chosen": -699.5, + "logps/rejected": -710.2000122070312, + "loss": 1.2919, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -46.58124923706055, + "rewards/margins": 4.320703029632568, + "rewards/rejected": -50.90625, + "step": 1490 + }, + { + "epoch": 0.48, + "grad_norm": 33.68928045290949, + "learning_rate": 1.0406400000000001e-05, + "logits/chosen": -0.641308605670929, + "logits/rejected": -0.47297364473342896, + "logps/chosen": -705.9000244140625, + "logps/rejected": -689.5999755859375, + "loss": 1.2651, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -46.63750076293945, + "rewards/margins": 3.5082030296325684, + "rewards/rejected": -50.125, + "step": 1500 + }, + { + "epoch": 0.4832, + "grad_norm": 41.78929183594789, + "learning_rate": 1.0342400000000001e-05, + "logits/chosen": -0.649487316608429, + "logits/rejected": -0.512768566608429, + "logps/chosen": -712.2000122070312, + "logps/rejected": -707.2000122070312, + "loss": 1.605, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -47.443748474121094, + "rewards/margins": 3.4710936546325684, + "rewards/rejected": -50.91875076293945, + "step": 1510 + }, + { + "epoch": 0.4864, + "grad_norm": 83.1103876604526, + "learning_rate": 1.0278400000000001e-05, + "logits/chosen": -0.7444823980331421, + "logits/rejected": -0.6318725347518921, + "logps/chosen": -759.7000122070312, + "logps/rejected": -734.0999755859375, + "loss": 1.469, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -48.53125, + "rewards/margins": 3.9937500953674316, + "rewards/rejected": -52.537498474121094, + "step": 1520 + }, + { + "epoch": 0.4896, + "grad_norm": 31.31743279282106, + "learning_rate": 1.02144e-05, + "logits/chosen": -0.8201904296875, + "logits/rejected": -0.7158203125, + "logps/chosen": -739.0999755859375, + "logps/rejected": -739.7000122070312, + "loss": 1.1614, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -47.525001525878906, + "rewards/margins": 6.057421684265137, + "rewards/rejected": -53.5625, + "step": 1530 + }, + { + "epoch": 0.4928, + "grad_norm": 30.082403707106717, + "learning_rate": 1.01504e-05, + "logits/chosen": -0.735424816608429, + "logits/rejected": -0.5428711175918579, + "logps/chosen": -743.2000122070312, + "logps/rejected": -739.5, + "loss": 1.2279, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -49.35625076293945, + "rewards/margins": 4.89453125, + "rewards/rejected": -54.275001525878906, + "step": 1540 + }, + { + "epoch": 0.496, + "grad_norm": 28.892914763782777, + "learning_rate": 1.00864e-05, + "logits/chosen": -0.809374988079071, + "logits/rejected": -0.6345459222793579, + "logps/chosen": -699.2999877929688, + "logps/rejected": -721.2000122070312, + "loss": 1.195, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -46.89374923706055, + "rewards/margins": 6.657422065734863, + "rewards/rejected": -53.5625, + "step": 1550 + }, + { + "epoch": 0.4992, + "grad_norm": 24.949136089979643, + "learning_rate": 1.00224e-05, + "logits/chosen": -0.7973388433456421, + "logits/rejected": NaN, + "logps/chosen": -712.2000122070312, + "logps/rejected": -697.4500122070312, + "loss": 1.6646, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -46.381248474121094, + "rewards/margins": 4.708203315734863, + "rewards/rejected": -51.09375, + "step": 1560 + }, + { + "epoch": 0.5024, + "grad_norm": 17.07305244434439, + "learning_rate": 9.958400000000001e-06, + "logits/chosen": -0.791088879108429, + "logits/rejected": -0.7275635004043579, + "logps/chosen": -642.4500122070312, + "logps/rejected": -676.0999755859375, + "loss": 1.3848, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -41.243751525878906, + "rewards/margins": 7.883984565734863, + "rewards/rejected": -49.150001525878906, + "step": 1570 + }, + { + "epoch": 0.5056, + "grad_norm": 27.387702681775544, + "learning_rate": 9.894400000000001e-06, + "logits/chosen": -0.751025378704071, + "logits/rejected": -0.6616455316543579, + "logps/chosen": -630.0, + "logps/rejected": -632.3499755859375, + "loss": 1.1648, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -40.01250076293945, + "rewards/margins": 4.319140434265137, + "rewards/rejected": -44.337501525878906, + "step": 1580 + }, + { + "epoch": 0.5088, + "grad_norm": 58.06569005556697, + "learning_rate": 9.830400000000002e-06, + "logits/chosen": -0.677294909954071, + "logits/rejected": -0.5551391839981079, + "logps/chosen": -656.9000244140625, + "logps/rejected": -646.9000244140625, + "loss": 1.5886, + "rewards/accuracies": 0.671875, + "rewards/chosen": -42.8125, + "rewards/margins": 3.055468797683716, + "rewards/rejected": -45.881248474121094, + "step": 1590 + }, + { + "epoch": 0.512, + "grad_norm": 32.15835121742251, + "learning_rate": 9.7664e-06, + "logits/chosen": -0.8250976800918579, + "logits/rejected": -0.684741199016571, + "logps/chosen": -662.2000122070312, + "logps/rejected": -653.7000122070312, + "loss": 1.3567, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -40.978126525878906, + "rewards/margins": 5.03515625, + "rewards/rejected": -46.037498474121094, + "step": 1600 + }, + { + "epoch": 0.5152, + "grad_norm": 39.1902416854394, + "learning_rate": 9.7024e-06, + "logits/chosen": -0.7381652593612671, + "logits/rejected": -0.53082275390625, + "logps/chosen": -680.4000244140625, + "logps/rejected": -673.2999877929688, + "loss": 1.6258, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -44.45624923706055, + "rewards/margins": 3.6875, + "rewards/rejected": -48.13750076293945, + "step": 1610 + }, + { + "epoch": 0.5184, + "grad_norm": 24.371787626734253, + "learning_rate": 9.6384e-06, + "logits/chosen": -0.79248046875, + "logits/rejected": -0.678546130657196, + "logps/chosen": -705.4000244140625, + "logps/rejected": -715.5, + "loss": 1.2993, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -46.82500076293945, + "rewards/margins": 4.224609375, + "rewards/rejected": -51.04999923706055, + "step": 1620 + }, + { + "epoch": 0.5216, + "grad_norm": 31.2556593358844, + "learning_rate": 9.574400000000001e-06, + "logits/chosen": -0.7989257574081421, + "logits/rejected": -0.6652587652206421, + "logps/chosen": -679.9000244140625, + "logps/rejected": -684.0999755859375, + "loss": 1.1808, + "rewards/accuracies": 0.734375, + "rewards/chosen": -45.837501525878906, + "rewards/margins": 4.585156440734863, + "rewards/rejected": -50.4375, + "step": 1630 + }, + { + "epoch": 0.5248, + "grad_norm": 27.55628635760918, + "learning_rate": 9.510400000000001e-06, + "logits/chosen": -0.862622082233429, + "logits/rejected": -0.7339111566543579, + "logps/chosen": -663.2999877929688, + "logps/rejected": -659.7000122070312, + "loss": 0.9807, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -43.256248474121094, + "rewards/margins": 4.79296875, + "rewards/rejected": -48.0625, + "step": 1640 + }, + { + "epoch": 0.528, + "grad_norm": 34.30396390187216, + "learning_rate": 9.446400000000002e-06, + "logits/chosen": -0.8206787109375, + "logits/rejected": -0.670666515827179, + "logps/chosen": -694.2000122070312, + "logps/rejected": -695.0999755859375, + "loss": 1.0455, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -45.9375, + "rewards/margins": 5.129687309265137, + "rewards/rejected": -51.068748474121094, + "step": 1650 + }, + { + "epoch": 0.5312, + "grad_norm": 34.776770970797735, + "learning_rate": 9.3824e-06, + "logits/chosen": -0.9286254644393921, + "logits/rejected": -0.726513683795929, + "logps/chosen": -685.0, + "logps/rejected": -689.0, + "loss": 1.0873, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -44.875, + "rewards/margins": 5.595703125, + "rewards/rejected": -50.493751525878906, + "step": 1660 + }, + { + "epoch": 0.5344, + "grad_norm": 27.00821250166549, + "learning_rate": 9.3184e-06, + "logits/chosen": -1.0827147960662842, + "logits/rejected": -0.8606933355331421, + "logps/chosen": -696.9000244140625, + "logps/rejected": -684.7999877929688, + "loss": 1.7363, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -44.849998474121094, + "rewards/margins": 4.677734375, + "rewards/rejected": -49.537498474121094, + "step": 1670 + }, + { + "epoch": 0.5376, + "grad_norm": 32.894164043095316, + "learning_rate": 9.2544e-06, + "logits/chosen": -0.972582995891571, + "logits/rejected": -0.8651367425918579, + "logps/chosen": -700.7999877929688, + "logps/rejected": -748.0, + "loss": 1.1076, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -47.79375076293945, + "rewards/margins": 8.16015625, + "rewards/rejected": -55.98125076293945, + "step": 1680 + }, + { + "epoch": 0.5408, + "grad_norm": 53.448156583233676, + "learning_rate": 9.190400000000001e-06, + "logits/chosen": -0.986499011516571, + "logits/rejected": -0.8663574457168579, + "logps/chosen": -692.5999755859375, + "logps/rejected": -689.7999877929688, + "loss": 0.9655, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -45.275001525878906, + "rewards/margins": 5.28125, + "rewards/rejected": -50.5625, + "step": 1690 + }, + { + "epoch": 0.544, + "grad_norm": 22.723538012527303, + "learning_rate": 9.126400000000001e-06, + "logits/chosen": -1.037988305091858, + "logits/rejected": -0.8658202886581421, + "logps/chosen": -707.5999755859375, + "logps/rejected": -698.9000244140625, + "loss": 0.9581, + "rewards/accuracies": 0.784375011920929, + "rewards/chosen": -45.962501525878906, + "rewards/margins": 5.108202934265137, + "rewards/rejected": -51.09375, + "step": 1700 + }, + { + "epoch": 0.5472, + "grad_norm": 29.930466159752008, + "learning_rate": 9.062400000000002e-06, + "logits/chosen": -0.9510253667831421, + "logits/rejected": -0.8387451171875, + "logps/chosen": -689.0, + "logps/rejected": -692.4000244140625, + "loss": 1.5855, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -46.48749923706055, + "rewards/margins": 4.531640529632568, + "rewards/rejected": -51.01250076293945, + "step": 1710 + }, + { + "epoch": 0.5504, + "grad_norm": 22.74526767141219, + "learning_rate": 8.9984e-06, + "logits/chosen": -0.881787121295929, + "logits/rejected": -0.7762206792831421, + "logps/chosen": -697.2999877929688, + "logps/rejected": -691.0999755859375, + "loss": 1.3577, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -46.42499923706055, + "rewards/margins": 3.6976561546325684, + "rewards/rejected": -50.118751525878906, + "step": 1720 + }, + { + "epoch": 0.5536, + "grad_norm": 43.291646269995006, + "learning_rate": 8.9344e-06, + "logits/chosen": -0.972363293170929, + "logits/rejected": -0.83868408203125, + "logps/chosen": -713.0999755859375, + "logps/rejected": -712.5999755859375, + "loss": 1.2145, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -45.98749923706055, + "rewards/margins": 4.237500190734863, + "rewards/rejected": -50.224998474121094, + "step": 1730 + }, + { + "epoch": 0.5568, + "grad_norm": 27.09778676867347, + "learning_rate": 8.8704e-06, + "logits/chosen": -1.0656249523162842, + "logits/rejected": -0.7837280035018921, + "logps/chosen": -697.2999877929688, + "logps/rejected": -682.5999755859375, + "loss": 1.2162, + "rewards/accuracies": 0.778124988079071, + "rewards/chosen": -45.0, + "rewards/margins": 5.428906440734863, + "rewards/rejected": -50.46875, + "step": 1740 + }, + { + "epoch": 0.56, + "grad_norm": 18.82366502469674, + "learning_rate": 8.806400000000001e-06, + "logits/chosen": -0.972851574420929, + "logits/rejected": -0.8165038824081421, + "logps/chosen": -699.9000244140625, + "logps/rejected": -718.5, + "loss": 0.9451, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -46.38750076293945, + "rewards/margins": 5.488671779632568, + "rewards/rejected": -51.875, + "step": 1750 + }, + { + "epoch": 0.5632, + "grad_norm": 37.03702244755633, + "learning_rate": 8.742400000000001e-06, + "logits/chosen": -0.986035168170929, + "logits/rejected": -0.8427978754043579, + "logps/chosen": -693.0, + "logps/rejected": -680.9000244140625, + "loss": 1.5865, + "rewards/accuracies": 0.734375, + "rewards/chosen": -44.91875076293945, + "rewards/margins": 4.442187309265137, + "rewards/rejected": -49.32500076293945, + "step": 1760 + }, + { + "epoch": 0.5664, + "grad_norm": 15.669813086288999, + "learning_rate": 8.6784e-06, + "logits/chosen": -0.8355957269668579, + "logits/rejected": -0.685546875, + "logps/chosen": -672.5999755859375, + "logps/rejected": -683.9000244140625, + "loss": 0.9858, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -44.875, + "rewards/margins": 5.412499904632568, + "rewards/rejected": -50.3125, + "step": 1770 + }, + { + "epoch": 0.5696, + "grad_norm": 18.185718699982754, + "learning_rate": 8.6144e-06, + "logits/chosen": -0.8070312738418579, + "logits/rejected": -0.677416980266571, + "logps/chosen": -729.2999877929688, + "logps/rejected": -713.0, + "loss": 1.114, + "rewards/accuracies": 0.71875, + "rewards/chosen": -47.04999923706055, + "rewards/margins": 4.353906154632568, + "rewards/rejected": -51.431251525878906, + "step": 1780 + }, + { + "epoch": 0.5728, + "grad_norm": 18.43743777637667, + "learning_rate": 8.5504e-06, + "logits/chosen": -0.96875, + "logits/rejected": -0.7574218511581421, + "logps/chosen": -726.0, + "logps/rejected": -709.7000122070312, + "loss": 1.0618, + "rewards/accuracies": 0.796875, + "rewards/chosen": -47.07500076293945, + "rewards/margins": 5.14453125, + "rewards/rejected": -52.212501525878906, + "step": 1790 + }, + { + "epoch": 0.576, + "grad_norm": 43.0770877472424, + "learning_rate": 8.4864e-06, + "logits/chosen": -0.9660278558731079, + "logits/rejected": -0.8299804925918579, + "logps/chosen": -660.5999755859375, + "logps/rejected": -684.7999877929688, + "loss": 1.3212, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -43.724998474121094, + "rewards/margins": 5.642187595367432, + "rewards/rejected": -49.349998474121094, + "step": 1800 + }, + { + "epoch": 0.5792, + "grad_norm": 35.13641837689736, + "learning_rate": 8.422400000000001e-06, + "logits/chosen": -0.9552246332168579, + "logits/rejected": -0.800488293170929, + "logps/chosen": -691.4000244140625, + "logps/rejected": -693.9000244140625, + "loss": 1.2366, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -46.11249923706055, + "rewards/margins": 4.806640625, + "rewards/rejected": -50.931251525878906, + "step": 1810 + }, + { + "epoch": 0.5824, + "grad_norm": 39.08294024192249, + "learning_rate": 8.358400000000001e-06, + "logits/chosen": -0.996899425983429, + "logits/rejected": -0.862011730670929, + "logps/chosen": -716.2000122070312, + "logps/rejected": -729.2999877929688, + "loss": 1.0015, + "rewards/accuracies": 0.71875, + "rewards/chosen": -48.400001525878906, + "rewards/margins": 4.721093654632568, + "rewards/rejected": -53.11249923706055, + "step": 1820 + }, + { + "epoch": 0.5856, + "grad_norm": 40.43387329794278, + "learning_rate": 8.2944e-06, + "logits/chosen": -1.0739257335662842, + "logits/rejected": -0.936962902545929, + "logps/chosen": -703.5999755859375, + "logps/rejected": -701.9000244140625, + "loss": 1.2172, + "rewards/accuracies": 0.71875, + "rewards/chosen": -46.375, + "rewards/margins": 4.6171875, + "rewards/rejected": -50.98125076293945, + "step": 1830 + }, + { + "epoch": 0.5888, + "grad_norm": 37.96585113209214, + "learning_rate": 8.2304e-06, + "logits/chosen": -1.178613305091858, + "logits/rejected": -1.0226562023162842, + "logps/chosen": -729.2999877929688, + "logps/rejected": -732.0, + "loss": 1.1452, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -46.431251525878906, + "rewards/margins": 6.264452934265137, + "rewards/rejected": -52.6875, + "step": 1840 + }, + { + "epoch": 0.592, + "grad_norm": 22.023179991872624, + "learning_rate": 8.1664e-06, + "logits/chosen": -1.1320312023162842, + "logits/rejected": -0.948486328125, + "logps/chosen": -710.7999877929688, + "logps/rejected": -693.9000244140625, + "loss": 1.1163, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -47.04999923706055, + "rewards/margins": 4.289453029632568, + "rewards/rejected": -51.34375, + "step": 1850 + }, + { + "epoch": 0.5952, + "grad_norm": 20.62670986549159, + "learning_rate": 8.1024e-06, + "logits/chosen": -1.138671875, + "logits/rejected": -0.9735351800918579, + "logps/chosen": -688.0, + "logps/rejected": -692.2000122070312, + "loss": 1.0507, + "rewards/accuracies": 0.71875, + "rewards/chosen": -44.5625, + "rewards/margins": 5.944140434265137, + "rewards/rejected": -50.493751525878906, + "step": 1860 + }, + { + "epoch": 0.5984, + "grad_norm": 16.339457625847704, + "learning_rate": 8.0384e-06, + "logits/chosen": -1.102148413658142, + "logits/rejected": -1.001074194908142, + "logps/chosen": -708.2000122070312, + "logps/rejected": -722.7999877929688, + "loss": 0.7581, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -47.71875, + "rewards/margins": 5.521874904632568, + "rewards/rejected": -53.23125076293945, + "step": 1870 + }, + { + "epoch": 0.6016, + "grad_norm": 35.66308810908752, + "learning_rate": 7.974400000000001e-06, + "logits/chosen": -1.0789062976837158, + "logits/rejected": -0.9618285894393921, + "logps/chosen": -738.0999755859375, + "logps/rejected": -732.0999755859375, + "loss": 0.9391, + "rewards/accuracies": 0.71875, + "rewards/chosen": -48.60625076293945, + "rewards/margins": 4.964062690734863, + "rewards/rejected": -53.587501525878906, + "step": 1880 + }, + { + "epoch": 0.6048, + "grad_norm": 32.66045303471343, + "learning_rate": 7.9104e-06, + "logits/chosen": -1.063330054283142, + "logits/rejected": NaN, + "logps/chosen": -699.0999755859375, + "logps/rejected": -702.0999755859375, + "loss": 1.513, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -47.41875076293945, + "rewards/margins": 3.7445311546325684, + "rewards/rejected": -51.17499923706055, + "step": 1890 + }, + { + "epoch": 0.608, + "grad_norm": 27.47143642743382, + "learning_rate": 7.8464e-06, + "logits/chosen": -1.07275390625, + "logits/rejected": -0.8569580316543579, + "logps/chosen": -699.0999755859375, + "logps/rejected": -701.4000244140625, + "loss": 0.9579, + "rewards/accuracies": 0.75, + "rewards/chosen": -46.39374923706055, + "rewards/margins": 5.1484375, + "rewards/rejected": -51.556251525878906, + "step": 1900 + }, + { + "epoch": 0.6112, + "grad_norm": 35.69727290418723, + "learning_rate": 7.7824e-06, + "logits/chosen": -1.019262671470642, + "logits/rejected": -0.923876941204071, + "logps/chosen": -720.0999755859375, + "logps/rejected": -712.2000122070312, + "loss": 1.8816, + "rewards/accuracies": 0.703125, + "rewards/chosen": -47.743751525878906, + "rewards/margins": 4.006249904632568, + "rewards/rejected": -51.743751525878906, + "step": 1910 + }, + { + "epoch": 0.6144, + "grad_norm": 25.35554574306627, + "learning_rate": 7.7184e-06, + "logits/chosen": -1.036523461341858, + "logits/rejected": -0.8519042730331421, + "logps/chosen": -729.4000244140625, + "logps/rejected": -720.2999877929688, + "loss": 1.039, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -48.36249923706055, + "rewards/margins": 4.984375, + "rewards/rejected": -53.337501525878906, + "step": 1920 + }, + { + "epoch": 0.6176, + "grad_norm": 33.902865352947586, + "learning_rate": 7.6544e-06, + "logits/chosen": -1.0681641101837158, + "logits/rejected": -0.8866211175918579, + "logps/chosen": -718.5999755859375, + "logps/rejected": -700.2000122070312, + "loss": 1.3977, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -47.41875076293945, + "rewards/margins": 3.130078077316284, + "rewards/rejected": -50.556251525878906, + "step": 1930 + }, + { + "epoch": 0.6208, + "grad_norm": 31.096553058517653, + "learning_rate": 7.590400000000001e-06, + "logits/chosen": -1.114648461341858, + "logits/rejected": -0.987255871295929, + "logps/chosen": -719.2999877929688, + "logps/rejected": -706.4000244140625, + "loss": 1.2642, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -45.35625076293945, + "rewards/margins": 5.084374904632568, + "rewards/rejected": -50.45624923706055, + "step": 1940 + }, + { + "epoch": 0.624, + "grad_norm": 54.87734980777987, + "learning_rate": 7.5264000000000005e-06, + "logits/chosen": -1.041601538658142, + "logits/rejected": -0.8419189453125, + "logps/chosen": -731.5, + "logps/rejected": -740.2000122070312, + "loss": 1.2067, + "rewards/accuracies": 0.75, + "rewards/chosen": -48.287498474121094, + "rewards/margins": 4.91015625, + "rewards/rejected": -53.212501525878906, + "step": 1950 + }, + { + "epoch": 0.6272, + "grad_norm": 34.47923251544078, + "learning_rate": 7.462400000000001e-06, + "logits/chosen": -0.9884277582168579, + "logits/rejected": NaN, + "logps/chosen": -733.7999877929688, + "logps/rejected": -728.4000244140625, + "loss": 1.4988, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -49.431251525878906, + "rewards/margins": 4.154687404632568, + "rewards/rejected": -53.587501525878906, + "step": 1960 + }, + { + "epoch": 0.6304, + "grad_norm": 36.36671803353493, + "learning_rate": 7.398400000000001e-06, + "logits/chosen": NaN, + "logits/rejected": -0.912402331829071, + "logps/chosen": -702.5999755859375, + "logps/rejected": -696.0, + "loss": 1.406, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -46.45000076293945, + "rewards/margins": 4.345703125, + "rewards/rejected": -50.806251525878906, + "step": 1970 + }, + { + "epoch": 0.6336, + "grad_norm": 29.91188477302202, + "learning_rate": 7.3344000000000005e-06, + "logits/chosen": -0.9787353277206421, + "logits/rejected": -0.8780517578125, + "logps/chosen": -713.5999755859375, + "logps/rejected": -724.2999877929688, + "loss": 1.0452, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -46.26874923706055, + "rewards/margins": 5.87109375, + "rewards/rejected": -52.150001525878906, + "step": 1980 + }, + { + "epoch": 0.6368, + "grad_norm": 26.78263480473984, + "learning_rate": 7.270400000000001e-06, + "logits/chosen": -1.061132788658142, + "logits/rejected": -0.928173840045929, + "logps/chosen": -694.4000244140625, + "logps/rejected": -688.5, + "loss": 1.1949, + "rewards/accuracies": 0.734375, + "rewards/chosen": -44.849998474121094, + "rewards/margins": 4.34375, + "rewards/rejected": -49.193748474121094, + "step": 1990 + }, + { + "epoch": 0.64, + "grad_norm": 30.836160521060737, + "learning_rate": 7.2064e-06, + "logits/chosen": -0.986987292766571, + "logits/rejected": -0.860644519329071, + "logps/chosen": -687.7999877929688, + "logps/rejected": -673.7000122070312, + "loss": 1.1639, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -45.775001525878906, + "rewards/margins": 3.8265624046325684, + "rewards/rejected": -49.618751525878906, + "step": 2000 + }, + { + "epoch": 0.6432, + "grad_norm": 16.21314671791893, + "learning_rate": 7.1424000000000004e-06, + "logits/chosen": -0.953857421875, + "logits/rejected": -0.8311767578125, + "logps/chosen": -702.9000244140625, + "logps/rejected": -704.5999755859375, + "loss": 1.189, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -45.849998474121094, + "rewards/margins": 4.592968940734863, + "rewards/rejected": -50.443748474121094, + "step": 2010 + }, + { + "epoch": 0.6464, + "grad_norm": 16.706238605453063, + "learning_rate": 7.078400000000001e-06, + "logits/chosen": -0.8988281488418579, + "logits/rejected": -0.631359875202179, + "logps/chosen": -717.5, + "logps/rejected": -692.5, + "loss": 0.8347, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -46.5625, + "rewards/margins": 4.493750095367432, + "rewards/rejected": -51.04999923706055, + "step": 2020 + }, + { + "epoch": 0.6496, + "grad_norm": 23.11390264308454, + "learning_rate": 7.0144e-06, + "logits/chosen": -0.981738269329071, + "logits/rejected": -0.803149402141571, + "logps/chosen": -696.7000122070312, + "logps/rejected": -699.0, + "loss": 0.9888, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -45.974998474121094, + "rewards/margins": 4.503125190734863, + "rewards/rejected": -50.48125076293945, + "step": 2030 + }, + { + "epoch": 0.6528, + "grad_norm": 25.551886307935956, + "learning_rate": 6.9504e-06, + "logits/chosen": -1.005957007408142, + "logits/rejected": -0.791259765625, + "logps/chosen": -718.7999877929688, + "logps/rejected": -714.0, + "loss": 1.0371, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -46.82500076293945, + "rewards/margins": 5.044531345367432, + "rewards/rejected": -51.86249923706055, + "step": 2040 + }, + { + "epoch": 0.656, + "grad_norm": 38.10513975607243, + "learning_rate": 6.886400000000001e-06, + "logits/chosen": -1.0060546398162842, + "logits/rejected": -0.7935760617256165, + "logps/chosen": -695.7999877929688, + "logps/rejected": -697.0, + "loss": 0.8056, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -46.381248474121094, + "rewards/margins": 5.079687595367432, + "rewards/rejected": -51.45624923706055, + "step": 2050 + }, + { + "epoch": 0.6592, + "grad_norm": 43.87278443969138, + "learning_rate": 6.8224e-06, + "logits/chosen": -1.1340820789337158, + "logits/rejected": -1.0141112804412842, + "logps/chosen": -705.5999755859375, + "logps/rejected": -714.4000244140625, + "loss": 1.3753, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -45.0625, + "rewards/margins": 4.603906154632568, + "rewards/rejected": -49.662498474121094, + "step": 2060 + }, + { + "epoch": 0.6624, + "grad_norm": 30.116406487757125, + "learning_rate": 6.7584e-06, + "logits/chosen": -1.061132788658142, + "logits/rejected": -0.921923816204071, + "logps/chosen": -729.7000122070312, + "logps/rejected": -746.0, + "loss": 1.2115, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -49.45624923706055, + "rewards/margins": 5.823046684265137, + "rewards/rejected": -55.256248474121094, + "step": 2070 + }, + { + "epoch": 0.6656, + "grad_norm": 30.073141884146498, + "learning_rate": 6.694400000000001e-06, + "logits/chosen": -1.0830078125, + "logits/rejected": -0.8765624761581421, + "logps/chosen": -715.7999877929688, + "logps/rejected": -714.5999755859375, + "loss": 0.8647, + "rewards/accuracies": 0.78125, + "rewards/chosen": -47.0, + "rewards/margins": 4.775781154632568, + "rewards/rejected": -51.76250076293945, + "step": 2080 + }, + { + "epoch": 0.6688, + "grad_norm": 44.582761143663106, + "learning_rate": 6.6304e-06, + "logits/chosen": -1.01953125, + "logits/rejected": -0.8620361089706421, + "logps/chosen": -689.2999877929688, + "logps/rejected": -690.4000244140625, + "loss": 1.1372, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -46.38750076293945, + "rewards/margins": 4.301953315734863, + "rewards/rejected": -50.681251525878906, + "step": 2090 + }, + { + "epoch": 0.672, + "grad_norm": 32.232233128260255, + "learning_rate": 6.5664e-06, + "logits/chosen": -1.0627930164337158, + "logits/rejected": -0.897936999797821, + "logps/chosen": -743.9000244140625, + "logps/rejected": -759.7999877929688, + "loss": 0.8998, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -47.96875, + "rewards/margins": 5.380078315734863, + "rewards/rejected": -53.33124923706055, + "step": 2100 + }, + { + "epoch": 0.6752, + "grad_norm": 24.673000840435204, + "learning_rate": 6.502400000000001e-06, + "logits/chosen": -1.0478515625, + "logits/rejected": -0.807861328125, + "logps/chosen": -706.9000244140625, + "logps/rejected": -694.9000244140625, + "loss": 1.1519, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -47.756248474121094, + "rewards/margins": 4.149218559265137, + "rewards/rejected": -51.92499923706055, + "step": 2110 + }, + { + "epoch": 0.6784, + "grad_norm": 14.646256448520822, + "learning_rate": 6.4384e-06, + "logits/chosen": -1.0353515148162842, + "logits/rejected": -0.891357421875, + "logps/chosen": -714.2000122070312, + "logps/rejected": -718.0, + "loss": 0.9878, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -46.67499923706055, + "rewards/margins": 6.03125, + "rewards/rejected": -52.724998474121094, + "step": 2120 + }, + { + "epoch": 0.6816, + "grad_norm": 515.206138123639, + "learning_rate": 6.3744e-06, + "logits/chosen": -1.020361304283142, + "logits/rejected": -0.8248656988143921, + "logps/chosen": -723.2999877929688, + "logps/rejected": -710.5999755859375, + "loss": 1.5449, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -47.9375, + "rewards/margins": 4.746874809265137, + "rewards/rejected": -52.6875, + "step": 2130 + }, + { + "epoch": 0.6848, + "grad_norm": 33.81383796288538, + "learning_rate": 6.310400000000001e-06, + "logits/chosen": -1.0212891101837158, + "logits/rejected": -0.821093738079071, + "logps/chosen": -693.5, + "logps/rejected": -721.2999877929688, + "loss": 1.3032, + "rewards/accuracies": 0.734375, + "rewards/chosen": -46.11249923706055, + "rewards/margins": 6.295702934265137, + "rewards/rejected": -52.431251525878906, + "step": 2140 + }, + { + "epoch": 0.688, + "grad_norm": 32.355318848104815, + "learning_rate": 6.2464e-06, + "logits/chosen": -0.9873046875, + "logits/rejected": -0.8365722894668579, + "logps/chosen": -699.7999877929688, + "logps/rejected": -695.5, + "loss": 1.2379, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -46.037498474121094, + "rewards/margins": 4.799609184265137, + "rewards/rejected": -50.84375, + "step": 2150 + }, + { + "epoch": 0.6912, + "grad_norm": 23.615506940371404, + "learning_rate": 6.1824e-06, + "logits/chosen": -1.00634765625, + "logits/rejected": -0.9322265386581421, + "logps/chosen": -697.0, + "logps/rejected": -712.5, + "loss": 1.227, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -47.875, + "rewards/margins": 4.198046684265137, + "rewards/rejected": -52.056251525878906, + "step": 2160 + }, + { + "epoch": 0.6944, + "grad_norm": 63.3641597780795, + "learning_rate": 6.1184000000000014e-06, + "logits/chosen": -0.994403064250946, + "logits/rejected": -0.8553466796875, + "logps/chosen": -730.9000244140625, + "logps/rejected": -732.0, + "loss": 1.0171, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -49.07500076293945, + "rewards/margins": 4.673828125, + "rewards/rejected": -53.76250076293945, + "step": 2170 + }, + { + "epoch": 0.6976, + "grad_norm": 20.809371501110515, + "learning_rate": 6.0544e-06, + "logits/chosen": -1.0204589366912842, + "logits/rejected": -0.813793957233429, + "logps/chosen": -712.5, + "logps/rejected": -710.0, + "loss": 0.7018, + "rewards/accuracies": 0.796875, + "rewards/chosen": -47.306251525878906, + "rewards/margins": 5.598437309265137, + "rewards/rejected": -52.88750076293945, + "step": 2180 + }, + { + "epoch": 0.7008, + "grad_norm": 33.50747747591909, + "learning_rate": 5.9904e-06, + "logits/chosen": -1.0885741710662842, + "logits/rejected": -0.971972644329071, + "logps/chosen": -717.7999877929688, + "logps/rejected": -721.7000122070312, + "loss": 1.0924, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -48.025001525878906, + "rewards/margins": 4.930468559265137, + "rewards/rejected": -52.96875, + "step": 2190 + }, + { + "epoch": 0.704, + "grad_norm": 32.846590306215425, + "learning_rate": 5.9264e-06, + "logits/chosen": -1.009033203125, + "logits/rejected": -0.8384765386581421, + "logps/chosen": -727.7999877929688, + "logps/rejected": -722.4000244140625, + "loss": 1.0518, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -47.943748474121094, + "rewards/margins": 5.194140434265137, + "rewards/rejected": -53.131248474121094, + "step": 2200 + }, + { + "epoch": 0.7072, + "grad_norm": 18.12069084200929, + "learning_rate": 5.8624e-06, + "logits/chosen": -1.0234375, + "logits/rejected": -0.83984375, + "logps/chosen": -743.5999755859375, + "logps/rejected": -744.9000244140625, + "loss": 1.332, + "rewards/accuracies": 0.75, + "rewards/chosen": -48.118751525878906, + "rewards/margins": 5.614843845367432, + "rewards/rejected": -53.73749923706055, + "step": 2210 + }, + { + "epoch": 0.7104, + "grad_norm": 30.990298323590697, + "learning_rate": 5.798400000000001e-06, + "logits/chosen": -0.9886230230331421, + "logits/rejected": -0.810791015625, + "logps/chosen": -702.0999755859375, + "logps/rejected": -717.4000244140625, + "loss": 1.213, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -48.756248474121094, + "rewards/margins": 4.651953220367432, + "rewards/rejected": -53.4375, + "step": 2220 + }, + { + "epoch": 0.7136, + "grad_norm": 25.950846199532993, + "learning_rate": 5.7344e-06, + "logits/chosen": -1.082421898841858, + "logits/rejected": -0.895800769329071, + "logps/chosen": -730.2999877929688, + "logps/rejected": -726.7999877929688, + "loss": 1.0928, + "rewards/accuracies": 0.734375, + "rewards/chosen": -49.03125, + "rewards/margins": 5.237109184265137, + "rewards/rejected": -54.26250076293945, + "step": 2230 + }, + { + "epoch": 0.7168, + "grad_norm": 41.05623425015503, + "learning_rate": 5.6704e-06, + "logits/chosen": NaN, + "logits/rejected": -0.9478393793106079, + "logps/chosen": -697.0999755859375, + "logps/rejected": -810.7000122070312, + "loss": 1.1882, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -46.693748474121094, + "rewards/margins": 4.758984565734863, + "rewards/rejected": -51.42499923706055, + "step": 2240 + }, + { + "epoch": 0.72, + "grad_norm": 26.691785630362762, + "learning_rate": 5.606400000000001e-06, + "logits/chosen": -1.12939453125, + "logits/rejected": -0.979968249797821, + "logps/chosen": -688.4000244140625, + "logps/rejected": -691.2999877929688, + "loss": 0.7555, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -45.45624923706055, + "rewards/margins": 5.046093940734863, + "rewards/rejected": -50.493751525878906, + "step": 2250 + }, + { + "epoch": 0.7232, + "grad_norm": 36.09811273148159, + "learning_rate": 5.5424e-06, + "logits/chosen": NaN, + "logits/rejected": -1.01104736328125, + "logps/chosen": -717.2999877929688, + "logps/rejected": -724.4000244140625, + "loss": 1.2529, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -45.76874923706055, + "rewards/margins": 5.684374809265137, + "rewards/rejected": -51.443748474121094, + "step": 2260 + }, + { + "epoch": 0.7264, + "grad_norm": 19.629173955929687, + "learning_rate": 5.478400000000001e-06, + "logits/chosen": -1.137792944908142, + "logits/rejected": -1.005273461341858, + "logps/chosen": -686.5, + "logps/rejected": -691.0, + "loss": 0.9069, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -45.26874923706055, + "rewards/margins": 5.016406059265137, + "rewards/rejected": -50.29999923706055, + "step": 2270 + }, + { + "epoch": 0.7296, + "grad_norm": 15.321250419813746, + "learning_rate": 5.414400000000001e-06, + "logits/chosen": -1.083886742591858, + "logits/rejected": -0.943359375, + "logps/chosen": -716.4000244140625, + "logps/rejected": -712.2000122070312, + "loss": 0.9849, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -46.587501525878906, + "rewards/margins": 5.075390815734863, + "rewards/rejected": -51.65625, + "step": 2280 + }, + { + "epoch": 0.7328, + "grad_norm": 25.626519146586663, + "learning_rate": 5.3504e-06, + "logits/chosen": -1.0432617664337158, + "logits/rejected": -0.8513427972793579, + "logps/chosen": -700.7999877929688, + "logps/rejected": -706.0, + "loss": 0.7705, + "rewards/accuracies": 0.765625, + "rewards/chosen": -46.962501525878906, + "rewards/margins": 5.443749904632568, + "rewards/rejected": -52.39374923706055, + "step": 2290 + }, + { + "epoch": 0.736, + "grad_norm": 34.37616620792697, + "learning_rate": 5.286400000000001e-06, + "logits/chosen": -1.012231469154358, + "logits/rejected": -0.8785156011581421, + "logps/chosen": -704.5, + "logps/rejected": -712.0, + "loss": 0.991, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -48.131248474121094, + "rewards/margins": 5.206640720367432, + "rewards/rejected": -53.29999923706055, + "step": 2300 + }, + { + "epoch": 0.7392, + "grad_norm": 18.513866720148574, + "learning_rate": 5.222400000000001e-06, + "logits/chosen": -1.0185058116912842, + "logits/rejected": -0.868457019329071, + "logps/chosen": -710.0999755859375, + "logps/rejected": -707.7999877929688, + "loss": 1.0177, + "rewards/accuracies": 0.7906249761581421, + "rewards/chosen": -46.162498474121094, + "rewards/margins": 5.4765625, + "rewards/rejected": -51.618751525878906, + "step": 2310 + }, + { + "epoch": 0.7424, + "grad_norm": 19.111370557860546, + "learning_rate": 5.1584000000000005e-06, + "logits/chosen": NaN, + "logits/rejected": -0.7626708745956421, + "logps/chosen": -696.0, + "logps/rejected": -699.9000244140625, + "loss": 0.742, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -46.756248474121094, + "rewards/margins": 5.501172065734863, + "rewards/rejected": -52.23125076293945, + "step": 2320 + }, + { + "epoch": 0.7456, + "grad_norm": 26.75112470648383, + "learning_rate": 5.094400000000001e-06, + "logits/chosen": -0.992968738079071, + "logits/rejected": -0.7928711175918579, + "logps/chosen": -699.2999877929688, + "logps/rejected": -692.5, + "loss": 0.7287, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -46.01874923706055, + "rewards/margins": 5.475781440734863, + "rewards/rejected": -51.48749923706055, + "step": 2330 + }, + { + "epoch": 0.7488, + "grad_norm": 20.28190998304776, + "learning_rate": 5.030400000000001e-06, + "logits/chosen": -1.0549805164337158, + "logits/rejected": -0.885986328125, + "logps/chosen": -686.4000244140625, + "logps/rejected": -705.7000122070312, + "loss": 0.9325, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -44.33124923706055, + "rewards/margins": 6.580078125, + "rewards/rejected": -50.900001525878906, + "step": 2340 + }, + { + "epoch": 0.752, + "grad_norm": 30.31617353068247, + "learning_rate": 4.9664000000000004e-06, + "logits/chosen": -0.877636730670929, + "logits/rejected": -0.7491210699081421, + "logps/chosen": -727.0, + "logps/rejected": -742.2000122070312, + "loss": 1.4356, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -49.42499923706055, + "rewards/margins": 4.980859279632568, + "rewards/rejected": -54.38750076293945, + "step": 2350 + }, + { + "epoch": 0.7552, + "grad_norm": 40.91946462285244, + "learning_rate": 4.902400000000001e-06, + "logits/chosen": -0.966552734375, + "logits/rejected": -0.845751941204071, + "logps/chosen": -741.7000122070312, + "logps/rejected": -738.4000244140625, + "loss": 1.0233, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -48.15625, + "rewards/margins": 5.648828029632568, + "rewards/rejected": -53.8125, + "step": 2360 + }, + { + "epoch": 0.7584, + "grad_norm": 34.039942825197706, + "learning_rate": 4.8384e-06, + "logits/chosen": -1.003027319908142, + "logits/rejected": -0.858447253704071, + "logps/chosen": -757.0, + "logps/rejected": -728.7999877929688, + "loss": 1.6555, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -48.25, + "rewards/margins": 4.786718845367432, + "rewards/rejected": -53.006248474121094, + "step": 2370 + }, + { + "epoch": 0.7616, + "grad_norm": 37.671632804350644, + "learning_rate": 4.7744e-06, + "logits/chosen": -0.769360363483429, + "logits/rejected": -0.765625, + "logps/chosen": -677.7000122070312, + "logps/rejected": -687.7999877929688, + "loss": 0.8503, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -45.92499923706055, + "rewards/margins": 5.232421875, + "rewards/rejected": -51.150001525878906, + "step": 2380 + }, + { + "epoch": 0.7648, + "grad_norm": 21.084041436826528, + "learning_rate": 4.710400000000001e-06, + "logits/chosen": -0.979687511920929, + "logits/rejected": -0.841064453125, + "logps/chosen": -699.0, + "logps/rejected": -716.7999877929688, + "loss": 0.8988, + "rewards/accuracies": 0.765625, + "rewards/chosen": -45.006248474121094, + "rewards/margins": 5.942187309265137, + "rewards/rejected": -50.95624923706055, + "step": 2390 + }, + { + "epoch": 0.768, + "grad_norm": 30.650485611004637, + "learning_rate": 4.6464e-06, + "logits/chosen": -0.9476073980331421, + "logits/rejected": -0.809130847454071, + "logps/chosen": -693.2999877929688, + "logps/rejected": -693.5, + "loss": 1.0418, + "rewards/accuracies": 0.778124988079071, + "rewards/chosen": -44.506248474121094, + "rewards/margins": 6.182031154632568, + "rewards/rejected": -50.693748474121094, + "step": 2400 + }, + { + "epoch": 0.7712, + "grad_norm": 26.04029078012027, + "learning_rate": 4.5824e-06, + "logits/chosen": -0.9512695074081421, + "logits/rejected": -0.866650402545929, + "logps/chosen": -699.0, + "logps/rejected": -697.5999755859375, + "loss": 1.1675, + "rewards/accuracies": 0.765625, + "rewards/chosen": -44.875, + "rewards/margins": 4.698437690734863, + "rewards/rejected": -49.556251525878906, + "step": 2410 + }, + { + "epoch": 0.7744, + "grad_norm": 36.83215201031879, + "learning_rate": 4.518400000000001e-06, + "logits/chosen": -1.0114257335662842, + "logits/rejected": -0.8132690191268921, + "logps/chosen": -699.5999755859375, + "logps/rejected": -692.5999755859375, + "loss": 0.8036, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -45.6875, + "rewards/margins": 4.723046779632568, + "rewards/rejected": -50.40625, + "step": 2420 + }, + { + "epoch": 0.7776, + "grad_norm": 22.070335988355417, + "learning_rate": 4.4544e-06, + "logits/chosen": -1.0234375, + "logits/rejected": -0.854473888874054, + "logps/chosen": -680.9000244140625, + "logps/rejected": -704.5999755859375, + "loss": 0.8279, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -44.287498474121094, + "rewards/margins": 6.0546875, + "rewards/rejected": -50.36249923706055, + "step": 2430 + }, + { + "epoch": 0.7808, + "grad_norm": 25.46409714576939, + "learning_rate": 4.3904e-06, + "logits/chosen": -1.0222656726837158, + "logits/rejected": -0.886279284954071, + "logps/chosen": -687.4000244140625, + "logps/rejected": -690.0999755859375, + "loss": 0.8075, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -44.474998474121094, + "rewards/margins": 5.271484375, + "rewards/rejected": -49.73749923706055, + "step": 2440 + }, + { + "epoch": 0.784, + "grad_norm": 27.18862097944765, + "learning_rate": 4.326400000000001e-06, + "logits/chosen": -0.98291015625, + "logits/rejected": -0.8377441167831421, + "logps/chosen": -707.9000244140625, + "logps/rejected": -716.2999877929688, + "loss": 0.9811, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -44.90625, + "rewards/margins": 5.499218940734863, + "rewards/rejected": -50.41875076293945, + "step": 2450 + }, + { + "epoch": 0.7872, + "grad_norm": 19.320839238994825, + "learning_rate": 4.2624e-06, + "logits/chosen": NaN, + "logits/rejected": -0.816760241985321, + "logps/chosen": -684.5, + "logps/rejected": -710.2999877929688, + "loss": 1.0548, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -45.756248474121094, + "rewards/margins": 5.346093654632568, + "rewards/rejected": -51.099998474121094, + "step": 2460 + }, + { + "epoch": 0.7904, + "grad_norm": 15.431831897667609, + "learning_rate": 4.1984e-06, + "logits/chosen": -0.8251098394393921, + "logits/rejected": -0.7601318359375, + "logps/chosen": -709.2999877929688, + "logps/rejected": -720.7000122070312, + "loss": 0.8206, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -46.13750076293945, + "rewards/margins": 5.135937690734863, + "rewards/rejected": -51.256248474121094, + "step": 2470 + }, + { + "epoch": 0.7936, + "grad_norm": 31.682062882644072, + "learning_rate": 4.1344e-06, + "logits/chosen": -0.8877929449081421, + "logits/rejected": -0.791735827922821, + "logps/chosen": -730.4000244140625, + "logps/rejected": -741.4000244140625, + "loss": 1.0101, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -47.95624923706055, + "rewards/margins": 4.861718654632568, + "rewards/rejected": -52.806251525878906, + "step": 2480 + }, + { + "epoch": 0.7968, + "grad_norm": 27.980848387195604, + "learning_rate": 4.070400000000001e-06, + "logits/chosen": -0.811279296875, + "logits/rejected": -0.635449230670929, + "logps/chosen": -719.5999755859375, + "logps/rejected": -724.5, + "loss": 0.9866, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -48.35625076293945, + "rewards/margins": 4.940625190734863, + "rewards/rejected": -53.28125, + "step": 2490 + }, + { + "epoch": 0.8, + "grad_norm": 29.28289970079953, + "learning_rate": 4.0064e-06, + "logits/chosen": -0.7798095941543579, + "logits/rejected": -0.6259216070175171, + "logps/chosen": -700.2999877929688, + "logps/rejected": -704.5, + "loss": 0.9269, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -47.70000076293945, + "rewards/margins": 4.230078220367432, + "rewards/rejected": -51.91875076293945, + "step": 2500 + }, + { + "epoch": 0.8032, + "grad_norm": 49.45455905668464, + "learning_rate": 3.9424e-06, + "logits/chosen": NaN, + "logits/rejected": -0.6906982660293579, + "logps/chosen": -700.9000244140625, + "logps/rejected": -821.7000122070312, + "loss": 1.232, + "rewards/accuracies": 0.784375011920929, + "rewards/chosen": -46.29999923706055, + "rewards/margins": 4.734375, + "rewards/rejected": -51.01874923706055, + "step": 2510 + }, + { + "epoch": 0.8064, + "grad_norm": 31.980275647030638, + "learning_rate": 3.878400000000001e-06, + "logits/chosen": -0.9493163824081421, + "logits/rejected": -0.7739502191543579, + "logps/chosen": -702.5, + "logps/rejected": -701.5999755859375, + "loss": 1.0134, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -46.29999923706055, + "rewards/margins": 5.244531154632568, + "rewards/rejected": -51.537498474121094, + "step": 2520 + }, + { + "epoch": 0.8096, + "grad_norm": 32.60979718815698, + "learning_rate": 3.8144000000000003e-06, + "logits/chosen": -0.919140636920929, + "logits/rejected": -0.746960461139679, + "logps/chosen": -720.9000244140625, + "logps/rejected": -733.0999755859375, + "loss": 1.066, + "rewards/accuracies": 0.765625, + "rewards/chosen": -46.025001525878906, + "rewards/margins": 6.26171875, + "rewards/rejected": -52.29375076293945, + "step": 2530 + }, + { + "epoch": 0.8128, + "grad_norm": 18.078133548658265, + "learning_rate": 3.7504e-06, + "logits/chosen": -0.819042980670929, + "logits/rejected": -0.6877075433731079, + "logps/chosen": -717.5999755859375, + "logps/rejected": -709.0999755859375, + "loss": 0.7904, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -46.73749923706055, + "rewards/margins": 5.543359279632568, + "rewards/rejected": -52.29999923706055, + "step": 2540 + }, + { + "epoch": 0.816, + "grad_norm": 17.463419825027277, + "learning_rate": 3.6864000000000004e-06, + "logits/chosen": -0.8096923828125, + "logits/rejected": -0.663586437702179, + "logps/chosen": -690.5999755859375, + "logps/rejected": -689.7999877929688, + "loss": 0.8754, + "rewards/accuracies": 0.796875, + "rewards/chosen": -46.20624923706055, + "rewards/margins": 5.366796970367432, + "rewards/rejected": -51.587501525878906, + "step": 2550 + }, + { + "epoch": 0.8192, + "grad_norm": 34.45398458085358, + "learning_rate": 3.6224000000000002e-06, + "logits/chosen": NaN, + "logits/rejected": -0.6997131109237671, + "logps/chosen": -702.2999877929688, + "logps/rejected": -694.0, + "loss": 0.8496, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -46.29375076293945, + "rewards/margins": 4.94140625, + "rewards/rejected": -51.20624923706055, + "step": 2560 + }, + { + "epoch": 0.8224, + "grad_norm": 50.25198996079993, + "learning_rate": 3.5584e-06, + "logits/chosen": -0.758056640625, + "logits/rejected": -0.6135498285293579, + "logps/chosen": -687.7999877929688, + "logps/rejected": -691.5999755859375, + "loss": 1.0614, + "rewards/accuracies": 0.784375011920929, + "rewards/chosen": -46.256248474121094, + "rewards/margins": 4.607812404632568, + "rewards/rejected": -50.86249923706055, + "step": 2570 + }, + { + "epoch": 0.8256, + "grad_norm": 19.482286568161427, + "learning_rate": 3.4944e-06, + "logits/chosen": -0.765332043170929, + "logits/rejected": -0.599072277545929, + "logps/chosen": -715.4000244140625, + "logps/rejected": -710.9000244140625, + "loss": 0.9956, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -45.78125, + "rewards/margins": 4.977734565734863, + "rewards/rejected": -50.76250076293945, + "step": 2580 + }, + { + "epoch": 0.8288, + "grad_norm": 27.18743711565877, + "learning_rate": 3.4304000000000002e-06, + "logits/chosen": -0.725689709186554, + "logits/rejected": -0.6280151605606079, + "logps/chosen": -705.2000122070312, + "logps/rejected": -717.0, + "loss": 0.8677, + "rewards/accuracies": 0.7906249761581421, + "rewards/chosen": -47.41875076293945, + "rewards/margins": 4.928906440734863, + "rewards/rejected": -52.36249923706055, + "step": 2590 + }, + { + "epoch": 0.832, + "grad_norm": 28.274057380796958, + "learning_rate": 3.3664e-06, + "logits/chosen": -0.815625011920929, + "logits/rejected": -0.579394519329071, + "logps/chosen": -711.7000122070312, + "logps/rejected": -700.2999877929688, + "loss": 0.801, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -46.681251525878906, + "rewards/margins": 5.397265434265137, + "rewards/rejected": -52.0625, + "step": 2600 + }, + { + "epoch": 0.8352, + "grad_norm": 27.10807058678835, + "learning_rate": 3.3024e-06, + "logits/chosen": -0.818896472454071, + "logits/rejected": -0.65411376953125, + "logps/chosen": -691.2000122070312, + "logps/rejected": -701.9000244140625, + "loss": 0.8487, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -45.91875076293945, + "rewards/margins": 5.360547065734863, + "rewards/rejected": -51.28125, + "step": 2610 + }, + { + "epoch": 0.8384, + "grad_norm": 16.848696540134306, + "learning_rate": 3.2384000000000006e-06, + "logits/chosen": -0.879833996295929, + "logits/rejected": -0.7293335199356079, + "logps/chosen": -723.7000122070312, + "logps/rejected": -734.4000244140625, + "loss": 0.9332, + "rewards/accuracies": 0.8031250238418579, + "rewards/chosen": -47.57500076293945, + "rewards/margins": 5.736718654632568, + "rewards/rejected": -53.34375, + "step": 2620 + }, + { + "epoch": 0.8416, + "grad_norm": 23.088137550157327, + "learning_rate": 3.1744e-06, + "logits/chosen": -0.7398926019668579, + "logits/rejected": -0.58740234375, + "logps/chosen": -700.2000122070312, + "logps/rejected": -716.9000244140625, + "loss": 0.8959, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -48.04375076293945, + "rewards/margins": 5.1171875, + "rewards/rejected": -53.16875076293945, + "step": 2630 + }, + { + "epoch": 0.8448, + "grad_norm": 41.2060376447536, + "learning_rate": 3.1104e-06, + "logits/chosen": -0.8064941167831421, + "logits/rejected": -0.6497802734375, + "logps/chosen": -723.0999755859375, + "logps/rejected": -730.7000122070312, + "loss": 1.0018, + "rewards/accuracies": 0.75, + "rewards/chosen": -47.89374923706055, + "rewards/margins": 4.893750190734863, + "rewards/rejected": -52.79375076293945, + "step": 2640 + }, + { + "epoch": 0.848, + "grad_norm": 17.33185174612147, + "learning_rate": 3.0464000000000006e-06, + "logits/chosen": -0.8216308355331421, + "logits/rejected": -0.7013915777206421, + "logps/chosen": -732.7000122070312, + "logps/rejected": -728.2000122070312, + "loss": 0.9806, + "rewards/accuracies": 0.765625, + "rewards/chosen": -48.125, + "rewards/margins": 4.714062690734863, + "rewards/rejected": -52.84375, + "step": 2650 + }, + { + "epoch": 0.8512, + "grad_norm": 24.95900546548201, + "learning_rate": 2.9824000000000004e-06, + "logits/chosen": NaN, + "logits/rejected": -0.6495605707168579, + "logps/chosen": -713.5, + "logps/rejected": -722.9000244140625, + "loss": 0.7509, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -46.640625, + "rewards/margins": 5.935937404632568, + "rewards/rejected": -52.54999923706055, + "step": 2660 + }, + { + "epoch": 0.8544, + "grad_norm": 25.149175120143106, + "learning_rate": 2.9184000000000003e-06, + "logits/chosen": -0.812207043170929, + "logits/rejected": -0.5872558355331421, + "logps/chosen": -702.7000122070312, + "logps/rejected": -694.0, + "loss": 0.9383, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -47.900001525878906, + "rewards/margins": 4.258593559265137, + "rewards/rejected": -52.14374923706055, + "step": 2670 + }, + { + "epoch": 0.8576, + "grad_norm": 12.827501902736019, + "learning_rate": 2.8544000000000006e-06, + "logits/chosen": -0.889843761920929, + "logits/rejected": -0.7074950933456421, + "logps/chosen": -700.7000122070312, + "logps/rejected": -717.4000244140625, + "loss": 0.9013, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -46.006248474121094, + "rewards/margins": 5.360156059265137, + "rewards/rejected": -51.36249923706055, + "step": 2680 + }, + { + "epoch": 0.8608, + "grad_norm": 23.908624027933058, + "learning_rate": 2.7904000000000004e-06, + "logits/chosen": -0.8385254144668579, + "logits/rejected": -0.65087890625, + "logps/chosen": -684.5, + "logps/rejected": -682.7000122070312, + "loss": 0.999, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -46.29999923706055, + "rewards/margins": 4.389062404632568, + "rewards/rejected": -50.71875, + "step": 2690 + }, + { + "epoch": 0.864, + "grad_norm": 22.34800607029224, + "learning_rate": 2.7264000000000003e-06, + "logits/chosen": -0.818115234375, + "logits/rejected": -0.692089855670929, + "logps/chosen": -693.0999755859375, + "logps/rejected": -697.7000122070312, + "loss": 0.8839, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -45.54375076293945, + "rewards/margins": 5.27734375, + "rewards/rejected": -50.78125, + "step": 2700 + }, + { + "epoch": 0.8672, + "grad_norm": 25.496548305387073, + "learning_rate": 2.6624e-06, + "logits/chosen": -0.839428722858429, + "logits/rejected": -0.694866955280304, + "logps/chosen": -677.0, + "logps/rejected": -676.2999877929688, + "loss": 1.069, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -45.26250076293945, + "rewards/margins": 5.021874904632568, + "rewards/rejected": -50.306251525878906, + "step": 2710 + }, + { + "epoch": 0.8704, + "grad_norm": 38.775614969317544, + "learning_rate": 2.5984000000000004e-06, + "logits/chosen": -0.8185790777206421, + "logits/rejected": -0.6710449457168579, + "logps/chosen": -680.2999877929688, + "logps/rejected": -722.9000244140625, + "loss": 0.8952, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -45.20624923706055, + "rewards/margins": 7.703906059265137, + "rewards/rejected": -52.90625, + "step": 2720 + }, + { + "epoch": 0.8736, + "grad_norm": 27.731107053080645, + "learning_rate": 2.5344000000000002e-06, + "logits/chosen": -0.8282226324081421, + "logits/rejected": -0.7029052972793579, + "logps/chosen": -689.7000122070312, + "logps/rejected": -701.9000244140625, + "loss": 0.965, + "rewards/accuracies": 0.75, + "rewards/chosen": -47.381248474121094, + "rewards/margins": 4.256640434265137, + "rewards/rejected": -51.65625, + "step": 2730 + }, + { + "epoch": 0.8768, + "grad_norm": 25.15430651962972, + "learning_rate": 2.4704e-06, + "logits/chosen": -0.8565429449081421, + "logits/rejected": -0.7360382080078125, + "logps/chosen": -719.0, + "logps/rejected": -725.5, + "loss": 0.8099, + "rewards/accuracies": 0.778124988079071, + "rewards/chosen": -47.375, + "rewards/margins": 5.586718559265137, + "rewards/rejected": -52.95624923706055, + "step": 2740 + }, + { + "epoch": 0.88, + "grad_norm": 16.604222380353896, + "learning_rate": 2.4064e-06, + "logits/chosen": -0.877636730670929, + "logits/rejected": -0.7131592035293579, + "logps/chosen": -724.5, + "logps/rejected": -733.5999755859375, + "loss": 0.7523, + "rewards/accuracies": 0.7906249761581421, + "rewards/chosen": -47.5625, + "rewards/margins": 5.414843559265137, + "rewards/rejected": -53.01874923706055, + "step": 2750 + }, + { + "epoch": 0.8832, + "grad_norm": 28.08075144528864, + "learning_rate": 2.3424000000000002e-06, + "logits/chosen": -0.810229480266571, + "logits/rejected": -0.734570324420929, + "logps/chosen": -728.5, + "logps/rejected": -744.2999877929688, + "loss": 0.836, + "rewards/accuracies": 0.78125, + "rewards/chosen": -48.40625, + "rewards/margins": 5.391406059265137, + "rewards/rejected": -53.806251525878906, + "step": 2760 + }, + { + "epoch": 0.8864, + "grad_norm": 28.94506411278981, + "learning_rate": 2.2784e-06, + "logits/chosen": -0.716931164264679, + "logits/rejected": NaN, + "logps/chosen": -733.5, + "logps/rejected": -724.0999755859375, + "loss": 1.5468, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -49.868751525878906, + "rewards/margins": 3.221484422683716, + "rewards/rejected": -53.087501525878906, + "step": 2770 + }, + { + "epoch": 0.8896, + "grad_norm": 20.33472177289239, + "learning_rate": 2.2144000000000003e-06, + "logits/chosen": -0.8364013433456421, + "logits/rejected": -0.6757232546806335, + "logps/chosen": -726.0, + "logps/rejected": -714.7999877929688, + "loss": 0.5291, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -47.98125076293945, + "rewards/margins": 5.720312595367432, + "rewards/rejected": -53.70000076293945, + "step": 2780 + }, + { + "epoch": 0.8928, + "grad_norm": 51.97128098464107, + "learning_rate": 2.1504e-06, + "logits/chosen": -0.863330066204071, + "logits/rejected": -0.766650378704071, + "logps/chosen": -735.7999877929688, + "logps/rejected": -756.0999755859375, + "loss": 0.9659, + "rewards/accuracies": 0.734375, + "rewards/chosen": -47.59375, + "rewards/margins": 6.539843559265137, + "rewards/rejected": -54.15625, + "step": 2790 + }, + { + "epoch": 0.896, + "grad_norm": 34.837802174621544, + "learning_rate": 2.0864e-06, + "logits/chosen": -0.863842785358429, + "logits/rejected": -0.722607433795929, + "logps/chosen": -693.2999877929688, + "logps/rejected": -702.0, + "loss": 0.9458, + "rewards/accuracies": 0.765625, + "rewards/chosen": -45.92499923706055, + "rewards/margins": 5.104296684265137, + "rewards/rejected": -51.01874923706055, + "step": 2800 + }, + { + "epoch": 0.8992, + "grad_norm": 28.70089724997986, + "learning_rate": 2.0224000000000003e-06, + "logits/chosen": -0.869335949420929, + "logits/rejected": -0.8036133050918579, + "logps/chosen": -710.0, + "logps/rejected": -715.2999877929688, + "loss": 1.1216, + "rewards/accuracies": 0.71875, + "rewards/chosen": -46.3125, + "rewards/margins": 4.509375095367432, + "rewards/rejected": -50.818748474121094, + "step": 2810 + }, + { + "epoch": 0.9024, + "grad_norm": 30.63117289172316, + "learning_rate": 1.9584e-06, + "logits/chosen": -0.7902587652206421, + "logits/rejected": -0.666430652141571, + "logps/chosen": -669.2999877929688, + "logps/rejected": -677.5999755859375, + "loss": 0.8344, + "rewards/accuracies": 0.765625, + "rewards/chosen": -45.5, + "rewards/margins": 4.933203220367432, + "rewards/rejected": -50.431251525878906, + "step": 2820 + }, + { + "epoch": 0.9056, + "grad_norm": 25.68361016400013, + "learning_rate": 1.8944e-06, + "logits/chosen": -0.8807617425918579, + "logits/rejected": -0.740283191204071, + "logps/chosen": -713.5, + "logps/rejected": -711.4000244140625, + "loss": 0.7472, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -45.912498474121094, + "rewards/margins": 5.207812309265137, + "rewards/rejected": -51.14374923706055, + "step": 2830 + }, + { + "epoch": 0.9088, + "grad_norm": 20.342553234409863, + "learning_rate": 1.8304000000000003e-06, + "logits/chosen": -0.875, + "logits/rejected": -0.7489013671875, + "logps/chosen": -730.7999877929688, + "logps/rejected": -714.2999877929688, + "loss": 0.8745, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -48.1875, + "rewards/margins": 4.098437309265137, + "rewards/rejected": -52.287498474121094, + "step": 2840 + }, + { + "epoch": 0.912, + "grad_norm": 87.2935450396475, + "learning_rate": 1.7664000000000001e-06, + "logits/chosen": -0.772265613079071, + "logits/rejected": -0.7514404058456421, + "logps/chosen": -712.5, + "logps/rejected": -727.0999755859375, + "loss": 0.9705, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -48.03125, + "rewards/margins": 4.150000095367432, + "rewards/rejected": -52.162498474121094, + "step": 2850 + }, + { + "epoch": 0.9152, + "grad_norm": 32.63676345895593, + "learning_rate": 1.7024000000000002e-06, + "logits/chosen": -0.8949218988418579, + "logits/rejected": -0.729443371295929, + "logps/chosen": -708.0, + "logps/rejected": -718.9000244140625, + "loss": 0.9999, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -47.587501525878906, + "rewards/margins": 4.604687690734863, + "rewards/rejected": -52.212501525878906, + "step": 2860 + }, + { + "epoch": 0.9184, + "grad_norm": 13.131566594805347, + "learning_rate": 1.6384000000000003e-06, + "logits/chosen": -0.839648425579071, + "logits/rejected": -0.675048828125, + "logps/chosen": -691.2000122070312, + "logps/rejected": -698.5, + "loss": 0.7785, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -46.23749923706055, + "rewards/margins": 5.367968559265137, + "rewards/rejected": -51.618751525878906, + "step": 2870 + }, + { + "epoch": 0.9216, + "grad_norm": 20.210063813573157, + "learning_rate": 1.5744000000000001e-06, + "logits/chosen": NaN, + "logits/rejected": -0.6630004644393921, + "logps/chosen": -692.7999877929688, + "logps/rejected": -704.0999755859375, + "loss": 0.8411, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -46.131248474121094, + "rewards/margins": 4.912109375, + "rewards/rejected": -51.0625, + "step": 2880 + }, + { + "epoch": 0.9248, + "grad_norm": 38.30773616311555, + "learning_rate": 1.5104000000000002e-06, + "logits/chosen": -0.826892077922821, + "logits/rejected": -0.7580322027206421, + "logps/chosen": -688.2000122070312, + "logps/rejected": -697.5999755859375, + "loss": 0.8446, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -46.21875, + "rewards/margins": 4.414843559265137, + "rewards/rejected": -50.60625076293945, + "step": 2890 + }, + { + "epoch": 0.928, + "grad_norm": 40.22722342990997, + "learning_rate": 1.4464e-06, + "logits/chosen": -0.8775390386581421, + "logits/rejected": -0.6669158935546875, + "logps/chosen": -696.7999877929688, + "logps/rejected": -687.7999877929688, + "loss": 1.3252, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -47.087501525878906, + "rewards/margins": 3.6832032203674316, + "rewards/rejected": -50.756248474121094, + "step": 2900 + }, + { + "epoch": 0.9312, + "grad_norm": 31.041235936499106, + "learning_rate": 1.3824e-06, + "logits/chosen": -0.8633056879043579, + "logits/rejected": -0.733020007610321, + "logps/chosen": -704.2000122070312, + "logps/rejected": -712.7999877929688, + "loss": 1.1374, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -46.881248474121094, + "rewards/margins": 3.9964842796325684, + "rewards/rejected": -50.875, + "step": 2910 + }, + { + "epoch": 0.9344, + "grad_norm": 28.493763023980495, + "learning_rate": 1.3184000000000002e-06, + "logits/chosen": -0.837451159954071, + "logits/rejected": -0.7790282964706421, + "logps/chosen": -711.4000244140625, + "logps/rejected": -705.2999877929688, + "loss": 0.9248, + "rewards/accuracies": 0.784375011920929, + "rewards/chosen": -45.20624923706055, + "rewards/margins": 5.015234470367432, + "rewards/rejected": -50.224998474121094, + "step": 2920 + }, + { + "epoch": 0.9376, + "grad_norm": 25.54508861275617, + "learning_rate": 1.2544e-06, + "logits/chosen": NaN, + "logits/rejected": -0.6931396722793579, + "logps/chosen": -704.9000244140625, + "logps/rejected": -707.2999877929688, + "loss": 1.0966, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -46.21875, + "rewards/margins": 4.586328029632568, + "rewards/rejected": -50.8125, + "step": 2930 + }, + { + "epoch": 0.9408, + "grad_norm": 18.95103038138321, + "learning_rate": 1.1904e-06, + "logits/chosen": NaN, + "logits/rejected": -0.6669677495956421, + "logps/chosen": -701.0, + "logps/rejected": -697.0, + "loss": 0.9149, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -46.618751525878906, + "rewards/margins": 4.646093845367432, + "rewards/rejected": -51.23125076293945, + "step": 2940 + }, + { + "epoch": 0.944, + "grad_norm": 24.581771907889188, + "learning_rate": 1.1264000000000001e-06, + "logits/chosen": -0.8424072265625, + "logits/rejected": -0.653454601764679, + "logps/chosen": -712.7999877929688, + "logps/rejected": -708.9000244140625, + "loss": 0.9627, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -47.006248474121094, + "rewards/margins": 4.411718845367432, + "rewards/rejected": -51.375, + "step": 2950 + }, + { + "epoch": 0.9472, + "grad_norm": 36.59334250475912, + "learning_rate": 1.0624000000000002e-06, + "logits/chosen": -0.855224609375, + "logits/rejected": -0.755810558795929, + "logps/chosen": -726.0, + "logps/rejected": -719.5999755859375, + "loss": 1.1101, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -45.931251525878906, + "rewards/margins": 4.267187595367432, + "rewards/rejected": -50.193748474121094, + "step": 2960 + }, + { + "epoch": 0.9504, + "grad_norm": 18.958201936576806, + "learning_rate": 9.984e-07, + "logits/chosen": -0.8471924066543579, + "logits/rejected": -0.7294921875, + "logps/chosen": -726.0, + "logps/rejected": -720.2999877929688, + "loss": 0.571, + "rewards/accuracies": 0.8125, + "rewards/chosen": -45.693748474121094, + "rewards/margins": 5.928124904632568, + "rewards/rejected": -51.618751525878906, + "step": 2970 + }, + { + "epoch": 0.9536, + "grad_norm": 16.88941090161605, + "learning_rate": 9.344e-07, + "logits/chosen": -0.8091796636581421, + "logits/rejected": -0.6782592535018921, + "logps/chosen": -692.7999877929688, + "logps/rejected": -703.2999877929688, + "loss": 0.9194, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -46.443748474121094, + "rewards/margins": 5.051562309265137, + "rewards/rejected": -51.48749923706055, + "step": 2980 + }, + { + "epoch": 0.9568, + "grad_norm": 25.846567770988575, + "learning_rate": 8.704000000000002e-07, + "logits/chosen": -0.852001965045929, + "logits/rejected": -0.621874988079071, + "logps/chosen": -706.7999877929688, + "logps/rejected": -693.0999755859375, + "loss": 0.716, + "rewards/accuracies": 0.7906249761581421, + "rewards/chosen": -46.243751525878906, + "rewards/margins": 4.926953315734863, + "rewards/rejected": -51.181251525878906, + "step": 2990 + }, + { + "epoch": 0.96, + "grad_norm": 35.33734147087596, + "learning_rate": 8.064000000000001e-07, + "logits/chosen": -0.842089831829071, + "logits/rejected": -0.7021239995956421, + "logps/chosen": -698.2000122070312, + "logps/rejected": -703.5, + "loss": 0.7254, + "rewards/accuracies": 0.796875, + "rewards/chosen": -45.931251525878906, + "rewards/margins": 5.244531154632568, + "rewards/rejected": -51.181251525878906, + "step": 3000 + }, + { + "epoch": 0.9632, + "grad_norm": 22.328108594927276, + "learning_rate": 7.424000000000001e-07, + "logits/chosen": -0.865039050579071, + "logits/rejected": -0.7381988763809204, + "logps/chosen": -714.9000244140625, + "logps/rejected": -717.7000122070312, + "loss": 0.8847, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -45.775001525878906, + "rewards/margins": 5.114062309265137, + "rewards/rejected": -50.88750076293945, + "step": 3010 + }, + { + "epoch": 0.9664, + "grad_norm": 30.66790250732441, + "learning_rate": 6.784e-07, + "logits/chosen": -0.8562256097793579, + "logits/rejected": NaN, + "logps/chosen": -698.0, + "logps/rejected": -707.4000244140625, + "loss": 0.7132, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -44.63750076293945, + "rewards/margins": 5.620312690734863, + "rewards/rejected": -50.243751525878906, + "step": 3020 + }, + { + "epoch": 0.9696, + "grad_norm": 32.290821615623756, + "learning_rate": 6.144000000000001e-07, + "logits/chosen": -0.859570324420929, + "logits/rejected": -0.708325207233429, + "logps/chosen": -718.4000244140625, + "logps/rejected": -707.2000122070312, + "loss": 0.76, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -45.787498474121094, + "rewards/margins": 5.176953315734863, + "rewards/rejected": -50.974998474121094, + "step": 3030 + }, + { + "epoch": 0.9728, + "grad_norm": 36.98146302872118, + "learning_rate": 5.504000000000001e-07, + "logits/chosen": -0.817822277545929, + "logits/rejected": -0.620288074016571, + "logps/chosen": -697.5, + "logps/rejected": -696.2000122070312, + "loss": 0.7549, + "rewards/accuracies": 0.8125, + "rewards/chosen": -46.287498474121094, + "rewards/margins": 5.23828125, + "rewards/rejected": -51.53125, + "step": 3040 + }, + { + "epoch": 0.976, + "grad_norm": 15.540019445487696, + "learning_rate": 4.864e-07, + "logits/chosen": -0.868945300579071, + "logits/rejected": -0.7304443120956421, + "logps/chosen": -696.0, + "logps/rejected": -714.7000122070312, + "loss": 0.9413, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -45.98749923706055, + "rewards/margins": 4.785937309265137, + "rewards/rejected": -50.76874923706055, + "step": 3050 + }, + { + "epoch": 0.9792, + "grad_norm": 24.566892342952926, + "learning_rate": 4.224e-07, + "logits/chosen": -0.847485363483429, + "logits/rejected": -0.6754394769668579, + "logps/chosen": -712.2999877929688, + "logps/rejected": -718.7999877929688, + "loss": 0.8152, + "rewards/accuracies": 0.8125, + "rewards/chosen": -46.16875076293945, + "rewards/margins": 5.44921875, + "rewards/rejected": -51.599998474121094, + "step": 3060 + }, + { + "epoch": 0.9824, + "grad_norm": 16.268903048624235, + "learning_rate": 3.584e-07, + "logits/chosen": -0.846875011920929, + "logits/rejected": -0.7189697027206421, + "logps/chosen": -696.4000244140625, + "logps/rejected": -700.9000244140625, + "loss": 0.7778, + "rewards/accuracies": 0.78125, + "rewards/chosen": -46.150001525878906, + "rewards/margins": 4.963281154632568, + "rewards/rejected": -51.125, + "step": 3070 + }, + { + "epoch": 0.9856, + "grad_norm": 45.08062165104853, + "learning_rate": 2.9440000000000004e-07, + "logits/chosen": -0.8548828363418579, + "logits/rejected": -0.72186279296875, + "logps/chosen": -702.2999877929688, + "logps/rejected": -699.0999755859375, + "loss": 0.9371, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -46.368751525878906, + "rewards/margins": 4.705078125, + "rewards/rejected": -51.08124923706055, + "step": 3080 + }, + { + "epoch": 0.9888, + "grad_norm": 23.350810243354506, + "learning_rate": 2.3040000000000002e-07, + "logits/chosen": -0.851611316204071, + "logits/rejected": -0.7162841558456421, + "logps/chosen": -696.2000122070312, + "logps/rejected": -700.5, + "loss": 0.8579, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -46.67499923706055, + "rewards/margins": 4.52734375, + "rewards/rejected": -51.224998474121094, + "step": 3090 + }, + { + "epoch": 0.992, + "grad_norm": 25.578366828611138, + "learning_rate": 1.664e-07, + "logits/chosen": -0.8206542730331421, + "logits/rejected": -0.73602294921875, + "logps/chosen": -711.2000122070312, + "logps/rejected": -714.0, + "loss": 0.8973, + "rewards/accuracies": 0.784375011920929, + "rewards/chosen": -46.54375076293945, + "rewards/margins": 4.841406345367432, + "rewards/rejected": -51.381248474121094, + "step": 3100 + }, + { + "epoch": 0.9952, + "grad_norm": 24.117757080722892, + "learning_rate": 1.0240000000000002e-07, + "logits/chosen": -0.8827148675918579, + "logits/rejected": -0.6673828363418579, + "logps/chosen": -702.7000122070312, + "logps/rejected": -701.7999877929688, + "loss": 0.7748, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -46.212501525878906, + "rewards/margins": 4.952343940734863, + "rewards/rejected": -51.14374923706055, + "step": 3110 + }, + { + "epoch": 0.9984, + "grad_norm": 22.68071342499608, + "learning_rate": 3.8400000000000006e-08, + "logits/chosen": -0.818408191204071, + "logits/rejected": -0.71337890625, + "logps/chosen": -691.7000122070312, + "logps/rejected": -703.0999755859375, + "loss": 0.8272, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -46.16875076293945, + "rewards/margins": 5.1015625, + "rewards/rejected": -51.275001525878906, + "step": 3120 + } + ], + "logging_steps": 10, + "max_steps": 3125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}