diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7047 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9981298423724285, + "eval_steps": 500, + "global_step": 467, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0021373230029388193, + "grad_norm": 2.51713228225708, + "learning_rate": 1.0638297872340425e-08, + "logits/chosen": -1.1381689310073853, + "logits/rejected": -0.9913416504859924, + "logps/chosen": -0.2839311957359314, + "logps/rejected": -0.2955534756183624, + "loss": 1.6097, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5678623914718628, + "rewards/margins": 0.023244591429829597, + "rewards/rejected": -0.5911069512367249, + "step": 1 + }, + { + "epoch": 0.004274646005877639, + "grad_norm": 6.541850566864014, + "learning_rate": 2.127659574468085e-08, + "logits/chosen": -1.0311710834503174, + "logits/rejected": -0.8901023864746094, + "logps/chosen": -0.24952735006809235, + "logps/rejected": -0.24253402650356293, + "loss": 1.6096, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4990547001361847, + "rewards/margins": -0.013986671343445778, + "rewards/rejected": -0.48506805300712585, + "step": 2 + }, + { + "epoch": 0.006411969008816457, + "grad_norm": 5.6596479415893555, + "learning_rate": 3.191489361702127e-08, + "logits/chosen": -0.9279628992080688, + "logits/rejected": -0.8305555582046509, + "logps/chosen": -0.2633163630962372, + "logps/rejected": -0.26702702045440674, + "loss": 1.6174, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5266327261924744, + "rewards/margins": 0.007421246729791164, + "rewards/rejected": -0.5340540409088135, + "step": 3 + }, + { + "epoch": 0.008549292011755277, + "grad_norm": 3.8121635913848877, + "learning_rate": 4.25531914893617e-08, + "logits/chosen": -0.8504582047462463, + "logits/rejected": -0.7527742981910706, + "logps/chosen": -0.2771408259868622, + "logps/rejected": -0.26471394300460815, + "loss": 1.6393, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5542816519737244, + "rewards/margins": -0.024853792041540146, + "rewards/rejected": -0.5294278860092163, + "step": 4 + }, + { + "epoch": 0.010686615014694095, + "grad_norm": 6.048301696777344, + "learning_rate": 5.3191489361702123e-08, + "logits/chosen": -1.156632661819458, + "logits/rejected": -1.2128832340240479, + "logps/chosen": -0.28773820400238037, + "logps/rejected": -0.29937219619750977, + "loss": 1.6108, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5754764080047607, + "rewards/margins": 0.023267941549420357, + "rewards/rejected": -0.5987443923950195, + "step": 5 + }, + { + "epoch": 0.012823938017632914, + "grad_norm": 3.6442198753356934, + "learning_rate": 6.382978723404254e-08, + "logits/chosen": -1.0647015571594238, + "logits/rejected": -1.031942367553711, + "logps/chosen": -0.25931063294410706, + "logps/rejected": -0.28003033995628357, + "loss": 1.6079, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5186212658882141, + "rewards/margins": 0.04143940657377243, + "rewards/rejected": -0.5600606799125671, + "step": 6 + }, + { + "epoch": 0.014961261020571734, + "grad_norm": 5.595146656036377, + "learning_rate": 7.446808510638298e-08, + "logits/chosen": -0.7785481810569763, + "logits/rejected": -0.7654089331626892, + "logps/chosen": -0.25532105565071106, + "logps/rejected": -0.24814245104789734, + "loss": 1.6092, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5106421113014221, + "rewards/margins": -0.01435722503811121, + "rewards/rejected": -0.4962849020957947, + "step": 7 + }, + { + "epoch": 0.017098584023510555, + "grad_norm": 2.9471020698547363, + "learning_rate": 8.51063829787234e-08, + "logits/chosen": -1.0282069444656372, + "logits/rejected": -1.0483824014663696, + "logps/chosen": -0.24546000361442566, + "logps/rejected": -0.2658500373363495, + "loss": 1.5902, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4909200072288513, + "rewards/margins": 0.04078003019094467, + "rewards/rejected": -0.531700074672699, + "step": 8 + }, + { + "epoch": 0.01923590702644937, + "grad_norm": 3.132836103439331, + "learning_rate": 9.574468085106382e-08, + "logits/chosen": -0.9889479875564575, + "logits/rejected": -0.8638209104537964, + "logps/chosen": -0.27614107728004456, + "logps/rejected": -0.2566734254360199, + "loss": 1.6173, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.5522821545600891, + "rewards/margins": -0.03893527761101723, + "rewards/rejected": -0.5133468508720398, + "step": 9 + }, + { + "epoch": 0.02137323002938819, + "grad_norm": 5.624292850494385, + "learning_rate": 1.0638297872340425e-07, + "logits/chosen": -1.0719839334487915, + "logits/rejected": -1.0015329122543335, + "logps/chosen": -0.32535240054130554, + "logps/rejected": -0.31745338439941406, + "loss": 1.6211, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6507048010826111, + "rewards/margins": -0.015798063948750496, + "rewards/rejected": -0.6349067687988281, + "step": 10 + }, + { + "epoch": 0.02351055303232701, + "grad_norm": 5.1507039070129395, + "learning_rate": 1.1702127659574468e-07, + "logits/chosen": -0.9715439677238464, + "logits/rejected": -0.8908199071884155, + "logps/chosen": -0.2835432291030884, + "logps/rejected": -0.2507440745830536, + "loss": 1.612, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5670864582061768, + "rewards/margins": -0.0655982717871666, + "rewards/rejected": -0.5014881491661072, + "step": 11 + }, + { + "epoch": 0.02564787603526583, + "grad_norm": 2.2926666736602783, + "learning_rate": 1.2765957446808508e-07, + "logits/chosen": -0.9799962639808655, + "logits/rejected": -1.0184035301208496, + "logps/chosen": -0.29446908831596375, + "logps/rejected": -0.26765191555023193, + "loss": 1.6202, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5889381766319275, + "rewards/margins": -0.05363432317972183, + "rewards/rejected": -0.5353038311004639, + "step": 12 + }, + { + "epoch": 0.027785199038204648, + "grad_norm": 5.308409690856934, + "learning_rate": 1.3829787234042553e-07, + "logits/chosen": -0.8681848049163818, + "logits/rejected": -0.8799771070480347, + "logps/chosen": -0.3181426227092743, + "logps/rejected": -0.3121987581253052, + "loss": 1.6031, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6362852454185486, + "rewards/margins": -0.011887717992067337, + "rewards/rejected": -0.6243975162506104, + "step": 13 + }, + { + "epoch": 0.029922522041143467, + "grad_norm": 4.573068618774414, + "learning_rate": 1.4893617021276595e-07, + "logits/chosen": -0.8867932558059692, + "logits/rejected": -0.861649751663208, + "logps/chosen": -0.312772661447525, + "logps/rejected": -0.29462364315986633, + "loss": 1.6226, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.62554532289505, + "rewards/margins": -0.03629804030060768, + "rewards/rejected": -0.5892472863197327, + "step": 14 + }, + { + "epoch": 0.03205984504408229, + "grad_norm": 4.1025872230529785, + "learning_rate": 1.5957446808510638e-07, + "logits/chosen": -1.1116752624511719, + "logits/rejected": -0.9415389895439148, + "logps/chosen": -0.27133169770240784, + "logps/rejected": -0.29030919075012207, + "loss": 1.5818, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5426633954048157, + "rewards/margins": 0.037955012172460556, + "rewards/rejected": -0.5806183815002441, + "step": 15 + }, + { + "epoch": 0.03419716804702111, + "grad_norm": 3.307173728942871, + "learning_rate": 1.702127659574468e-07, + "logits/chosen": -0.9105625152587891, + "logits/rejected": -0.8872620463371277, + "logps/chosen": -0.2662544846534729, + "logps/rejected": -0.28296971321105957, + "loss": 1.6112, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5325089693069458, + "rewards/margins": 0.03343046456575394, + "rewards/rejected": -0.5659394264221191, + "step": 16 + }, + { + "epoch": 0.03633449104995993, + "grad_norm": 6.173768997192383, + "learning_rate": 1.8085106382978725e-07, + "logits/chosen": -0.7553848028182983, + "logits/rejected": -0.7946615815162659, + "logps/chosen": -0.277927964925766, + "logps/rejected": -0.28916075825691223, + "loss": 1.5928, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.555855929851532, + "rewards/margins": 0.02246551401913166, + "rewards/rejected": -0.5783215165138245, + "step": 17 + }, + { + "epoch": 0.03847181405289874, + "grad_norm": 3.708397626876831, + "learning_rate": 1.9148936170212765e-07, + "logits/chosen": -1.0742344856262207, + "logits/rejected": -1.1560362577438354, + "logps/chosen": -0.2530558407306671, + "logps/rejected": -0.2565101981163025, + "loss": 1.6245, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5061116814613342, + "rewards/margins": 0.006908770650625229, + "rewards/rejected": -0.513020396232605, + "step": 18 + }, + { + "epoch": 0.04060913705583756, + "grad_norm": 4.8654351234436035, + "learning_rate": 2.0212765957446807e-07, + "logits/chosen": -1.1306225061416626, + "logits/rejected": -1.0444625616073608, + "logps/chosen": -0.2724864184856415, + "logps/rejected": -0.2817416787147522, + "loss": 1.6247, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.544972836971283, + "rewards/margins": 0.018510470166802406, + "rewards/rejected": -0.5634833574295044, + "step": 19 + }, + { + "epoch": 0.04274646005877638, + "grad_norm": 3.5271363258361816, + "learning_rate": 2.127659574468085e-07, + "logits/chosen": -1.0348137617111206, + "logits/rejected": -1.0212081670761108, + "logps/chosen": -0.2397567480802536, + "logps/rejected": -0.23578569293022156, + "loss": 1.6172, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4795134961605072, + "rewards/margins": -0.007942091673612595, + "rewards/rejected": -0.4715713858604431, + "step": 20 + }, + { + "epoch": 0.0448837830617152, + "grad_norm": 7.901147842407227, + "learning_rate": 2.2340425531914892e-07, + "logits/chosen": -1.1679033041000366, + "logits/rejected": -1.0415174961090088, + "logps/chosen": -0.33534738421440125, + "logps/rejected": -0.27388396859169006, + "loss": 1.6502, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6706947684288025, + "rewards/margins": -0.12292689830064774, + "rewards/rejected": -0.5477679371833801, + "step": 21 + }, + { + "epoch": 0.04702110606465402, + "grad_norm": 2.3991823196411133, + "learning_rate": 2.3404255319148937e-07, + "logits/chosen": -1.0736172199249268, + "logits/rejected": -1.0771551132202148, + "logps/chosen": -0.2646552622318268, + "logps/rejected": -0.2733539938926697, + "loss": 1.6048, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5293105244636536, + "rewards/margins": 0.01739754155278206, + "rewards/rejected": -0.5467079877853394, + "step": 22 + }, + { + "epoch": 0.04915842906759284, + "grad_norm": 4.812252998352051, + "learning_rate": 2.4468085106382976e-07, + "logits/chosen": -0.8147614002227783, + "logits/rejected": -0.9166449904441833, + "logps/chosen": -0.28619590401649475, + "logps/rejected": -0.2908383309841156, + "loss": 1.5764, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5723918080329895, + "rewards/margins": 0.009284832514822483, + "rewards/rejected": -0.5816766619682312, + "step": 23 + }, + { + "epoch": 0.05129575207053166, + "grad_norm": 5.214301109313965, + "learning_rate": 2.5531914893617016e-07, + "logits/chosen": -1.0316184759140015, + "logits/rejected": -1.0412724018096924, + "logps/chosen": -0.23989242315292358, + "logps/rejected": -0.26728230714797974, + "loss": 1.602, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.47978484630584717, + "rewards/margins": 0.0547797717154026, + "rewards/rejected": -0.5345646142959595, + "step": 24 + }, + { + "epoch": 0.053433075073470476, + "grad_norm": 3.372835636138916, + "learning_rate": 2.659574468085106e-07, + "logits/chosen": -1.0795375108718872, + "logits/rejected": -0.9741866588592529, + "logps/chosen": -0.28838473558425903, + "logps/rejected": -0.32610005140304565, + "loss": 1.6016, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5767694711685181, + "rewards/margins": 0.07543070614337921, + "rewards/rejected": -0.6522001028060913, + "step": 25 + }, + { + "epoch": 0.055570398076409296, + "grad_norm": 3.9052999019622803, + "learning_rate": 2.7659574468085106e-07, + "logits/chosen": -1.2568001747131348, + "logits/rejected": -1.1107139587402344, + "logps/chosen": -0.30466389656066895, + "logps/rejected": -0.2980763614177704, + "loss": 1.6209, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6093277931213379, + "rewards/margins": -0.0131750563159585, + "rewards/rejected": -0.5961527228355408, + "step": 26 + }, + { + "epoch": 0.057707721079348115, + "grad_norm": 3.9069981575012207, + "learning_rate": 2.872340425531915e-07, + "logits/chosen": -1.0098018646240234, + "logits/rejected": -0.9794459342956543, + "logps/chosen": -0.2699134051799774, + "logps/rejected": -0.28315117955207825, + "loss": 1.6203, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5398268103599548, + "rewards/margins": 0.026475582271814346, + "rewards/rejected": -0.5663023591041565, + "step": 27 + }, + { + "epoch": 0.059845044082286934, + "grad_norm": 4.644921779632568, + "learning_rate": 2.978723404255319e-07, + "logits/chosen": -0.8839479088783264, + "logits/rejected": -0.9320971965789795, + "logps/chosen": -0.2668587565422058, + "logps/rejected": -0.27507418394088745, + "loss": 1.6176, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5337175130844116, + "rewards/margins": 0.016430813819169998, + "rewards/rejected": -0.5501483678817749, + "step": 28 + }, + { + "epoch": 0.061982367085225754, + "grad_norm": 3.2341363430023193, + "learning_rate": 3.085106382978723e-07, + "logits/chosen": -1.0859841108322144, + "logits/rejected": -1.0080296993255615, + "logps/chosen": -0.2636515498161316, + "logps/rejected": -0.2644122838973999, + "loss": 1.6185, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5273030996322632, + "rewards/margins": 0.0015215259045362473, + "rewards/rejected": -0.5288245677947998, + "step": 29 + }, + { + "epoch": 0.06411969008816458, + "grad_norm": 5.580157279968262, + "learning_rate": 3.1914893617021275e-07, + "logits/chosen": -1.170966386795044, + "logits/rejected": -0.9350689053535461, + "logps/chosen": -0.2749802768230438, + "logps/rejected": -0.2526704668998718, + "loss": 1.6164, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.5499605536460876, + "rewards/margins": -0.04461963474750519, + "rewards/rejected": -0.5053409337997437, + "step": 30 + }, + { + "epoch": 0.06625701309110339, + "grad_norm": 4.681908130645752, + "learning_rate": 3.2978723404255315e-07, + "logits/chosen": -1.0664238929748535, + "logits/rejected": -0.9249334335327148, + "logps/chosen": -0.26851335167884827, + "logps/rejected": -0.3246592581272125, + "loss": 1.5989, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5370267033576965, + "rewards/margins": 0.11229176819324493, + "rewards/rejected": -0.649318516254425, + "step": 31 + }, + { + "epoch": 0.06839433609404222, + "grad_norm": 7.798113822937012, + "learning_rate": 3.404255319148936e-07, + "logits/chosen": -0.8868415951728821, + "logits/rejected": -0.8269252777099609, + "logps/chosen": -0.26608309149742126, + "logps/rejected": -0.29178884625434875, + "loss": 1.6, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5321661829948425, + "rewards/margins": 0.051411453634500504, + "rewards/rejected": -0.5835776925086975, + "step": 32 + }, + { + "epoch": 0.07053165909698103, + "grad_norm": 3.8565964698791504, + "learning_rate": 3.5106382978723405e-07, + "logits/chosen": -1.075560450553894, + "logits/rejected": -0.9206546545028687, + "logps/chosen": -0.3033750355243683, + "logps/rejected": -0.2647935748100281, + "loss": 1.6255, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6067500710487366, + "rewards/margins": -0.07716288417577744, + "rewards/rejected": -0.5295871496200562, + "step": 33 + }, + { + "epoch": 0.07266898209991986, + "grad_norm": 4.738920211791992, + "learning_rate": 3.617021276595745e-07, + "logits/chosen": -1.0078967809677124, + "logits/rejected": -0.9841946363449097, + "logps/chosen": -0.29649823904037476, + "logps/rejected": -0.3331226706504822, + "loss": 1.6, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5929964780807495, + "rewards/margins": 0.07324886322021484, + "rewards/rejected": -0.6662453413009644, + "step": 34 + }, + { + "epoch": 0.07480630510285867, + "grad_norm": 15.601210594177246, + "learning_rate": 3.7234042553191484e-07, + "logits/chosen": -1.0271260738372803, + "logits/rejected": -1.0070686340332031, + "logps/chosen": -0.2500755488872528, + "logps/rejected": -0.2826491892337799, + "loss": 1.6131, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5001510977745056, + "rewards/margins": 0.06514722108840942, + "rewards/rejected": -0.5652983784675598, + "step": 35 + }, + { + "epoch": 0.07694362810579748, + "grad_norm": 7.088011264801025, + "learning_rate": 3.829787234042553e-07, + "logits/chosen": -0.7224124670028687, + "logits/rejected": -0.5971524119377136, + "logps/chosen": -0.2726445198059082, + "logps/rejected": -0.2940409481525421, + "loss": 1.603, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5452890396118164, + "rewards/margins": 0.04279275983572006, + "rewards/rejected": -0.5880818963050842, + "step": 36 + }, + { + "epoch": 0.07908095110873631, + "grad_norm": 4.70820426940918, + "learning_rate": 3.9361702127659574e-07, + "logits/chosen": -0.979728102684021, + "logits/rejected": -0.9153163433074951, + "logps/chosen": -0.27593153715133667, + "logps/rejected": -0.26201528310775757, + "loss": 1.6146, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5518630743026733, + "rewards/margins": -0.02783256769180298, + "rewards/rejected": -0.5240305662155151, + "step": 37 + }, + { + "epoch": 0.08121827411167512, + "grad_norm": 8.7505464553833, + "learning_rate": 4.0425531914893614e-07, + "logits/chosen": -0.8443434238433838, + "logits/rejected": -0.8855568170547485, + "logps/chosen": -0.29990217089653015, + "logps/rejected": -0.2905019521713257, + "loss": 1.6493, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5998043417930603, + "rewards/margins": -0.0188005194067955, + "rewards/rejected": -0.5810039043426514, + "step": 38 + }, + { + "epoch": 0.08335559711461395, + "grad_norm": 5.359803676605225, + "learning_rate": 4.148936170212766e-07, + "logits/chosen": -1.0560702085494995, + "logits/rejected": -1.1278265714645386, + "logps/chosen": -0.25392618775367737, + "logps/rejected": -0.2735791802406311, + "loss": 1.5949, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5078523755073547, + "rewards/margins": 0.039305973798036575, + "rewards/rejected": -0.5471583604812622, + "step": 39 + }, + { + "epoch": 0.08549292011755276, + "grad_norm": 3.1088671684265137, + "learning_rate": 4.25531914893617e-07, + "logits/chosen": -1.0592100620269775, + "logits/rejected": -1.0815989971160889, + "logps/chosen": -0.2885398864746094, + "logps/rejected": -0.2929195761680603, + "loss": 1.6321, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5770797729492188, + "rewards/margins": 0.00875941477715969, + "rewards/rejected": -0.5858391523361206, + "step": 40 + }, + { + "epoch": 0.08763024312049159, + "grad_norm": 8.073966026306152, + "learning_rate": 4.3617021276595744e-07, + "logits/chosen": -1.0096590518951416, + "logits/rejected": -0.8713966012001038, + "logps/chosen": -0.30629193782806396, + "logps/rejected": -0.33664122223854065, + "loss": 1.5914, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6125838756561279, + "rewards/margins": 0.06069856137037277, + "rewards/rejected": -0.6732824444770813, + "step": 41 + }, + { + "epoch": 0.0897675661234304, + "grad_norm": 5.209786891937256, + "learning_rate": 4.4680851063829783e-07, + "logits/chosen": -1.0377849340438843, + "logits/rejected": -0.8914337754249573, + "logps/chosen": -0.2845829427242279, + "logps/rejected": -0.3244422674179077, + "loss": 1.6121, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5691658854484558, + "rewards/margins": 0.079718679189682, + "rewards/rejected": -0.6488845348358154, + "step": 42 + }, + { + "epoch": 0.09190488912636922, + "grad_norm": 4.667476177215576, + "learning_rate": 4.574468085106383e-07, + "logits/chosen": -0.7347361445426941, + "logits/rejected": -0.7869642376899719, + "logps/chosen": -0.3507947623729706, + "logps/rejected": -0.27199897170066833, + "loss": 1.6222, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7015895247459412, + "rewards/margins": -0.1575915366411209, + "rewards/rejected": -0.5439979434013367, + "step": 43 + }, + { + "epoch": 0.09404221212930804, + "grad_norm": 14.311481475830078, + "learning_rate": 4.6808510638297873e-07, + "logits/chosen": -0.8943421840667725, + "logits/rejected": -0.836614727973938, + "logps/chosen": -0.4167774021625519, + "logps/rejected": -0.430794894695282, + "loss": 1.597, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8335548043251038, + "rewards/margins": 0.028035037219524384, + "rewards/rejected": -0.861589789390564, + "step": 44 + }, + { + "epoch": 0.09617953513224686, + "grad_norm": 3.08385968208313, + "learning_rate": 4.787234042553192e-07, + "logits/chosen": -0.9741953015327454, + "logits/rejected": -0.8605018258094788, + "logps/chosen": -0.2905868887901306, + "logps/rejected": -0.29014959931373596, + "loss": 1.6179, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5811737775802612, + "rewards/margins": -0.0008745882660150528, + "rewards/rejected": -0.5802991986274719, + "step": 45 + }, + { + "epoch": 0.09831685813518568, + "grad_norm": 3.141914129257202, + "learning_rate": 4.893617021276595e-07, + "logits/chosen": -0.8467612266540527, + "logits/rejected": -0.8879311084747314, + "logps/chosen": -0.2710065543651581, + "logps/rejected": -0.28622525930404663, + "loss": 1.6098, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5420131087303162, + "rewards/margins": 0.03043745458126068, + "rewards/rejected": -0.5724505186080933, + "step": 46 + }, + { + "epoch": 0.1004541811381245, + "grad_norm": 5.874278545379639, + "learning_rate": 5e-07, + "logits/chosen": -0.9935128688812256, + "logits/rejected": -1.0635360479354858, + "logps/chosen": -0.2610815465450287, + "logps/rejected": -0.2970622777938843, + "loss": 1.5882, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5221630930900574, + "rewards/margins": 0.07196150720119476, + "rewards/rejected": -0.5941245555877686, + "step": 47 + }, + { + "epoch": 0.10259150414106331, + "grad_norm": 3.674631118774414, + "learning_rate": 4.999930062653174e-07, + "logits/chosen": -0.7607293725013733, + "logits/rejected": -0.9387491941452026, + "logps/chosen": -0.30105069279670715, + "logps/rejected": -0.29622718691825867, + "loss": 1.6263, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6021013855934143, + "rewards/margins": -0.009646959602832794, + "rewards/rejected": -0.5924543738365173, + "step": 48 + }, + { + "epoch": 0.10472882714400214, + "grad_norm": 3.2993836402893066, + "learning_rate": 4.999720254525684e-07, + "logits/chosen": -1.041825294494629, + "logits/rejected": -0.8979977965354919, + "logps/chosen": -0.3147028684616089, + "logps/rejected": -0.32463401556015015, + "loss": 1.5836, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6294057369232178, + "rewards/margins": 0.019862275570631027, + "rewards/rejected": -0.6492680311203003, + "step": 49 + }, + { + "epoch": 0.10686615014694095, + "grad_norm": 3.6394598484039307, + "learning_rate": 4.999370587356267e-07, + "logits/chosen": -1.0319520235061646, + "logits/rejected": -0.9399799108505249, + "logps/chosen": -0.3198903501033783, + "logps/rejected": -0.33650463819503784, + "loss": 1.61, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6397807002067566, + "rewards/margins": 0.03322865813970566, + "rewards/rejected": -0.6730092763900757, + "step": 50 + }, + { + "epoch": 0.10900347314987978, + "grad_norm": 3.5822248458862305, + "learning_rate": 4.998881080708758e-07, + "logits/chosen": -0.7624353170394897, + "logits/rejected": -0.7781803011894226, + "logps/chosen": -0.22195202112197876, + "logps/rejected": -0.2529197931289673, + "loss": 1.6014, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4439040422439575, + "rewards/margins": 0.061935484409332275, + "rewards/rejected": -0.5058395862579346, + "step": 51 + }, + { + "epoch": 0.11114079615281859, + "grad_norm": 4.502132415771484, + "learning_rate": 4.998251761970996e-07, + "logits/chosen": -0.934096097946167, + "logits/rejected": -0.9894377589225769, + "logps/chosen": -0.3010854721069336, + "logps/rejected": -0.2971184551715851, + "loss": 1.6238, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6021709442138672, + "rewards/margins": -0.00793398916721344, + "rewards/rejected": -0.5942369103431702, + "step": 52 + }, + { + "epoch": 0.11327811915575742, + "grad_norm": 14.495563507080078, + "learning_rate": 4.997482666353286e-07, + "logits/chosen": -0.9065138101577759, + "logits/rejected": -0.8083285093307495, + "logps/chosen": -0.2879031002521515, + "logps/rejected": -0.30471161007881165, + "loss": 1.6036, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.575806200504303, + "rewards/margins": 0.03361699730157852, + "rewards/rejected": -0.6094232201576233, + "step": 53 + }, + { + "epoch": 0.11541544215869623, + "grad_norm": 5.210042953491211, + "learning_rate": 4.996573836886434e-07, + "logits/chosen": -1.012821912765503, + "logits/rejected": -0.935365617275238, + "logps/chosen": -0.27059802412986755, + "logps/rejected": -0.28305694460868835, + "loss": 1.5922, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5411960482597351, + "rewards/margins": 0.024917850270867348, + "rewards/rejected": -0.5661138892173767, + "step": 54 + }, + { + "epoch": 0.11755276516163506, + "grad_norm": 3.4929800033569336, + "learning_rate": 4.995525324419337e-07, + "logits/chosen": -1.03290593624115, + "logits/rejected": -0.8397963047027588, + "logps/chosen": -0.23197168111801147, + "logps/rejected": -0.257206529378891, + "loss": 1.6012, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46394336223602295, + "rewards/margins": 0.05046967417001724, + "rewards/rejected": -0.514413058757782, + "step": 55 + }, + { + "epoch": 0.11969008816457387, + "grad_norm": 6.639918804168701, + "learning_rate": 4.99433718761614e-07, + "logits/chosen": -0.8676168918609619, + "logits/rejected": -0.8751212954521179, + "logps/chosen": -0.2813611626625061, + "logps/rejected": -0.28943243622779846, + "loss": 1.602, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5627223253250122, + "rewards/margins": 0.01614254154264927, + "rewards/rejected": -0.5788648724555969, + "step": 56 + }, + { + "epoch": 0.1218274111675127, + "grad_norm": 3.159461736679077, + "learning_rate": 4.993009492952949e-07, + "logits/chosen": -0.9598115682601929, + "logits/rejected": -0.9728808999061584, + "logps/chosen": -0.2418256551027298, + "logps/rejected": -0.27858078479766846, + "loss": 1.6025, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4836513102054596, + "rewards/margins": 0.07351024448871613, + "rewards/rejected": -0.5571615695953369, + "step": 57 + }, + { + "epoch": 0.12396473417045151, + "grad_norm": 3.1260619163513184, + "learning_rate": 4.991542314714122e-07, + "logits/chosen": -1.1715333461761475, + "logits/rejected": -1.0372506380081177, + "logps/chosen": -0.2886565625667572, + "logps/rejected": -0.3048909306526184, + "loss": 1.6142, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5773131251335144, + "rewards/margins": 0.03246863931417465, + "rewards/rejected": -0.6097818613052368, + "step": 58 + }, + { + "epoch": 0.12610205717339032, + "grad_norm": 4.803882598876953, + "learning_rate": 4.989935734988097e-07, + "logits/chosen": -0.8652929663658142, + "logits/rejected": -0.9138813018798828, + "logps/chosen": -0.22791269421577454, + "logps/rejected": -0.2620168924331665, + "loss": 1.5999, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4558253884315491, + "rewards/margins": 0.06820837408304214, + "rewards/rejected": -0.524033784866333, + "step": 59 + }, + { + "epoch": 0.12823938017632916, + "grad_norm": 2.9545466899871826, + "learning_rate": 4.988189843662815e-07, + "logits/chosen": -0.9540647864341736, + "logits/rejected": -0.9105108380317688, + "logps/chosen": -0.28050848841667175, + "logps/rejected": -0.2682150602340698, + "loss": 1.6229, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5610169768333435, + "rewards/margins": -0.024586813524365425, + "rewards/rejected": -0.5364301204681396, + "step": 60 + }, + { + "epoch": 0.13037670317926797, + "grad_norm": 5.4623260498046875, + "learning_rate": 4.986304738420683e-07, + "logits/chosen": -0.8594868779182434, + "logits/rejected": -0.8749207854270935, + "logps/chosen": -0.23750001192092896, + "logps/rejected": -0.24768495559692383, + "loss": 1.5863, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4750000238418579, + "rewards/margins": 0.020369907841086388, + "rewards/rejected": -0.49536991119384766, + "step": 61 + }, + { + "epoch": 0.13251402618220678, + "grad_norm": 5.195383548736572, + "learning_rate": 4.984280524733107e-07, + "logits/chosen": -0.8988451361656189, + "logits/rejected": -1.0471916198730469, + "logps/chosen": -0.2563616931438446, + "logps/rejected": -0.264529824256897, + "loss": 1.628, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5127233862876892, + "rewards/margins": 0.01633620262145996, + "rewards/rejected": -0.529059648513794, + "step": 62 + }, + { + "epoch": 0.1346513491851456, + "grad_norm": 3.365633249282837, + "learning_rate": 4.982117315854593e-07, + "logits/chosen": -0.9563354253768921, + "logits/rejected": -1.143921971321106, + "logps/chosen": -0.27564752101898193, + "logps/rejected": -0.2891802191734314, + "loss": 1.6243, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5512950420379639, + "rewards/margins": 0.02706541307270527, + "rewards/rejected": -0.5783604383468628, + "step": 63 + }, + { + "epoch": 0.13678867218808444, + "grad_norm": 3.6298470497131348, + "learning_rate": 4.979815232816416e-07, + "logits/chosen": -0.9835873246192932, + "logits/rejected": -0.8579452037811279, + "logps/chosen": -0.2935434579849243, + "logps/rejected": -0.26197710633277893, + "loss": 1.6428, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.5870869159698486, + "rewards/margins": -0.06313266605138779, + "rewards/rejected": -0.5239542126655579, + "step": 64 + }, + { + "epoch": 0.13892599519102325, + "grad_norm": 5.261904239654541, + "learning_rate": 4.977374404419837e-07, + "logits/chosen": -1.0193111896514893, + "logits/rejected": -1.036008358001709, + "logps/chosen": -0.27654433250427246, + "logps/rejected": -0.25757479667663574, + "loss": 1.5985, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6913608908653259, + "rewards/margins": -0.047423895448446274, + "rewards/rejected": -0.6439369320869446, + "step": 65 + }, + { + "epoch": 0.14106331819396206, + "grad_norm": 3.326939582824707, + "learning_rate": 4.974794967228907e-07, + "logits/chosen": -1.0054104328155518, + "logits/rejected": -0.9754442572593689, + "logps/chosen": -0.2905897796154022, + "logps/rejected": -0.32264938950538635, + "loss": 1.6248, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7264744639396667, + "rewards/margins": 0.08014895021915436, + "rewards/rejected": -0.8066234588623047, + "step": 66 + }, + { + "epoch": 0.14320064119690087, + "grad_norm": 5.669600486755371, + "learning_rate": 4.972077065562821e-07, + "logits/chosen": -0.9552958607673645, + "logits/rejected": -1.0761511325836182, + "logps/chosen": -0.3276459574699402, + "logps/rejected": -0.32107335329055786, + "loss": 1.6203, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8191148638725281, + "rewards/margins": -0.01643138751387596, + "rewards/rejected": -0.8026834726333618, + "step": 67 + }, + { + "epoch": 0.14533796419983971, + "grad_norm": 3.257904052734375, + "learning_rate": 4.969220851487844e-07, + "logits/chosen": -0.9927914142608643, + "logits/rejected": -0.9472739696502686, + "logps/chosen": -0.3458186686038971, + "logps/rejected": -0.34241756796836853, + "loss": 1.6191, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8645466566085815, + "rewards/margins": -0.008502773940563202, + "rewards/rejected": -0.8560439348220825, + "step": 68 + }, + { + "epoch": 0.14747528720277853, + "grad_norm": 5.560789585113525, + "learning_rate": 4.966226484808803e-07, + "logits/chosen": -0.9344061613082886, + "logits/rejected": -0.8273663520812988, + "logps/chosen": -0.2849215567111969, + "logps/rejected": -0.31608855724334717, + "loss": 1.6123, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7123039960861206, + "rewards/margins": 0.0779174268245697, + "rewards/rejected": -0.7902213335037231, + "step": 69 + }, + { + "epoch": 0.14961261020571734, + "grad_norm": 3.70934796333313, + "learning_rate": 4.963094133060148e-07, + "logits/chosen": -0.9611161947250366, + "logits/rejected": -0.8749902248382568, + "logps/chosen": -0.2869144380092621, + "logps/rejected": -0.23931002616882324, + "loss": 1.6348, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7172860503196716, + "rewards/margins": -0.11901099979877472, + "rewards/rejected": -0.5982750654220581, + "step": 70 + }, + { + "epoch": 0.15174993320865615, + "grad_norm": 3.771442413330078, + "learning_rate": 4.959823971496574e-07, + "logits/chosen": -1.0483484268188477, + "logits/rejected": -0.9827014803886414, + "logps/chosen": -0.3061015009880066, + "logps/rejected": -0.3094024658203125, + "loss": 1.5879, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7652537822723389, + "rewards/margins": 0.00825244840234518, + "rewards/rejected": -0.7735061645507812, + "step": 71 + }, + { + "epoch": 0.15388725621159496, + "grad_norm": 3.6872432231903076, + "learning_rate": 4.956416183083221e-07, + "logits/chosen": -1.0115149021148682, + "logits/rejected": -1.0020099878311157, + "logps/chosen": -0.26311925053596497, + "logps/rejected": -0.27171316742897034, + "loss": 1.5697, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6577981114387512, + "rewards/margins": 0.02148478478193283, + "rewards/rejected": -0.6792829036712646, + "step": 72 + }, + { + "epoch": 0.1560245792145338, + "grad_norm": 7.885510444641113, + "learning_rate": 4.952870958485431e-07, + "logits/chosen": -0.7439613938331604, + "logits/rejected": -0.7543243169784546, + "logps/chosen": -0.32277047634124756, + "logps/rejected": -0.44049495458602905, + "loss": 1.5719, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8069261312484741, + "rewards/margins": 0.2943112254142761, + "rewards/rejected": -1.1012372970581055, + "step": 73 + }, + { + "epoch": 0.15816190221747262, + "grad_norm": 14.005949020385742, + "learning_rate": 4.949188496058089e-07, + "logits/chosen": -0.8661502599716187, + "logits/rejected": -0.9138545989990234, + "logps/chosen": -0.27060988545417786, + "logps/rejected": -0.25004029273986816, + "loss": 1.6381, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6765246987342834, + "rewards/margins": -0.05142403393983841, + "rewards/rejected": -0.6251006722450256, + "step": 74 + }, + { + "epoch": 0.16029922522041143, + "grad_norm": 7.343827247619629, + "learning_rate": 4.945369001834514e-07, + "logits/chosen": -1.07318115234375, + "logits/rejected": -1.0178194046020508, + "logps/chosen": -0.2654929459095001, + "logps/rejected": -0.29686206579208374, + "loss": 1.5458, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6637323498725891, + "rewards/margins": 0.07842274755239487, + "rewards/rejected": -0.7421550750732422, + "step": 75 + }, + { + "epoch": 0.16243654822335024, + "grad_norm": 4.01411247253418, + "learning_rate": 4.941412689514941e-07, + "logits/chosen": -1.162184238433838, + "logits/rejected": -1.2236565351486206, + "logps/chosen": -0.2647251486778259, + "logps/rejected": -0.2977098226547241, + "loss": 1.6206, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6618129014968872, + "rewards/margins": 0.08246167004108429, + "rewards/rejected": -0.7442746162414551, + "step": 76 + }, + { + "epoch": 0.16457387122628908, + "grad_norm": 4.315869331359863, + "learning_rate": 4.937319780454559e-07, + "logits/chosen": -0.8569203019142151, + "logits/rejected": -0.7959333062171936, + "logps/chosen": -0.29102951288223267, + "logps/rejected": -0.31862419843673706, + "loss": 1.5992, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.727573812007904, + "rewards/margins": 0.06898671388626099, + "rewards/rejected": -0.7965604662895203, + "step": 77 + }, + { + "epoch": 0.1667111942292279, + "grad_norm": 6.3516645431518555, + "learning_rate": 4.933090503651128e-07, + "logits/chosen": -0.9815778136253357, + "logits/rejected": -0.9455960988998413, + "logps/chosen": -0.290622353553772, + "logps/rejected": -0.25920000672340393, + "loss": 1.6019, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7265558838844299, + "rewards/margins": -0.07855589687824249, + "rewards/rejected": -0.6479999423027039, + "step": 78 + }, + { + "epoch": 0.1688485172321667, + "grad_norm": 5.576763153076172, + "learning_rate": 4.928725095732168e-07, + "logits/chosen": -0.7572908401489258, + "logits/rejected": -0.8643375039100647, + "logps/chosen": -0.28876882791519165, + "logps/rejected": -0.38018882274627686, + "loss": 1.574, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.721921980381012, + "rewards/margins": 0.22855007648468018, + "rewards/rejected": -0.9504721164703369, + "step": 79 + }, + { + "epoch": 0.17098584023510552, + "grad_norm": 6.904773235321045, + "learning_rate": 4.924223800941717e-07, + "logits/chosen": -1.1600089073181152, + "logits/rejected": -1.001929759979248, + "logps/chosen": -0.3185364007949829, + "logps/rejected": -0.2833505868911743, + "loss": 1.5885, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.796341061592102, + "rewards/margins": -0.08796463906764984, + "rewards/rejected": -0.708376407623291, + "step": 80 + }, + { + "epoch": 0.17312316323804436, + "grad_norm": 5.542295932769775, + "learning_rate": 4.919586871126667e-07, + "logits/chosen": -1.1290327310562134, + "logits/rejected": -1.0776805877685547, + "logps/chosen": -0.28904592990875244, + "logps/rejected": -0.32642093300819397, + "loss": 1.5823, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7226147651672363, + "rewards/margins": 0.09343745559453964, + "rewards/rejected": -0.816052258014679, + "step": 81 + }, + { + "epoch": 0.17526048624098317, + "grad_norm": 7.346787929534912, + "learning_rate": 4.91481456572267e-07, + "logits/chosen": -1.008028507232666, + "logits/rejected": -0.7614388465881348, + "logps/chosen": -0.276422917842865, + "logps/rejected": -0.28925520181655884, + "loss": 1.5471, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6910573244094849, + "rewards/margins": 0.03208072483539581, + "rewards/rejected": -0.7231380343437195, + "step": 82 + }, + { + "epoch": 0.17739780924392198, + "grad_norm": 3.4823226928710938, + "learning_rate": 4.909907151739633e-07, + "logits/chosen": -0.8054043650627136, + "logits/rejected": -0.8212348222732544, + "logps/chosen": -0.25493186712265015, + "logps/rejected": -0.2324959635734558, + "loss": 1.6075, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.637329638004303, + "rewards/margins": -0.05608966201543808, + "rewards/rejected": -0.5812399983406067, + "step": 83 + }, + { + "epoch": 0.1795351322468608, + "grad_norm": 8.512069702148438, + "learning_rate": 4.904864903746765e-07, + "logits/chosen": -0.8016963601112366, + "logits/rejected": -0.8472069501876831, + "logps/chosen": -0.3017991781234741, + "logps/rejected": -0.30026912689208984, + "loss": 1.6386, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7544978857040405, + "rewards/margins": -0.003825142979621887, + "rewards/rejected": -0.7506727576255798, + "step": 84 + }, + { + "epoch": 0.18167245524979964, + "grad_norm": 3.27510666847229, + "learning_rate": 4.899688103857222e-07, + "logits/chosen": -0.9057269096374512, + "logits/rejected": -0.8979475498199463, + "logps/chosen": -0.24959440529346466, + "logps/rejected": -0.3118637502193451, + "loss": 1.5594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.623986005783081, + "rewards/margins": 0.15567341446876526, + "rewards/rejected": -0.7796593904495239, + "step": 85 + }, + { + "epoch": 0.18380977825273845, + "grad_norm": 3.2452337741851807, + "learning_rate": 4.894377041712326e-07, + "logits/chosen": -0.6997116208076477, + "logits/rejected": -0.6495150327682495, + "logps/chosen": -0.2519880533218384, + "logps/rejected": -0.30695033073425293, + "loss": 1.585, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6299701929092407, + "rewards/margins": 0.137405663728714, + "rewards/rejected": -0.7673758268356323, + "step": 86 + }, + { + "epoch": 0.18594710125567726, + "grad_norm": 6.7908430099487305, + "learning_rate": 4.888932014465352e-07, + "logits/chosen": -0.8975124359130859, + "logits/rejected": -0.8113777041435242, + "logps/chosen": -0.2837047576904297, + "logps/rejected": -0.2963961958885193, + "loss": 1.5927, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.709261953830719, + "rewards/margins": 0.03172856196761131, + "rewards/rejected": -0.7409905195236206, + "step": 87 + }, + { + "epoch": 0.18808442425861607, + "grad_norm": 4.965595722198486, + "learning_rate": 4.883353326764906e-07, + "logits/chosen": -0.8913217186927795, + "logits/rejected": -0.8421756625175476, + "logps/chosen": -0.25936800241470337, + "logps/rejected": -0.45224201679229736, + "loss": 1.5572, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.648419976234436, + "rewards/margins": 0.4821849763393402, + "rewards/rejected": -1.1306049823760986, + "step": 88 + }, + { + "epoch": 0.1902217472615549, + "grad_norm": 5.781017780303955, + "learning_rate": 4.877641290737883e-07, + "logits/chosen": -0.9931791424751282, + "logits/rejected": -0.9962902665138245, + "logps/chosen": -0.2539224624633789, + "logps/rejected": -0.2921288311481476, + "loss": 1.6042, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6348061561584473, + "rewards/margins": 0.0955159068107605, + "rewards/rejected": -0.730322003364563, + "step": 89 + }, + { + "epoch": 0.19235907026449373, + "grad_norm": 5.002528190612793, + "learning_rate": 4.871796225971999e-07, + "logits/chosen": -0.9850423336029053, + "logits/rejected": -0.857207179069519, + "logps/chosen": -0.27758607268333435, + "logps/rejected": -0.3058236241340637, + "loss": 1.5967, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6939651966094971, + "rewards/margins": 0.0705939531326294, + "rewards/rejected": -0.7645590901374817, + "step": 90 + }, + { + "epoch": 0.19449639326743254, + "grad_norm": 5.870463848114014, + "learning_rate": 4.86581845949791e-07, + "logits/chosen": -0.949233889579773, + "logits/rejected": -1.0075958967208862, + "logps/chosen": -0.2556777596473694, + "logps/rejected": -0.2905680537223816, + "loss": 1.5559, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6391944289207458, + "rewards/margins": 0.08722572773694992, + "rewards/rejected": -0.7264201045036316, + "step": 91 + }, + { + "epoch": 0.19663371627037135, + "grad_norm": 4.762439727783203, + "learning_rate": 4.859708325770919e-07, + "logits/chosen": -1.1257095336914062, + "logits/rejected": -1.173663854598999, + "logps/chosen": -0.28581249713897705, + "logps/rejected": -0.3704802989959717, + "loss": 1.5915, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7145313024520874, + "rewards/margins": 0.21166956424713135, + "rewards/rejected": -0.926200807094574, + "step": 92 + }, + { + "epoch": 0.1987710392733102, + "grad_norm": 5.9837117195129395, + "learning_rate": 4.853466166652258e-07, + "logits/chosen": -0.9948515295982361, + "logits/rejected": -0.9665160179138184, + "logps/chosen": -0.2551361620426178, + "logps/rejected": -0.28811219334602356, + "loss": 1.5882, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6378403902053833, + "rewards/margins": 0.08244016021490097, + "rewards/rejected": -0.7202805280685425, + "step": 93 + }, + { + "epoch": 0.200908362276249, + "grad_norm": 4.58953857421875, + "learning_rate": 4.847092331389964e-07, + "logits/chosen": -0.7557870149612427, + "logits/rejected": -0.7804038524627686, + "logps/chosen": -0.26233580708503723, + "logps/rejected": -0.28490206599235535, + "loss": 1.6059, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6558394432067871, + "rewards/margins": 0.056415725499391556, + "rewards/rejected": -0.7122551798820496, + "step": 94 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 4.404232978820801, + "learning_rate": 4.840587176599343e-07, + "logits/chosen": -1.1708656549453735, + "logits/rejected": -1.1824274063110352, + "logps/chosen": -0.3498944640159607, + "logps/rejected": -0.3052523732185364, + "loss": 1.5518, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8747361898422241, + "rewards/margins": -0.1116051897406578, + "rewards/rejected": -0.7631310224533081, + "step": 95 + }, + { + "epoch": 0.20518300828212663, + "grad_norm": 2.755133628845215, + "learning_rate": 4.833951066243004e-07, + "logits/chosen": -0.9821409583091736, + "logits/rejected": -0.9246101975440979, + "logps/chosen": -0.29376041889190674, + "logps/rejected": -0.2656431794166565, + "loss": 1.6092, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7344010472297668, + "rewards/margins": -0.07029299437999725, + "rewards/rejected": -0.6641080379486084, + "step": 96 + }, + { + "epoch": 0.20732033128506547, + "grad_norm": 8.961865425109863, + "learning_rate": 4.82718437161051e-07, + "logits/chosen": -0.9781126976013184, + "logits/rejected": -1.0274431705474854, + "logps/chosen": -0.2688814699649811, + "logps/rejected": -0.25695058703422546, + "loss": 1.6414, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6722037196159363, + "rewards/margins": -0.029827285557985306, + "rewards/rejected": -0.6423764228820801, + "step": 97 + }, + { + "epoch": 0.20945765428800428, + "grad_norm": 3.496291160583496, + "learning_rate": 4.820287471297597e-07, + "logits/chosen": -1.110063076019287, + "logits/rejected": -0.9798667430877686, + "logps/chosen": -0.2772579789161682, + "logps/rejected": -0.284493625164032, + "loss": 1.6027, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6931449174880981, + "rewards/margins": 0.01808912120759487, + "rewards/rejected": -0.7112340927124023, + "step": 98 + }, + { + "epoch": 0.2115949772909431, + "grad_norm": 9.36062240600586, + "learning_rate": 4.813260751184992e-07, + "logits/chosen": -1.0408313274383545, + "logits/rejected": -0.9097151160240173, + "logps/chosen": -0.2336195558309555, + "logps/rejected": -0.28545060753822327, + "loss": 1.5888, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5840489268302917, + "rewards/margins": 0.12957759201526642, + "rewards/rejected": -0.7136265635490417, + "step": 99 + }, + { + "epoch": 0.2137323002938819, + "grad_norm": 4.372857570648193, + "learning_rate": 4.806104604416823e-07, + "logits/chosen": -1.1981866359710693, + "logits/rejected": -1.1812773942947388, + "logps/chosen": -0.40545234084129333, + "logps/rejected": -0.32747963070869446, + "loss": 1.6366, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0136308670043945, + "rewards/margins": -0.194931760430336, + "rewards/rejected": -0.8186991214752197, + "step": 100 + }, + { + "epoch": 0.21586962329682075, + "grad_norm": 6.457290172576904, + "learning_rate": 4.798819431378626e-07, + "logits/chosen": -0.9583615064620972, + "logits/rejected": -0.9292630553245544, + "logps/chosen": -0.2667827904224396, + "logps/rejected": -0.3141520917415619, + "loss": 1.5717, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6669570207595825, + "rewards/margins": 0.11842326819896698, + "rewards/rejected": -0.7853802442550659, + "step": 101 + }, + { + "epoch": 0.21800694629975956, + "grad_norm": 4.259753704071045, + "learning_rate": 4.79140563967494e-07, + "logits/chosen": -0.9554131031036377, + "logits/rejected": -0.9235316514968872, + "logps/chosen": -0.2790910005569458, + "logps/rejected": -0.29358065128326416, + "loss": 1.5956, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6977274417877197, + "rewards/margins": 0.03622422739863396, + "rewards/rejected": -0.7339516878128052, + "step": 102 + }, + { + "epoch": 0.22014426930269837, + "grad_norm": 7.2334675788879395, + "learning_rate": 4.783863644106502e-07, + "logits/chosen": -0.958928108215332, + "logits/rejected": -0.9119776487350464, + "logps/chosen": -0.2611943483352661, + "logps/rejected": -0.29755640029907227, + "loss": 1.5837, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6529859304428101, + "rewards/margins": 0.09090512990951538, + "rewards/rejected": -0.7438910007476807, + "step": 103 + }, + { + "epoch": 0.22228159230563718, + "grad_norm": 3.2508132457733154, + "learning_rate": 4.776193866647039e-07, + "logits/chosen": -1.073838710784912, + "logits/rejected": -0.9083616733551025, + "logps/chosen": -0.2882213294506073, + "logps/rejected": -0.2734883427619934, + "loss": 1.606, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7205533385276794, + "rewards/margins": -0.03683248162269592, + "rewards/rejected": -0.6837208867073059, + "step": 104 + }, + { + "epoch": 0.224418915308576, + "grad_norm": 5.259939193725586, + "learning_rate": 4.768396736419662e-07, + "logits/chosen": -0.9633040428161621, + "logits/rejected": -0.9958257675170898, + "logps/chosen": -0.2812567949295044, + "logps/rejected": -0.3445313572883606, + "loss": 1.6165, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.703141987323761, + "rewards/margins": 0.1581864058971405, + "rewards/rejected": -0.8613283634185791, + "step": 105 + }, + { + "epoch": 0.22655623831151483, + "grad_norm": 4.128396511077881, + "learning_rate": 4.7604726896728496e-07, + "logits/chosen": -0.898779571056366, + "logits/rejected": -0.8008460998535156, + "logps/chosen": -0.3449317216873169, + "logps/rejected": -0.3174844980239868, + "loss": 1.581, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.862329363822937, + "rewards/margins": -0.06861816346645355, + "rewards/rejected": -0.793711245059967, + "step": 106 + }, + { + "epoch": 0.22869356131445365, + "grad_norm": 7.073979377746582, + "learning_rate": 4.752422169756047e-07, + "logits/chosen": -0.7736971378326416, + "logits/rejected": -0.7141239643096924, + "logps/chosen": -0.27883169054985046, + "logps/rejected": -0.28934141993522644, + "loss": 1.6038, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6970791816711426, + "rewards/margins": 0.02627432905137539, + "rewards/rejected": -0.7233536243438721, + "step": 107 + }, + { + "epoch": 0.23083088431739246, + "grad_norm": 3.607881784439087, + "learning_rate": 4.744245627094858e-07, + "logits/chosen": -0.7748329043388367, + "logits/rejected": -0.7313745021820068, + "logps/chosen": -0.3115028142929077, + "logps/rejected": -0.3770483732223511, + "loss": 1.6484, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7787570357322693, + "rewards/margins": 0.1638639271259308, + "rewards/rejected": -0.9426208734512329, + "step": 108 + }, + { + "epoch": 0.23296820732033127, + "grad_norm": 6.025390148162842, + "learning_rate": 4.735943519165842e-07, + "logits/chosen": -0.8779905438423157, + "logits/rejected": -0.9295673966407776, + "logps/chosen": -0.2856750786304474, + "logps/rejected": -0.3149481415748596, + "loss": 1.6163, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7141878008842468, + "rewards/margins": 0.07318252325057983, + "rewards/rejected": -0.7873702645301819, + "step": 109 + }, + { + "epoch": 0.2351055303232701, + "grad_norm": 10.654799461364746, + "learning_rate": 4.7275163104709194e-07, + "logits/chosen": -1.139617681503296, + "logits/rejected": -1.037335753440857, + "logps/chosen": -0.3125270903110504, + "logps/rejected": -0.42321839928627014, + "loss": 1.5969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7813177704811096, + "rewards/margins": 0.27672821283340454, + "rewards/rejected": -1.0580458641052246, + "step": 110 + }, + { + "epoch": 0.23724285332620892, + "grad_norm": 3.1225905418395996, + "learning_rate": 4.718964472511385e-07, + "logits/chosen": -0.7755342125892639, + "logits/rejected": -0.9119763374328613, + "logps/chosen": -0.26263684034347534, + "logps/rejected": -0.2584255635738373, + "loss": 1.6006, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6565921306610107, + "rewards/margins": -0.010528111830353737, + "rewards/rejected": -0.6460639834403992, + "step": 111 + }, + { + "epoch": 0.23938017632914774, + "grad_norm": 3.907759189605713, + "learning_rate": 4.710288483761524e-07, + "logits/chosen": -0.805738091468811, + "logits/rejected": -0.8327180743217468, + "logps/chosen": -0.26873674988746643, + "logps/rejected": -0.27950698137283325, + "loss": 1.5569, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6718418598175049, + "rewards/margins": 0.02692551538348198, + "rewards/rejected": -0.6987674236297607, + "step": 112 + }, + { + "epoch": 0.24151749933208655, + "grad_norm": 5.125892162322998, + "learning_rate": 4.7014888296418447e-07, + "logits/chosen": -0.8660019040107727, + "logits/rejected": -0.7626081109046936, + "logps/chosen": -0.27202802896499634, + "logps/rejected": -0.3179852068424225, + "loss": 1.5133, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.680070161819458, + "rewards/margins": 0.11489284038543701, + "rewards/rejected": -0.7949629426002502, + "step": 113 + }, + { + "epoch": 0.2436548223350254, + "grad_norm": 3.331281900405884, + "learning_rate": 4.692566002491916e-07, + "logits/chosen": -0.9860325455665588, + "logits/rejected": -1.0227984189987183, + "logps/chosen": -0.277464359998703, + "logps/rejected": -0.3393504023551941, + "loss": 1.5764, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6936609745025635, + "rewards/margins": 0.15471497178077698, + "rewards/rejected": -0.8483759164810181, + "step": 114 + }, + { + "epoch": 0.2457921453379642, + "grad_norm": 4.2767205238342285, + "learning_rate": 4.683520501542824e-07, + "logits/chosen": -1.1069515943527222, + "logits/rejected": -0.9956479668617249, + "logps/chosen": -0.26621949672698975, + "logps/rejected": -0.2311069816350937, + "loss": 1.6116, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6655487418174744, + "rewards/margins": -0.08778128772974014, + "rewards/rejected": -0.5777674317359924, + "step": 115 + }, + { + "epoch": 0.24792946834090301, + "grad_norm": 4.450298309326172, + "learning_rate": 4.6743528328892384e-07, + "logits/chosen": -1.089507818222046, + "logits/rejected": -1.0226225852966309, + "logps/chosen": -0.31000208854675293, + "logps/rejected": -0.3048384189605713, + "loss": 1.5601, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7750052213668823, + "rewards/margins": -0.012909159064292908, + "rewards/rejected": -0.7620960474014282, + "step": 116 + }, + { + "epoch": 0.25006679134384185, + "grad_norm": 8.30679988861084, + "learning_rate": 4.6650635094610966e-07, + "logits/chosen": -1.019626498222351, + "logits/rejected": -1.0024278163909912, + "logps/chosen": -0.27769044041633606, + "logps/rejected": -0.31382811069488525, + "loss": 1.603, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6942261457443237, + "rewards/margins": 0.09034418314695358, + "rewards/rejected": -0.7845702767372131, + "step": 117 + }, + { + "epoch": 0.25220411434678064, + "grad_norm": 2.7810580730438232, + "learning_rate": 4.655653050994906e-07, + "logits/chosen": -0.8939322829246521, + "logits/rejected": -0.9443778991699219, + "logps/chosen": -0.3001169264316559, + "logps/rejected": -0.27608948945999146, + "loss": 1.6033, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7502923011779785, + "rewards/margins": -0.06006866693496704, + "rewards/rejected": -0.6902236938476562, + "step": 118 + }, + { + "epoch": 0.2543414373497195, + "grad_norm": 7.52852201461792, + "learning_rate": 4.646121984004665e-07, + "logits/chosen": -1.0176833868026733, + "logits/rejected": -0.9106737971305847, + "logps/chosen": -0.2855750620365143, + "logps/rejected": -0.2689260244369507, + "loss": 1.6369, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7139376401901245, + "rewards/margins": -0.041622575372457504, + "rewards/rejected": -0.6723150610923767, + "step": 119 + }, + { + "epoch": 0.2564787603526583, + "grad_norm": 5.235323429107666, + "learning_rate": 4.636470841752404e-07, + "logits/chosen": -0.894492506980896, + "logits/rejected": -0.8580023050308228, + "logps/chosen": -0.2390415519475937, + "logps/rejected": -0.3226756751537323, + "loss": 1.5698, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5976038575172424, + "rewards/margins": 0.20908531546592712, + "rewards/rejected": -0.8066891431808472, + "step": 120 + }, + { + "epoch": 0.2586160833555971, + "grad_norm": 7.31561803817749, + "learning_rate": 4.626700164218349e-07, + "logits/chosen": -1.1262331008911133, + "logits/rejected": -1.1069376468658447, + "logps/chosen": -0.32872867584228516, + "logps/rejected": -0.4012628495693207, + "loss": 1.5619, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8218216896057129, + "rewards/margins": 0.18133553862571716, + "rewards/rejected": -1.003157138824463, + "step": 121 + }, + { + "epoch": 0.26075340635853594, + "grad_norm": 4.992301940917969, + "learning_rate": 4.6168104980707103e-07, + "logits/chosen": -0.947390079498291, + "logits/rejected": -0.9287791848182678, + "logps/chosen": -0.3660104274749756, + "logps/rejected": -0.34243422746658325, + "loss": 1.6722, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.9150261282920837, + "rewards/margins": -0.058940548449754715, + "rewards/rejected": -0.8560855388641357, + "step": 122 + }, + { + "epoch": 0.26289072936147473, + "grad_norm": 9.114599227905273, + "learning_rate": 4.606802396635098e-07, + "logits/chosen": -1.0491164922714233, + "logits/rejected": -1.0415245294570923, + "logps/chosen": -0.2906876802444458, + "logps/rejected": -0.2940623164176941, + "loss": 1.6062, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7267192006111145, + "rewards/margins": 0.008436577394604683, + "rewards/rejected": -0.7351557612419128, + "step": 123 + }, + { + "epoch": 0.26502805236441357, + "grad_norm": 13.20346736907959, + "learning_rate": 4.59667641986356e-07, + "logits/chosen": -0.9480360746383667, + "logits/rejected": -0.9648789167404175, + "logps/chosen": -0.30809280276298523, + "logps/rejected": -0.39009833335876465, + "loss": 1.5673, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.770232081413269, + "rewards/margins": 0.20501384139060974, + "rewards/rejected": -0.9752458930015564, + "step": 124 + }, + { + "epoch": 0.2671653753673524, + "grad_norm": 10.775737762451172, + "learning_rate": 4.5864331343032565e-07, + "logits/chosen": -0.9860743880271912, + "logits/rejected": -0.9669252634048462, + "logps/chosen": -0.4254220724105835, + "logps/rejected": -0.42529717087745667, + "loss": 1.6036, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.063555121421814, + "rewards/margins": -0.0003122463822364807, + "rewards/rejected": -1.0632429122924805, + "step": 125 + }, + { + "epoch": 0.2693026983702912, + "grad_norm": 3.9531850814819336, + "learning_rate": 4.576073113064759e-07, + "logits/chosen": -0.9061692953109741, + "logits/rejected": -1.030226707458496, + "logps/chosen": -0.2965227961540222, + "logps/rejected": -0.3571414351463318, + "loss": 1.5663, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7413069605827332, + "rewards/margins": 0.15154659748077393, + "rewards/rejected": -0.8928536176681519, + "step": 126 + }, + { + "epoch": 0.27144002137323003, + "grad_norm": 10.59150218963623, + "learning_rate": 4.565596935789987e-07, + "logits/chosen": -1.0731703042984009, + "logits/rejected": -1.0575220584869385, + "logps/chosen": -0.3338828682899475, + "logps/rejected": -0.36648380756378174, + "loss": 1.5818, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8347071409225464, + "rewards/margins": 0.08150236308574677, + "rewards/rejected": -0.9162094593048096, + "step": 127 + }, + { + "epoch": 0.2735773443761689, + "grad_norm": 10.513736724853516, + "learning_rate": 4.555005188619775e-07, + "logits/chosen": -0.8747404217720032, + "logits/rejected": -0.8733081817626953, + "logps/chosen": -0.24837706983089447, + "logps/rejected": -0.297157883644104, + "loss": 1.5767, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6209426522254944, + "rewards/margins": 0.12195204943418503, + "rewards/rejected": -0.74289470911026, + "step": 128 + }, + { + "epoch": 0.27571466737910766, + "grad_norm": 5.887775421142578, + "learning_rate": 4.5442984641610784e-07, + "logits/chosen": -1.126139760017395, + "logits/rejected": -1.0465540885925293, + "logps/chosen": -0.2965885102748871, + "logps/rejected": -0.2864833474159241, + "loss": 1.6037, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7414712905883789, + "rewards/margins": -0.025262875482439995, + "rewards/rejected": -0.7162083387374878, + "step": 129 + }, + { + "epoch": 0.2778519903820465, + "grad_norm": 4.381558418273926, + "learning_rate": 4.533477361453819e-07, + "logits/chosen": -1.0439306497573853, + "logits/rejected": -1.1324841976165771, + "logps/chosen": -0.3036992847919464, + "logps/rejected": -0.3638463020324707, + "loss": 1.5841, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7592483162879944, + "rewards/margins": 0.15036745369434357, + "rewards/rejected": -0.9096157550811768, + "step": 130 + }, + { + "epoch": 0.2799893133849853, + "grad_norm": 4.211307048797607, + "learning_rate": 4.5225424859373684e-07, + "logits/chosen": -0.9729929566383362, + "logits/rejected": -0.971265435218811, + "logps/chosen": -0.3391942083835602, + "logps/rejected": -0.35309362411499023, + "loss": 1.595, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8479855060577393, + "rewards/margins": 0.03474842756986618, + "rewards/rejected": -0.8827340602874756, + "step": 131 + }, + { + "epoch": 0.2821266363879241, + "grad_norm": 9.722661018371582, + "learning_rate": 4.511494449416671e-07, + "logits/chosen": -0.8604239225387573, + "logits/rejected": -0.790294885635376, + "logps/chosen": -0.25934475660324097, + "logps/rejected": -0.2542663812637329, + "loss": 1.6546, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6483618021011353, + "rewards/margins": -0.012695923447608948, + "rewards/rejected": -0.6356659531593323, + "step": 132 + }, + { + "epoch": 0.28426395939086296, + "grad_norm": 3.2997727394104004, + "learning_rate": 4.500333870028016e-07, + "logits/chosen": -1.0789867639541626, + "logits/rejected": -1.073919653892517, + "logps/chosen": -0.25591588020324707, + "logps/rejected": -0.2593124806880951, + "loss": 1.5489, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6397897601127625, + "rewards/margins": 0.008491499349474907, + "rewards/rejected": -0.6482812166213989, + "step": 133 + }, + { + "epoch": 0.28640128239380175, + "grad_norm": 3.918221950531006, + "learning_rate": 4.489061372204452e-07, + "logits/chosen": -0.9510654211044312, + "logits/rejected": -0.880722165107727, + "logps/chosen": -0.2889711260795593, + "logps/rejected": -0.32566970586776733, + "loss": 1.5822, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7224277257919312, + "rewards/margins": 0.09174646437168121, + "rewards/rejected": -0.8141741752624512, + "step": 134 + }, + { + "epoch": 0.2885386053967406, + "grad_norm": 9.736861228942871, + "learning_rate": 4.4776775866408533e-07, + "logits/chosen": -1.0732065439224243, + "logits/rejected": -0.9681872725486755, + "logps/chosen": -0.41927701234817505, + "logps/rejected": -0.2924247086048126, + "loss": 1.5615, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.0481925010681152, + "rewards/margins": -0.3171307146549225, + "rewards/rejected": -0.7310618162155151, + "step": 135 + }, + { + "epoch": 0.29067592839967943, + "grad_norm": 3.712836503982544, + "learning_rate": 4.4661831502586244e-07, + "logits/chosen": -0.9898865222930908, + "logits/rejected": -0.958566427230835, + "logps/chosen": -0.3362237811088562, + "logps/rejected": -0.3830156624317169, + "loss": 1.5408, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8405593633651733, + "rewards/margins": 0.11697974801063538, + "rewards/rejected": -0.9575392007827759, + "step": 136 + }, + { + "epoch": 0.2928132514026182, + "grad_norm": 6.5986409187316895, + "learning_rate": 4.4545787061700746e-07, + "logits/chosen": -0.9952265620231628, + "logits/rejected": -0.9618417024612427, + "logps/chosen": -0.33093225955963135, + "logps/rejected": -0.3158915042877197, + "loss": 1.6151, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8273307085037231, + "rewards/margins": -0.037601932883262634, + "rewards/rejected": -0.7897287607192993, + "step": 137 + }, + { + "epoch": 0.29495057440555705, + "grad_norm": 11.264966011047363, + "learning_rate": 4.442864903642427e-07, + "logits/chosen": -0.9705032706260681, + "logits/rejected": -1.010439395904541, + "logps/chosen": -0.29408299922943115, + "logps/rejected": -0.32492977380752563, + "loss": 1.6785, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7352074980735779, + "rewards/margins": 0.07711698114871979, + "rewards/rejected": -0.8123244047164917, + "step": 138 + }, + { + "epoch": 0.29708789740849584, + "grad_norm": 3.294029951095581, + "learning_rate": 4.4310423980614986e-07, + "logits/chosen": -0.9771057963371277, + "logits/rejected": -0.8812280893325806, + "logps/chosen": -0.27679648995399475, + "logps/rejected": -0.301949143409729, + "loss": 1.574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6919912099838257, + "rewards/margins": 0.06288158893585205, + "rewards/rejected": -0.7548727989196777, + "step": 139 + }, + { + "epoch": 0.2992252204114347, + "grad_norm": 6.963750839233398, + "learning_rate": 4.4191118508950277e-07, + "logits/chosen": -0.9832889437675476, + "logits/rejected": -1.041925311088562, + "logps/chosen": -0.3261723518371582, + "logps/rejected": -0.36972764134407043, + "loss": 1.6004, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8154308199882507, + "rewards/margins": 0.10888823121786118, + "rewards/rejected": -0.9243191480636597, + "step": 140 + }, + { + "epoch": 0.3013625434143735, + "grad_norm": 4.65049934387207, + "learning_rate": 4.407073929655666e-07, + "logits/chosen": -0.8786113858222961, + "logits/rejected": -0.8743698000907898, + "logps/chosen": -0.3489750027656555, + "logps/rejected": -0.34578827023506165, + "loss": 1.613, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8724376559257507, + "rewards/margins": -0.007966993376612663, + "rewards/rejected": -0.8644705414772034, + "step": 141 + }, + { + "epoch": 0.3034998664173123, + "grad_norm": 5.086356163024902, + "learning_rate": 4.394929307863632e-07, + "logits/chosen": -1.178961157798767, + "logits/rejected": -1.1221526861190796, + "logps/chosen": -0.30651959776878357, + "logps/rejected": -0.27971014380455017, + "loss": 1.5804, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7662990093231201, + "rewards/margins": -0.0670236200094223, + "rewards/rejected": -0.6992753744125366, + "step": 142 + }, + { + "epoch": 0.30563718942025114, + "grad_norm": 4.039379119873047, + "learning_rate": 4.3826786650090273e-07, + "logits/chosen": -1.1027284860610962, + "logits/rejected": -1.0947867631912231, + "logps/chosen": -0.3164759874343872, + "logps/rejected": -0.4346773028373718, + "loss": 1.5507, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.791189968585968, + "rewards/margins": 0.29550325870513916, + "rewards/rejected": -1.086693286895752, + "step": 143 + }, + { + "epoch": 0.3077745124231899, + "grad_norm": 5.900667190551758, + "learning_rate": 4.370322686513817e-07, + "logits/chosen": -0.8383625149726868, + "logits/rejected": -0.7769290804862976, + "logps/chosen": -0.2520799934864044, + "logps/rejected": -0.23787729442119598, + "loss": 1.5816, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6302000284194946, + "rewards/margins": -0.03550675883889198, + "rewards/rejected": -0.5946931838989258, + "step": 144 + }, + { + "epoch": 0.30991183542612877, + "grad_norm": 5.484445095062256, + "learning_rate": 4.357862063693485e-07, + "logits/chosen": -0.9914720058441162, + "logits/rejected": -1.0965267419815063, + "logps/chosen": -0.2822697162628174, + "logps/rejected": -0.3381388485431671, + "loss": 1.5638, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7056742906570435, + "rewards/margins": 0.13967278599739075, + "rewards/rejected": -0.8453471064567566, + "step": 145 + }, + { + "epoch": 0.3120491584290676, + "grad_norm": 2.739793539047241, + "learning_rate": 4.345297493718352e-07, + "logits/chosen": -0.9342893362045288, + "logits/rejected": -0.8757031559944153, + "logps/chosen": -0.512154221534729, + "logps/rejected": -0.6000754237174988, + "loss": 1.5743, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2803857326507568, + "rewards/margins": 0.2198028415441513, + "rewards/rejected": -1.5001884698867798, + "step": 146 + }, + { + "epoch": 0.3141864814320064, + "grad_norm": 4.219600677490234, + "learning_rate": 4.332629679574565e-07, + "logits/chosen": -0.7380187511444092, + "logits/rejected": -0.8239220380783081, + "logps/chosen": -0.24688729643821716, + "logps/rejected": -0.2952543795108795, + "loss": 1.5583, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6172181963920593, + "rewards/margins": 0.12091781944036484, + "rewards/rejected": -0.7381359934806824, + "step": 147 + }, + { + "epoch": 0.31632380443494523, + "grad_norm": 7.114157676696777, + "learning_rate": 4.319859330024777e-07, + "logits/chosen": -0.950808048248291, + "logits/rejected": -0.873584508895874, + "logps/chosen": -0.28023314476013184, + "logps/rejected": -0.37878137826919556, + "loss": 1.575, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7005828022956848, + "rewards/margins": 0.2463706135749817, + "rewards/rejected": -0.9469534754753113, + "step": 148 + }, + { + "epoch": 0.3184611274378841, + "grad_norm": 5.033133029937744, + "learning_rate": 4.3069871595684787e-07, + "logits/chosen": -0.9993598461151123, + "logits/rejected": -1.1495643854141235, + "logps/chosen": -0.31961789727211, + "logps/rejected": -0.4130839705467224, + "loss": 1.6089, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7990447878837585, + "rewards/margins": 0.2336651086807251, + "rewards/rejected": -1.0327098369598389, + "step": 149 + }, + { + "epoch": 0.32059845044082286, + "grad_norm": 3.7931158542633057, + "learning_rate": 4.294013888402029e-07, + "logits/chosen": -1.0581141710281372, + "logits/rejected": -0.958967924118042, + "logps/chosen": -0.30636316537857056, + "logps/rejected": -0.31132641434669495, + "loss": 1.6122, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7659078240394592, + "rewards/margins": 0.012408185750246048, + "rewards/rejected": -0.7783160209655762, + "step": 150 + }, + { + "epoch": 0.3227357734437617, + "grad_norm": 4.442758560180664, + "learning_rate": 4.280940242378362e-07, + "logits/chosen": -0.9492220878601074, + "logits/rejected": -0.9829614162445068, + "logps/chosen": -0.26527076959609985, + "logps/rejected": -0.5424583554267883, + "loss": 1.5182, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6631768941879272, + "rewards/margins": 0.6929690837860107, + "rewards/rejected": -1.3561458587646484, + "step": 151 + }, + { + "epoch": 0.3248730964467005, + "grad_norm": 4.347165107727051, + "learning_rate": 4.2677669529663686e-07, + "logits/chosen": -0.9675495624542236, + "logits/rejected": -0.9267060160636902, + "logps/chosen": -0.28724977374076843, + "logps/rejected": -0.27893343567848206, + "loss": 1.6064, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7181244492530823, + "rewards/margins": -0.02079082280397415, + "rewards/rejected": -0.697333574295044, + "step": 152 + }, + { + "epoch": 0.3270104194496393, + "grad_norm": 3.866643190383911, + "learning_rate": 4.254494757209979e-07, + "logits/chosen": -1.0312570333480835, + "logits/rejected": -0.8400145173072815, + "logps/chosen": -0.2714364230632782, + "logps/rejected": -0.3370465636253357, + "loss": 1.5993, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6785910725593567, + "rewards/margins": 0.16402524709701538, + "rewards/rejected": -0.8426163196563721, + "step": 153 + }, + { + "epoch": 0.32914774245257816, + "grad_norm": 7.2947211265563965, + "learning_rate": 4.2411243976869173e-07, + "logits/chosen": -1.1030328273773193, + "logits/rejected": -1.107038140296936, + "logps/chosen": -0.31799790263175964, + "logps/rejected": -0.3556910455226898, + "loss": 1.5435, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7949947714805603, + "rewards/margins": 0.09423284232616425, + "rewards/rejected": -0.8892276287078857, + "step": 154 + }, + { + "epoch": 0.33128506545551695, + "grad_norm": 6.094887733459473, + "learning_rate": 4.227656622467162e-07, + "logits/chosen": -0.9807777404785156, + "logits/rejected": -0.9574925303459167, + "logps/chosen": -0.36069509387016296, + "logps/rejected": -0.411272257566452, + "loss": 1.5455, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9017376899719238, + "rewards/margins": 0.12644296884536743, + "rewards/rejected": -1.028180718421936, + "step": 155 + }, + { + "epoch": 0.3334223884584558, + "grad_norm": 7.840887069702148, + "learning_rate": 4.2140921850710855e-07, + "logits/chosen": -1.1150490045547485, + "logits/rejected": -1.1116127967834473, + "logps/chosen": -0.2742304801940918, + "logps/rejected": -0.3083428740501404, + "loss": 1.546, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6855762004852295, + "rewards/margins": 0.08528101444244385, + "rewards/rejected": -0.7708572149276733, + "step": 156 + }, + { + "epoch": 0.3355597114613946, + "grad_norm": 15.734699249267578, + "learning_rate": 4.200431844427298e-07, + "logits/chosen": -0.9994797706604004, + "logits/rejected": -1.077652931213379, + "logps/chosen": -0.3408905565738678, + "logps/rejected": -0.605131208896637, + "loss": 1.5829, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8522265553474426, + "rewards/margins": 0.6606015563011169, + "rewards/rejected": -1.51282799243927, + "step": 157 + }, + { + "epoch": 0.3376970344643334, + "grad_norm": 4.568877696990967, + "learning_rate": 4.186676364830186e-07, + "logits/chosen": -0.8166912794113159, + "logits/rejected": -0.9158197641372681, + "logps/chosen": -0.3100201189517975, + "logps/rejected": -0.4257528781890869, + "loss": 1.5949, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7750502824783325, + "rewards/margins": 0.28933185338974, + "rewards/rejected": -1.0643821954727173, + "step": 158 + }, + { + "epoch": 0.33983435746727225, + "grad_norm": 7.091736793518066, + "learning_rate": 4.172826515897145e-07, + "logits/chosen": -0.9496626853942871, + "logits/rejected": -0.8826749920845032, + "logps/chosen": -0.2823619842529297, + "logps/rejected": -0.25573766231536865, + "loss": 1.6079, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7059049606323242, + "rewards/margins": -0.06656082719564438, + "rewards/rejected": -0.6393441557884216, + "step": 159 + }, + { + "epoch": 0.34197168047021104, + "grad_norm": 9.935672760009766, + "learning_rate": 4.158883072525528e-07, + "logits/chosen": -1.139492392539978, + "logits/rejected": -0.9911923408508301, + "logps/chosen": -0.24080964922904968, + "logps/rejected": -0.23250696063041687, + "loss": 1.5373, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6020240783691406, + "rewards/margins": -0.020756704732775688, + "rewards/rejected": -0.5812674164772034, + "step": 160 + }, + { + "epoch": 0.3441090034731499, + "grad_norm": 15.794203758239746, + "learning_rate": 4.1448468148492814e-07, + "logits/chosen": -1.019397258758545, + "logits/rejected": -0.9881049394607544, + "logps/chosen": -0.3946765065193176, + "logps/rejected": -0.3796921670436859, + "loss": 1.5431, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9866912364959717, + "rewards/margins": -0.03746076300740242, + "rewards/rejected": -0.9492304921150208, + "step": 161 + }, + { + "epoch": 0.3462463264760887, + "grad_norm": 3.839250087738037, + "learning_rate": 4.130718528195303e-07, + "logits/chosen": -0.9311838746070862, + "logits/rejected": -0.8956501483917236, + "logps/chosen": -0.282693088054657, + "logps/rejected": -0.2629316449165344, + "loss": 1.5818, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7067327499389648, + "rewards/margins": -0.04940361529588699, + "rewards/rejected": -0.6573290824890137, + "step": 162 + }, + { + "epoch": 0.3483836494790275, + "grad_norm": 5.865959644317627, + "learning_rate": 4.1164990030394985e-07, + "logits/chosen": -1.0395972728729248, + "logits/rejected": -0.9770699143409729, + "logps/chosen": -0.3128069043159485, + "logps/rejected": -0.31642264127731323, + "loss": 1.5692, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7820172905921936, + "rewards/margins": 0.009039390832185745, + "rewards/rejected": -0.791056752204895, + "step": 163 + }, + { + "epoch": 0.35052097248196634, + "grad_norm": 4.154603481292725, + "learning_rate": 4.10218903496256e-07, + "logits/chosen": -1.0948988199234009, + "logits/rejected": -0.9907031059265137, + "logps/chosen": -0.30839213728904724, + "logps/rejected": -0.29299482703208923, + "loss": 1.5829, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7709803581237793, + "rewards/margins": -0.03849326819181442, + "rewards/rejected": -0.7324870824813843, + "step": 164 + }, + { + "epoch": 0.3526582954849052, + "grad_norm": 5.253880500793457, + "learning_rate": 4.087789424605447e-07, + "logits/chosen": -1.0539865493774414, + "logits/rejected": -0.9663246870040894, + "logps/chosen": -0.26886874437332153, + "logps/rejected": -0.43172940611839294, + "loss": 1.5157, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.672171950340271, + "rewards/margins": 0.40715163946151733, + "rewards/rejected": -1.0793235301971436, + "step": 165 + }, + { + "epoch": 0.35479561848784397, + "grad_norm": 11.49240493774414, + "learning_rate": 4.0733009776245937e-07, + "logits/chosen": -0.9969057440757751, + "logits/rejected": -1.0402690172195435, + "logps/chosen": -0.3554040193557739, + "logps/rejected": -0.396072655916214, + "loss": 1.584, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8885101079940796, + "rewards/margins": 0.10167157649993896, + "rewards/rejected": -0.9901816248893738, + "step": 166 + }, + { + "epoch": 0.3569329414907828, + "grad_norm": 5.116168975830078, + "learning_rate": 4.058724504646834e-07, + "logits/chosen": -0.9382141828536987, + "logits/rejected": -0.8863942623138428, + "logps/chosen": -0.261793315410614, + "logps/rejected": -0.45277461409568787, + "loss": 1.539, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6544832587242126, + "rewards/margins": 0.47745317220687866, + "rewards/rejected": -1.1319365501403809, + "step": 167 + }, + { + "epoch": 0.3590702644937216, + "grad_norm": 10.550248146057129, + "learning_rate": 4.0440608212240445e-07, + "logits/chosen": -1.0490831136703491, + "logits/rejected": -1.1039912700653076, + "logps/chosen": -0.3632212281227112, + "logps/rejected": -0.3736804723739624, + "loss": 1.5845, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9080529808998108, + "rewards/margins": 0.026148155331611633, + "rewards/rejected": -0.934201180934906, + "step": 168 + }, + { + "epoch": 0.36120758749666043, + "grad_norm": 4.120011806488037, + "learning_rate": 4.0293107477875156e-07, + "logits/chosen": -0.914804220199585, + "logits/rejected": -0.9306747317314148, + "logps/chosen": -0.3597089350223541, + "logps/rejected": -0.39882034063339233, + "loss": 1.5235, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8992725014686584, + "rewards/margins": 0.09777843952178955, + "rewards/rejected": -0.9970508813858032, + "step": 169 + }, + { + "epoch": 0.36334491049959927, + "grad_norm": 3.9872193336486816, + "learning_rate": 4.0144751096020497e-07, + "logits/chosen": -1.0519163608551025, + "logits/rejected": -0.9880449175834656, + "logps/chosen": -0.27723756432533264, + "logps/rejected": -0.3831270933151245, + "loss": 1.5744, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6930938959121704, + "rewards/margins": 0.26472383737564087, + "rewards/rejected": -0.9578177332878113, + "step": 170 + }, + { + "epoch": 0.36548223350253806, + "grad_norm": 5.331676006317139, + "learning_rate": 3.999554736719785e-07, + "logits/chosen": -1.1113324165344238, + "logits/rejected": -1.1892024278640747, + "logps/chosen": -0.3108530640602112, + "logps/rejected": -0.5784565806388855, + "loss": 1.4846, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7771324515342712, + "rewards/margins": 0.6690089702606201, + "rewards/rejected": -1.4461414813995361, + "step": 171 + }, + { + "epoch": 0.3676195565054769, + "grad_norm": 6.586511611938477, + "learning_rate": 3.9845504639337535e-07, + "logits/chosen": -1.2047513723373413, + "logits/rejected": -1.1406968832015991, + "logps/chosen": -0.3595273196697235, + "logps/rejected": -0.32145068049430847, + "loss": 1.5328, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8988182544708252, + "rewards/margins": -0.0951915979385376, + "rewards/rejected": -0.8036267757415771, + "step": 172 + }, + { + "epoch": 0.36975687950841574, + "grad_norm": 6.111835479736328, + "learning_rate": 3.9694631307311825e-07, + "logits/chosen": -0.8004586696624756, + "logits/rejected": -0.7772153615951538, + "logps/chosen": -0.4090813100337982, + "logps/rejected": -0.4898335635662079, + "loss": 1.559, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0227031707763672, + "rewards/margins": 0.20188063383102417, + "rewards/rejected": -1.2245839834213257, + "step": 173 + }, + { + "epoch": 0.3718942025113545, + "grad_norm": 7.0863189697265625, + "learning_rate": 3.954293581246514e-07, + "logits/chosen": -0.9679336547851562, + "logits/rejected": -0.9125540256500244, + "logps/chosen": -0.29369306564331055, + "logps/rejected": -0.31403255462646484, + "loss": 1.5375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7342327237129211, + "rewards/margins": 0.05084871128201485, + "rewards/rejected": -0.7850814461708069, + "step": 174 + }, + { + "epoch": 0.37403152551429336, + "grad_norm": 7.140958309173584, + "learning_rate": 3.939042664214184e-07, + "logits/chosen": -0.949452817440033, + "logits/rejected": -1.0473122596740723, + "logps/chosen": -0.2707624137401581, + "logps/rejected": -0.32049351930618286, + "loss": 1.5626, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6769061088562012, + "rewards/margins": 0.12432771921157837, + "rewards/rejected": -0.8012337684631348, + "step": 175 + }, + { + "epoch": 0.37616884851723215, + "grad_norm": 7.0456695556640625, + "learning_rate": 3.92371123292113e-07, + "logits/chosen": -1.0727981328964233, + "logits/rejected": -1.1329890489578247, + "logps/chosen": -0.29705438017845154, + "logps/rejected": -0.3278125524520874, + "loss": 1.6107, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7426358461380005, + "rewards/margins": 0.07689555734395981, + "rewards/rejected": -0.8195314407348633, + "step": 176 + }, + { + "epoch": 0.378306171520171, + "grad_norm": 5.836486339569092, + "learning_rate": 3.908300145159055e-07, + "logits/chosen": -0.9942230582237244, + "logits/rejected": -1.0356171131134033, + "logps/chosen": -0.31931719183921814, + "logps/rejected": -0.33853164315223694, + "loss": 1.5837, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7982929348945618, + "rewards/margins": 0.0480361208319664, + "rewards/rejected": -0.8463290929794312, + "step": 177 + }, + { + "epoch": 0.3804434945231098, + "grad_norm": 8.505417823791504, + "learning_rate": 3.8928102631764304e-07, + "logits/chosen": -1.0212180614471436, + "logits/rejected": -1.087773323059082, + "logps/chosen": -0.3532945513725281, + "logps/rejected": -0.5901373028755188, + "loss": 1.5557, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.883236289024353, + "rewards/margins": 0.5921069979667664, + "rewards/rejected": -1.4753433465957642, + "step": 178 + }, + { + "epoch": 0.3825808175260486, + "grad_norm": 6.116640090942383, + "learning_rate": 3.877242453630256e-07, + "logits/chosen": -1.2131381034851074, + "logits/rejected": -1.0686910152435303, + "logps/chosen": -0.3515666127204895, + "logps/rejected": -0.3958896994590759, + "loss": 1.5671, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8789165019989014, + "rewards/margins": 0.11080773174762726, + "rewards/rejected": -0.9897242784500122, + "step": 179 + }, + { + "epoch": 0.38471814052898745, + "grad_norm": 6.355064868927002, + "learning_rate": 3.8615975875375676e-07, + "logits/chosen": -0.9339985847473145, + "logits/rejected": -0.9060691595077515, + "logps/chosen": -0.32276052236557007, + "logps/rejected": -0.37401843070983887, + "loss": 1.548, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8069013953208923, + "rewards/margins": 0.1281447559595108, + "rewards/rejected": -0.9350461959838867, + "step": 180 + }, + { + "epoch": 0.38685546353192624, + "grad_norm": 6.534996509552002, + "learning_rate": 3.8458765402267056e-07, + "logits/chosen": -0.8938146233558655, + "logits/rejected": -0.9069436192512512, + "logps/chosen": -0.336931049823761, + "logps/rejected": -0.4913772940635681, + "loss": 1.5787, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8423275947570801, + "rewards/margins": 0.3861156404018402, + "rewards/rejected": -1.2284431457519531, + "step": 181 + }, + { + "epoch": 0.3889927865348651, + "grad_norm": 10.956029891967773, + "learning_rate": 3.8300801912883414e-07, + "logits/chosen": -1.0703511238098145, + "logits/rejected": -0.9989842176437378, + "logps/chosen": -0.26583123207092285, + "logps/rejected": -0.2977861762046814, + "loss": 1.5609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6645781397819519, + "rewards/margins": 0.07988730072975159, + "rewards/rejected": -0.7444654107093811, + "step": 182 + }, + { + "epoch": 0.3911301095378039, + "grad_norm": 10.217528343200684, + "learning_rate": 3.8142094245262615e-07, + "logits/chosen": -1.145703673362732, + "logits/rejected": -1.0282764434814453, + "logps/chosen": -0.3538467586040497, + "logps/rejected": -0.3405742645263672, + "loss": 1.5855, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8846168518066406, + "rewards/margins": -0.03318122401833534, + "rewards/rejected": -0.8514357209205627, + "step": 183 + }, + { + "epoch": 0.3932674325407427, + "grad_norm": 4.681653022766113, + "learning_rate": 3.7982651279079227e-07, + "logits/chosen": -1.2552436590194702, + "logits/rejected": -1.259030818939209, + "logps/chosen": -0.2886826992034912, + "logps/rejected": -0.4662485718727112, + "loss": 1.5609, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7217066884040833, + "rewards/margins": 0.44391465187072754, + "rewards/rejected": -1.1656213998794556, + "step": 184 + }, + { + "epoch": 0.39540475554368154, + "grad_norm": 4.339652061462402, + "learning_rate": 3.7822481935147655e-07, + "logits/chosen": -1.0260683298110962, + "logits/rejected": -1.015075922012329, + "logps/chosen": -0.36714547872543335, + "logps/rejected": -0.5204967260360718, + "loss": 1.5682, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9178636074066162, + "rewards/margins": 0.38337817788124084, + "rewards/rejected": -1.3012418746948242, + "step": 185 + }, + { + "epoch": 0.3975420785466204, + "grad_norm": 5.974206924438477, + "learning_rate": 3.766159517492307e-07, + "logits/chosen": -1.0455535650253296, + "logits/rejected": -1.1319448947906494, + "logps/chosen": -0.41289687156677246, + "logps/rejected": -0.613991379737854, + "loss": 1.5825, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0322421789169312, + "rewards/margins": 0.5027362108230591, + "rewards/rejected": -1.5349783897399902, + "step": 186 + }, + { + "epoch": 0.39967940154955917, + "grad_norm": 8.767956733703613, + "learning_rate": 3.75e-07, + "logits/chosen": -1.0032697916030884, + "logits/rejected": -0.9564570784568787, + "logps/chosen": -0.31954333186149597, + "logps/rejected": -0.4057242274284363, + "loss": 1.6033, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7988582849502563, + "rewards/margins": 0.21545226871967316, + "rewards/rejected": -1.014310598373413, + "step": 187 + }, + { + "epoch": 0.401816724552498, + "grad_norm": 4.35204553604126, + "learning_rate": 3.7337705451608667e-07, + "logits/chosen": -1.1166412830352783, + "logits/rejected": -1.0849709510803223, + "logps/chosen": -0.3008464574813843, + "logps/rejected": -0.2960435450077057, + "loss": 1.5105, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7521160840988159, + "rewards/margins": -0.012007185257971287, + "rewards/rejected": -0.7401089072227478, + "step": 188 + }, + { + "epoch": 0.4039540475554368, + "grad_norm": 3.929826021194458, + "learning_rate": 3.717472061010918e-07, + "logits/chosen": -1.1040568351745605, + "logits/rejected": -1.062517523765564, + "logps/chosen": -0.3373297154903412, + "logps/rejected": -0.5283687710762024, + "loss": 1.5152, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8433243632316589, + "rewards/margins": 0.47759759426116943, + "rewards/rejected": -1.3209218978881836, + "step": 189 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 4.574549198150635, + "learning_rate": 3.7011054594483443e-07, + "logits/chosen": -1.1240224838256836, + "logits/rejected": -1.0487711429595947, + "logps/chosen": -0.3029400706291199, + "logps/rejected": -0.41601306200027466, + "loss": 1.4632, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7573502063751221, + "rewards/margins": 0.28268247842788696, + "rewards/rejected": -1.0400326251983643, + "step": 190 + }, + { + "epoch": 0.40822869356131447, + "grad_norm": 4.018647193908691, + "learning_rate": 3.6846716561824967e-07, + "logits/chosen": -0.80363529920578, + "logits/rejected": -0.9596213102340698, + "logps/chosen": -0.3076530694961548, + "logps/rejected": -0.5633202195167542, + "loss": 1.5163, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7691327333450317, + "rewards/margins": 0.6391679048538208, + "rewards/rejected": -1.408300518989563, + "step": 191 + }, + { + "epoch": 0.41036601656425326, + "grad_norm": 7.332089424133301, + "learning_rate": 3.668171570682655e-07, + "logits/chosen": -0.9585205316543579, + "logits/rejected": -0.9636404514312744, + "logps/chosen": -0.33684462308883667, + "logps/rejected": -0.3766506016254425, + "loss": 1.5671, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8421115279197693, + "rewards/margins": 0.09951499849557877, + "rewards/rejected": -0.9416265487670898, + "step": 192 + }, + { + "epoch": 0.4125033395671921, + "grad_norm": 8.853985786437988, + "learning_rate": 3.6516061261265805e-07, + "logits/chosen": -1.027462363243103, + "logits/rejected": -0.9135668873786926, + "logps/chosen": -0.3370886445045471, + "logps/rejected": -0.3812939524650574, + "loss": 1.5598, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8427215814590454, + "rewards/margins": 0.11051319539546967, + "rewards/rejected": -0.953234851360321, + "step": 193 + }, + { + "epoch": 0.41464066257013094, + "grad_norm": 5.765879154205322, + "learning_rate": 3.634976249348867e-07, + "logits/chosen": -1.1132540702819824, + "logits/rejected": -1.003641963005066, + "logps/chosen": -0.3518536686897278, + "logps/rejected": -0.5063703656196594, + "loss": 1.5071, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8796342015266418, + "rewards/margins": 0.38629183173179626, + "rewards/rejected": -1.2659261226654053, + "step": 194 + }, + { + "epoch": 0.4167779855730697, + "grad_norm": 17.148714065551758, + "learning_rate": 3.618282870789081e-07, + "logits/chosen": -1.041336178779602, + "logits/rejected": -1.0308490991592407, + "logps/chosen": -0.4422120749950409, + "logps/rejected": -0.4290231466293335, + "loss": 1.6783, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1055301427841187, + "rewards/margins": -0.032972272485494614, + "rewards/rejected": -1.072557806968689, + "step": 195 + }, + { + "epoch": 0.41891530857600856, + "grad_norm": 4.9743332862854, + "learning_rate": 3.601526924439709e-07, + "logits/chosen": -0.9943188428878784, + "logits/rejected": -1.029951810836792, + "logps/chosen": -0.2909929156303406, + "logps/rejected": -0.3154396116733551, + "loss": 1.5771, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7274821996688843, + "rewards/margins": 0.061116717755794525, + "rewards/rejected": -0.7885990142822266, + "step": 196 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 6.192495346069336, + "learning_rate": 3.584709347793895e-07, + "logits/chosen": -0.8082910776138306, + "logits/rejected": -0.8116950988769531, + "logps/chosen": -0.2856646478176117, + "logps/rejected": -0.30446913838386536, + "loss": 1.5157, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7141616344451904, + "rewards/margins": 0.04701121151447296, + "rewards/rejected": -0.7611728310585022, + "step": 197 + }, + { + "epoch": 0.4231899545818862, + "grad_norm": 4.891373157501221, + "learning_rate": 3.567831081792992e-07, + "logits/chosen": -1.0285996198654175, + "logits/rejected": -1.034073829650879, + "logps/chosen": -0.3283870220184326, + "logps/rejected": -0.5464656949043274, + "loss": 1.4871, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8209674954414368, + "rewards/margins": 0.5451965928077698, + "rewards/rejected": -1.366164207458496, + "step": 198 + }, + { + "epoch": 0.425327277584825, + "grad_norm": 13.869108200073242, + "learning_rate": 3.550893070773914e-07, + "logits/chosen": -1.0854626893997192, + "logits/rejected": -1.0260361433029175, + "logps/chosen": -0.39059579372406006, + "logps/rejected": -0.4412023425102234, + "loss": 1.6672, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9764894247055054, + "rewards/margins": 0.12651631236076355, + "rewards/rejected": -1.1030058860778809, + "step": 199 + }, + { + "epoch": 0.4274646005877638, + "grad_norm": 29.342126846313477, + "learning_rate": 3.5338962624163016e-07, + "logits/chosen": -1.1286933422088623, + "logits/rejected": -1.1019514799118042, + "logps/chosen": -0.29572370648384094, + "logps/rejected": -0.3438429832458496, + "loss": 1.6118, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7393092513084412, + "rewards/margins": 0.12029813230037689, + "rewards/rejected": -0.8596073985099792, + "step": 200 + }, + { + "epoch": 0.42960192359070265, + "grad_norm": 4.435629367828369, + "learning_rate": 3.516841607689501e-07, + "logits/chosen": -1.1759017705917358, + "logits/rejected": -1.0626184940338135, + "logps/chosen": -0.3442676067352295, + "logps/rejected": -0.3576590120792389, + "loss": 1.5321, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8606690168380737, + "rewards/margins": 0.03347862884402275, + "rewards/rejected": -0.8941476345062256, + "step": 201 + }, + { + "epoch": 0.4317392465936415, + "grad_norm": 5.45989990234375, + "learning_rate": 3.499730060799352e-07, + "logits/chosen": -1.1944599151611328, + "logits/rejected": -1.1447770595550537, + "logps/chosen": -0.300496369600296, + "logps/rejected": -0.3771470785140991, + "loss": 1.4774, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7512409687042236, + "rewards/margins": 0.19162678718566895, + "rewards/rejected": -0.9428676962852478, + "step": 202 + }, + { + "epoch": 0.4338765695965803, + "grad_norm": 4.396944046020508, + "learning_rate": 3.482562579134809e-07, + "logits/chosen": -0.9371283054351807, + "logits/rejected": -0.9887581467628479, + "logps/chosen": -0.34337079524993896, + "logps/rejected": -0.31941717863082886, + "loss": 1.5624, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8584270477294922, + "rewards/margins": -0.05988417938351631, + "rewards/rejected": -0.798542857170105, + "step": 203 + }, + { + "epoch": 0.4360138925995191, + "grad_norm": 5.779623508453369, + "learning_rate": 3.465340123214365e-07, + "logits/chosen": -0.9840802550315857, + "logits/rejected": -0.9649553298950195, + "logps/chosen": -0.5713462829589844, + "logps/rejected": -0.7279367446899414, + "loss": 1.5474, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4283654689788818, + "rewards/margins": 0.39147651195526123, + "rewards/rejected": -1.8198421001434326, + "step": 204 + }, + { + "epoch": 0.4381512156024579, + "grad_norm": 10.535792350769043, + "learning_rate": 3.448063656632321e-07, + "logits/chosen": -1.1214243173599243, + "logits/rejected": -1.0236384868621826, + "logps/chosen": -0.327178418636322, + "logps/rejected": -0.3443678021430969, + "loss": 1.5847, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8179460167884827, + "rewards/margins": 0.042973555624485016, + "rewards/rejected": -0.8609195351600647, + "step": 205 + }, + { + "epoch": 0.44028853860539674, + "grad_norm": 5.442493915557861, + "learning_rate": 3.430734146004863e-07, + "logits/chosen": -1.1191673278808594, + "logits/rejected": -0.9904736876487732, + "logps/chosen": -0.2607005536556244, + "logps/rejected": -0.2681718170642853, + "loss": 1.542, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6517513990402222, + "rewards/margins": 0.01867814175784588, + "rewards/rejected": -0.670429527759552, + "step": 206 + }, + { + "epoch": 0.4424258616083356, + "grad_norm": 6.850170612335205, + "learning_rate": 3.413352560915988e-07, + "logits/chosen": -1.0275464057922363, + "logits/rejected": -1.0052015781402588, + "logps/chosen": -0.3867985010147095, + "logps/rejected": -0.4938412010669708, + "loss": 1.6312, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9669963121414185, + "rewards/margins": 0.267606645822525, + "rewards/rejected": -1.234602928161621, + "step": 207 + }, + { + "epoch": 0.44456318461127436, + "grad_norm": 9.965657234191895, + "learning_rate": 3.39591987386325e-07, + "logits/chosen": -0.9659216403961182, + "logits/rejected": -0.9130998253822327, + "logps/chosen": -0.33372846245765686, + "logps/rejected": -0.3092671036720276, + "loss": 1.5355, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.834321141242981, + "rewards/margins": -0.061153292655944824, + "rewards/rejected": -0.7731677889823914, + "step": 208 + }, + { + "epoch": 0.4467005076142132, + "grad_norm": 5.595789909362793, + "learning_rate": 3.378437060203357e-07, + "logits/chosen": -1.2547951936721802, + "logits/rejected": -1.1610562801361084, + "logps/chosen": -0.34088316559791565, + "logps/rejected": -0.34324803948402405, + "loss": 1.6059, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8522078990936279, + "rewards/margins": 0.00591224804520607, + "rewards/rejected": -0.8581202030181885, + "step": 209 + }, + { + "epoch": 0.448837830617152, + "grad_norm": 17.92057991027832, + "learning_rate": 3.360905098097587e-07, + "logits/chosen": -1.0579925775527954, + "logits/rejected": -0.9834758043289185, + "logps/chosen": -0.38748034834861755, + "logps/rejected": -0.6860374808311462, + "loss": 1.5363, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9687008857727051, + "rewards/margins": 0.7463930249214172, + "rewards/rejected": -1.715093970298767, + "step": 210 + }, + { + "epoch": 0.45097515362009083, + "grad_norm": 6.570519924163818, + "learning_rate": 3.343324968457075e-07, + "logits/chosen": -1.0359179973602295, + "logits/rejected": -0.9564209580421448, + "logps/chosen": -0.38825637102127075, + "logps/rejected": -0.3802332878112793, + "loss": 1.5384, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.970641016960144, + "rewards/margins": -0.02005772665143013, + "rewards/rejected": -0.950583279132843, + "step": 211 + }, + { + "epoch": 0.45311247662302967, + "grad_norm": 5.518048286437988, + "learning_rate": 3.325697654887918e-07, + "logits/chosen": -0.998512327671051, + "logits/rejected": -0.9381792545318604, + "logps/chosen": -0.3794736862182617, + "logps/rejected": -0.6236636638641357, + "loss": 1.5091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9486840963363647, + "rewards/margins": 0.6104748249053955, + "rewards/rejected": -1.5591590404510498, + "step": 212 + }, + { + "epoch": 0.45524979962596845, + "grad_norm": 12.084184646606445, + "learning_rate": 3.30802414363615e-07, + "logits/chosen": -0.9403542280197144, + "logits/rejected": -0.6737431287765503, + "logps/chosen": -0.4244030714035034, + "logps/rejected": -0.43834903836250305, + "loss": 1.4581, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0610076189041138, + "rewards/margins": 0.03486503288149834, + "rewards/rejected": -1.0958726406097412, + "step": 213 + }, + { + "epoch": 0.4573871226289073, + "grad_norm": 3.5294582843780518, + "learning_rate": 3.2903054235325613e-07, + "logits/chosen": -1.1825759410858154, + "logits/rejected": -1.210655927658081, + "logps/chosen": -0.3315200209617615, + "logps/rejected": -0.46745753288269043, + "loss": 1.5312, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8287999629974365, + "rewards/margins": 0.33984383940696716, + "rewards/rejected": -1.168643832206726, + "step": 214 + }, + { + "epoch": 0.45952444563184613, + "grad_norm": 6.134922027587891, + "learning_rate": 3.272542485937368e-07, + "logits/chosen": -1.1072171926498413, + "logits/rejected": -1.208855152130127, + "logps/chosen": -0.4051734209060669, + "logps/rejected": -0.6289750337600708, + "loss": 1.5473, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.012933611869812, + "rewards/margins": 0.5595039129257202, + "rewards/rejected": -1.5724375247955322, + "step": 215 + }, + { + "epoch": 0.4616617686347849, + "grad_norm": 6.10336446762085, + "learning_rate": 3.2547363246847546e-07, + "logits/chosen": -1.0125056505203247, + "logits/rejected": -1.0291041135787964, + "logps/chosen": -0.3960397243499756, + "logps/rejected": -0.6897832751274109, + "loss": 1.5091, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.990099310874939, + "rewards/margins": 0.7343588471412659, + "rewards/rejected": -1.72445809841156, + "step": 216 + }, + { + "epoch": 0.46379909163772376, + "grad_norm": 9.434686660766602, + "learning_rate": 3.2368879360272606e-07, + "logits/chosen": -1.0608569383621216, + "logits/rejected": -1.0038235187530518, + "logps/chosen": -0.4567071199417114, + "logps/rejected": -0.42994168400764465, + "loss": 1.616, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1417678594589233, + "rewards/margins": -0.06691357493400574, + "rewards/rejected": -1.0748542547225952, + "step": 217 + }, + { + "epoch": 0.46593641464066254, + "grad_norm": 5.21524715423584, + "learning_rate": 3.218998318580043e-07, + "logits/chosen": -1.1354548931121826, + "logits/rejected": -1.0435974597930908, + "logps/chosen": -0.2741296589374542, + "logps/rejected": -0.37144631147384644, + "loss": 1.6029, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6853241920471191, + "rewards/margins": 0.24329157173633575, + "rewards/rejected": -0.9286156892776489, + "step": 218 + }, + { + "epoch": 0.4680737376436014, + "grad_norm": 4.108745574951172, + "learning_rate": 3.201068473265007e-07, + "logits/chosen": -0.8878648281097412, + "logits/rejected": -0.8645142316818237, + "logps/chosen": -0.32466036081314087, + "logps/rejected": -0.28847843408584595, + "loss": 1.6023, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.811650812625885, + "rewards/margins": -0.09045480191707611, + "rewards/rejected": -0.7211960554122925, + "step": 219 + }, + { + "epoch": 0.4702110606465402, + "grad_norm": 17.760408401489258, + "learning_rate": 3.1830994032548e-07, + "logits/chosen": -1.197770595550537, + "logits/rejected": -1.0971354246139526, + "logps/chosen": -0.44655174016952515, + "logps/rejected": -0.5050027370452881, + "loss": 1.6185, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1163791418075562, + "rewards/margins": 0.14612753689289093, + "rewards/rejected": -1.2625068426132202, + "step": 220 + }, + { + "epoch": 0.472348383649479, + "grad_norm": 24.489158630371094, + "learning_rate": 3.1650921139166874e-07, + "logits/chosen": -0.9091489315032959, + "logits/rejected": -0.9671614766120911, + "logps/chosen": -0.2689306437969208, + "logps/rejected": -0.2791651487350464, + "loss": 1.6576, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6723266243934631, + "rewards/margins": 0.02558620274066925, + "rewards/rejected": -0.6979128122329712, + "step": 221 + }, + { + "epoch": 0.47448570665241785, + "grad_norm": 4.240891933441162, + "learning_rate": 3.147047612756302e-07, + "logits/chosen": -1.1410434246063232, + "logits/rejected": -0.9494026303291321, + "logps/chosen": -0.3623463809490204, + "logps/rejected": -0.3546559810638428, + "loss": 1.5634, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9058659672737122, + "rewards/margins": -0.01922605186700821, + "rewards/rejected": -0.8866399526596069, + "step": 222 + }, + { + "epoch": 0.4766230296553567, + "grad_norm": 11.909400939941406, + "learning_rate": 3.128966909361271e-07, + "logits/chosen": -1.0778872966766357, + "logits/rejected": -0.9947598576545715, + "logps/chosen": -0.2876349687576294, + "logps/rejected": -0.3500506281852722, + "loss": 1.5763, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7190873622894287, + "rewards/margins": 0.15603923797607422, + "rewards/rejected": -0.8751266002655029, + "step": 223 + }, + { + "epoch": 0.4787603526582955, + "grad_norm": 3.9968485832214355, + "learning_rate": 3.110851015344735e-07, + "logits/chosen": -1.043594241142273, + "logits/rejected": -1.0751991271972656, + "logps/chosen": -0.3403151333332062, + "logps/rejected": -0.45080384612083435, + "loss": 1.4964, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.850787878036499, + "rewards/margins": 0.27622172236442566, + "rewards/rejected": -1.127009630203247, + "step": 224 + }, + { + "epoch": 0.4808976756612343, + "grad_norm": 4.30190372467041, + "learning_rate": 3.0927009442887437e-07, + "logits/chosen": -0.9305320978164673, + "logits/rejected": -1.0111606121063232, + "logps/chosen": -0.32919758558273315, + "logps/rejected": -0.34503474831581116, + "loss": 1.5875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8229939341545105, + "rewards/margins": 0.0395929217338562, + "rewards/rejected": -0.8625868558883667, + "step": 225 + }, + { + "epoch": 0.4830349986641731, + "grad_norm": 5.68215799331665, + "learning_rate": 3.074517711687549e-07, + "logits/chosen": -0.9502861499786377, + "logits/rejected": -0.9219777584075928, + "logps/chosen": -0.40744659304618835, + "logps/rejected": -0.4551170766353607, + "loss": 1.5318, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0186164379119873, + "rewards/margins": 0.11917618662118912, + "rewards/rejected": -1.137792706489563, + "step": 226 + }, + { + "epoch": 0.48517232166711194, + "grad_norm": 5.924420356750488, + "learning_rate": 3.056302334890786e-07, + "logits/chosen": -1.0599088668823242, + "logits/rejected": -0.9398927688598633, + "logps/chosen": -0.2768517732620239, + "logps/rejected": -0.3650413155555725, + "loss": 1.5365, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.692129373550415, + "rewards/margins": 0.22047390043735504, + "rewards/rejected": -0.9126032590866089, + "step": 227 + }, + { + "epoch": 0.4873096446700508, + "grad_norm": 9.18790340423584, + "learning_rate": 3.038055833046555e-07, + "logits/chosen": -1.23221755027771, + "logits/rejected": -1.1094015836715698, + "logps/chosen": -0.3468588590621948, + "logps/rejected": -0.533679723739624, + "loss": 1.5544, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8671470880508423, + "rewards/margins": 0.4670522212982178, + "rewards/rejected": -1.3341991901397705, + "step": 228 + }, + { + "epoch": 0.48944696767298956, + "grad_norm": 3.236159563064575, + "learning_rate": 3.0197792270443976e-07, + "logits/chosen": -1.101015567779541, + "logits/rejected": -0.980370044708252, + "logps/chosen": -0.5276182293891907, + "logps/rejected": -0.2907797694206238, + "loss": 1.6115, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3190455436706543, + "rewards/margins": -0.592096209526062, + "rewards/rejected": -0.7269493341445923, + "step": 229 + }, + { + "epoch": 0.4915842906759284, + "grad_norm": 6.479002475738525, + "learning_rate": 3.001473539458182e-07, + "logits/chosen": -1.1139984130859375, + "logits/rejected": -1.0145281553268433, + "logps/chosen": -0.40499821305274963, + "logps/rejected": -0.5032440423965454, + "loss": 1.5857, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0124956369400024, + "rewards/margins": 0.24561452865600586, + "rewards/rejected": -1.2581101655960083, + "step": 230 + }, + { + "epoch": 0.49372161367886724, + "grad_norm": 11.510492324829102, + "learning_rate": 2.983139794488883e-07, + "logits/chosen": -1.1703720092773438, + "logits/rejected": -1.0775160789489746, + "logps/chosen": -0.4314059615135193, + "logps/rejected": -0.39874231815338135, + "loss": 1.6011, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.078514814376831, + "rewards/margins": -0.08165915310382843, + "rewards/rejected": -0.9968557953834534, + "step": 231 + }, + { + "epoch": 0.49585893668180603, + "grad_norm": 5.559615612030029, + "learning_rate": 2.964779017907287e-07, + "logits/chosen": -1.0301462411880493, + "logits/rejected": -1.0727837085723877, + "logps/chosen": -0.40483570098876953, + "logps/rejected": -0.45073747634887695, + "loss": 1.5311, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0120892524719238, + "rewards/margins": 0.114754319190979, + "rewards/rejected": -1.1268435716629028, + "step": 232 + }, + { + "epoch": 0.49799625968474487, + "grad_norm": 4.339590549468994, + "learning_rate": 2.9463922369965915e-07, + "logits/chosen": -0.9359559416770935, + "logits/rejected": -0.9321252703666687, + "logps/chosen": -0.35180893540382385, + "logps/rejected": -0.536721408367157, + "loss": 1.5723, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8795222640037537, + "rewards/margins": 0.4622812271118164, + "rewards/rejected": -1.3418035507202148, + "step": 233 + }, + { + "epoch": 0.5001335826876837, + "grad_norm": 4.583502292633057, + "learning_rate": 2.927980480494938e-07, + "logits/chosen": -1.0992170572280884, + "logits/rejected": -1.0070428848266602, + "logps/chosen": -0.36073118448257446, + "logps/rejected": -0.3927144706249237, + "loss": 1.5751, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9018279314041138, + "rewards/margins": 0.07995828986167908, + "rewards/rejected": -0.9817862510681152, + "step": 234 + }, + { + "epoch": 0.5022709056906225, + "grad_norm": 4.529286861419678, + "learning_rate": 2.909544778537844e-07, + "logits/chosen": -1.1656326055526733, + "logits/rejected": -1.0929317474365234, + "logps/chosen": -0.3711916506290436, + "logps/rejected": -0.3863615393638611, + "loss": 1.5221, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9279791116714478, + "rewards/margins": 0.03792468085885048, + "rewards/rejected": -0.9659038186073303, + "step": 235 + }, + { + "epoch": 0.5044082286935613, + "grad_norm": 10.745676040649414, + "learning_rate": 2.8910861626005773e-07, + "logits/chosen": -1.058958649635315, + "logits/rejected": -0.9348481297492981, + "logps/chosen": -0.31637054681777954, + "logps/rejected": -0.33828607201576233, + "loss": 1.4931, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7909263968467712, + "rewards/margins": 0.05478885397315025, + "rewards/rejected": -0.8457151055335999, + "step": 236 + }, + { + "epoch": 0.5065455516965002, + "grad_norm": 4.350003242492676, + "learning_rate": 2.872605665440436e-07, + "logits/chosen": -1.155067801475525, + "logits/rejected": -1.044098138809204, + "logps/chosen": -0.4006834626197815, + "logps/rejected": -0.3987181484699249, + "loss": 1.5417, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.001708745956421, + "rewards/margins": -0.004913315176963806, + "rewards/rejected": -0.9967952966690063, + "step": 237 + }, + { + "epoch": 0.508682874699439, + "grad_norm": 4.5963358879089355, + "learning_rate": 2.8541043210389726e-07, + "logits/chosen": -0.9011512994766235, + "logits/rejected": -0.9799545407295227, + "logps/chosen": -0.30112171173095703, + "logps/rejected": -0.4484432339668274, + "loss": 1.4859, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7528042793273926, + "rewards/margins": 0.36830389499664307, + "rewards/rejected": -1.1211082935333252, + "step": 238 + }, + { + "epoch": 0.5108201977023777, + "grad_norm": 5.283090591430664, + "learning_rate": 2.8355831645441387e-07, + "logits/chosen": -1.2146248817443848, + "logits/rejected": -1.2574325799942017, + "logps/chosen": -0.3450472354888916, + "logps/rejected": -0.4763634204864502, + "loss": 1.4888, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.862618088722229, + "rewards/margins": 0.32829049229621887, + "rewards/rejected": -1.1909085512161255, + "step": 239 + }, + { + "epoch": 0.5129575207053166, + "grad_norm": 6.43093729019165, + "learning_rate": 2.817043232212371e-07, + "logits/chosen": -1.2071186304092407, + "logits/rejected": -1.1450533866882324, + "logps/chosen": -0.3647967576980591, + "logps/rejected": -0.4625674784183502, + "loss": 1.5268, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9119919538497925, + "rewards/margins": 0.24442686140537262, + "rewards/rejected": -1.156418800354004, + "step": 240 + }, + { + "epoch": 0.5150948437082554, + "grad_norm": 9.717195510864258, + "learning_rate": 2.7984855613506106e-07, + "logits/chosen": -1.1946227550506592, + "logits/rejected": -1.1376502513885498, + "logps/chosen": -0.29764774441719055, + "logps/rejected": -0.302202045917511, + "loss": 1.5322, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.74411940574646, + "rewards/margins": 0.01138581894338131, + "rewards/rejected": -0.7555052042007446, + "step": 241 + }, + { + "epoch": 0.5172321667111942, + "grad_norm": 6.479928493499756, + "learning_rate": 2.7799111902582693e-07, + "logits/chosen": -1.2251317501068115, + "logits/rejected": -1.0719342231750488, + "logps/chosen": -0.3156971037387848, + "logps/rejected": -0.2400185763835907, + "loss": 1.5725, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.789242684841156, + "rewards/margins": -0.18919625878334045, + "rewards/rejected": -0.6000465154647827, + "step": 242 + }, + { + "epoch": 0.5193694897141331, + "grad_norm": 8.900924682617188, + "learning_rate": 2.761321158169134e-07, + "logits/chosen": -1.0849740505218506, + "logits/rejected": -1.1516170501708984, + "logps/chosen": -0.35676899552345276, + "logps/rejected": -0.5772523283958435, + "loss": 1.4924, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8919224739074707, + "rewards/margins": 0.5512083172798157, + "rewards/rejected": -1.4431307315826416, + "step": 243 + }, + { + "epoch": 0.5215068127170719, + "grad_norm": 5.021285533905029, + "learning_rate": 2.74271650519322e-07, + "logits/chosen": -1.1510225534439087, + "logits/rejected": -1.1225014925003052, + "logps/chosen": -0.3502144515514374, + "logps/rejected": -0.48459944128990173, + "loss": 1.5383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8755362033843994, + "rewards/margins": 0.3359624445438385, + "rewards/rejected": -1.2114986181259155, + "step": 244 + }, + { + "epoch": 0.5236441357200107, + "grad_norm": 6.033867359161377, + "learning_rate": 2.7240982722585837e-07, + "logits/chosen": -1.0076422691345215, + "logits/rejected": -1.0045421123504639, + "logps/chosen": -0.3226780295372009, + "logps/rejected": -0.37450891733169556, + "loss": 1.5745, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8066950440406799, + "rewards/margins": 0.1295773833990097, + "rewards/rejected": -0.9362723231315613, + "step": 245 + }, + { + "epoch": 0.5257814587229495, + "grad_norm": 7.610095977783203, + "learning_rate": 2.705467501053076e-07, + "logits/chosen": -1.3070695400238037, + "logits/rejected": -1.360163688659668, + "logps/chosen": -0.4132193624973297, + "logps/rejected": -0.5460841059684753, + "loss": 1.5482, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.033048391342163, + "rewards/margins": 0.3321617841720581, + "rewards/rejected": -1.3652101755142212, + "step": 246 + }, + { + "epoch": 0.5279187817258884, + "grad_norm": 5.6094770431518555, + "learning_rate": 2.6868252339660607e-07, + "logits/chosen": -0.9480774998664856, + "logits/rejected": -0.9445351362228394, + "logps/chosen": -0.5733252763748169, + "logps/rejected": -1.0633982419967651, + "loss": 1.5284, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.433313250541687, + "rewards/margins": 1.225182294845581, + "rewards/rejected": -2.6584954261779785, + "step": 247 + }, + { + "epoch": 0.5300561047288271, + "grad_norm": 16.856760025024414, + "learning_rate": 2.6681725140300995e-07, + "logits/chosen": -1.1925255060195923, + "logits/rejected": -1.1129463911056519, + "logps/chosen": -0.28040605783462524, + "logps/rejected": -0.36458098888397217, + "loss": 1.5368, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7010151743888855, + "rewards/margins": 0.21043738722801208, + "rewards/rejected": -0.9114525318145752, + "step": 248 + }, + { + "epoch": 0.5321934277317659, + "grad_norm": 6.675515174865723, + "learning_rate": 2.6495103848625854e-07, + "logits/chosen": -1.2934060096740723, + "logits/rejected": -1.17371666431427, + "logps/chosen": -0.3621112108230591, + "logps/rejected": -0.4810316562652588, + "loss": 1.5586, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9052779674530029, + "rewards/margins": 0.29730114340782166, + "rewards/rejected": -1.2025790214538574, + "step": 249 + }, + { + "epoch": 0.5343307507347048, + "grad_norm": 15.49481201171875, + "learning_rate": 2.63083989060736e-07, + "logits/chosen": -1.019038438796997, + "logits/rejected": -0.9999558925628662, + "logps/chosen": -0.431622713804245, + "logps/rejected": -0.620360255241394, + "loss": 1.55, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0790568590164185, + "rewards/margins": 0.47184401750564575, + "rewards/rejected": -1.550900936126709, + "step": 250 + }, + { + "epoch": 0.5364680737376436, + "grad_norm": 4.874868392944336, + "learning_rate": 2.6121620758762875e-07, + "logits/chosen": -1.1522804498672485, + "logits/rejected": -1.144692301750183, + "logps/chosen": -0.40826401114463806, + "logps/rejected": -0.4715278744697571, + "loss": 1.5317, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0206599235534668, + "rewards/margins": 0.15815965831279755, + "rewards/rejected": -1.1788195371627808, + "step": 251 + }, + { + "epoch": 0.5386053967405824, + "grad_norm": 4.113776206970215, + "learning_rate": 2.593477985690815e-07, + "logits/chosen": -1.0712709426879883, + "logits/rejected": -1.1005451679229736, + "logps/chosen": -0.5715100765228271, + "logps/rejected": -0.6493417620658875, + "loss": 1.5129, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4287753105163574, + "rewards/margins": 0.19457919895648956, + "rewards/rejected": -1.6233545541763306, + "step": 252 + }, + { + "epoch": 0.5407427197435213, + "grad_norm": 17.1635799407959, + "learning_rate": 2.574788665423496e-07, + "logits/chosen": -0.9928967356681824, + "logits/rejected": -0.9838371276855469, + "logps/chosen": -0.3351861536502838, + "logps/rejected": -0.3290242850780487, + "loss": 1.5488, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.8379653692245483, + "rewards/margins": -0.01540469378232956, + "rewards/rejected": -0.822560727596283, + "step": 253 + }, + { + "epoch": 0.5428800427464601, + "grad_norm": 8.28346061706543, + "learning_rate": 2.5560951607395126e-07, + "logits/chosen": -1.1226708889007568, + "logits/rejected": -1.0680346488952637, + "logps/chosen": -0.3342251777648926, + "logps/rejected": -0.3822442591190338, + "loss": 1.5604, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8355628252029419, + "rewards/margins": 0.12004776298999786, + "rewards/rejected": -0.9556106925010681, + "step": 254 + }, + { + "epoch": 0.5450173657493989, + "grad_norm": 12.173513412475586, + "learning_rate": 2.537398517538159e-07, + "logits/chosen": -1.1180171966552734, + "logits/rejected": -1.1232236623764038, + "logps/chosen": -0.3291173279285431, + "logps/rejected": -0.5288177132606506, + "loss": 1.4907, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8227933645248413, + "rewards/margins": 0.49925094842910767, + "rewards/rejected": -1.3220442533493042, + "step": 255 + }, + { + "epoch": 0.5471546887523377, + "grad_norm": 4.996148109436035, + "learning_rate": 2.518699781894332e-07, + "logits/chosen": -1.0864285230636597, + "logits/rejected": -1.0856531858444214, + "logps/chosen": -0.46618932485580444, + "logps/rejected": -0.9816129803657532, + "loss": 1.5076, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.165473222732544, + "rewards/margins": 1.2885593175888062, + "rewards/rejected": -2.4540326595306396, + "step": 256 + }, + { + "epoch": 0.5492920117552765, + "grad_norm": 5.049304008483887, + "learning_rate": 2.5e-07, + "logits/chosen": -0.9920480251312256, + "logits/rejected": -0.897991418838501, + "logps/chosen": -0.3009772002696991, + "logps/rejected": -0.3982135057449341, + "loss": 1.6439, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7524430155754089, + "rewards/margins": 0.24309074878692627, + "rewards/rejected": -0.9955337643623352, + "step": 257 + }, + { + "epoch": 0.5514293347582153, + "grad_norm": 4.575491428375244, + "learning_rate": 2.4813002181056676e-07, + "logits/chosen": -1.0483980178833008, + "logits/rejected": -1.040475845336914, + "logps/chosen": -0.2760324478149414, + "logps/rejected": -0.5634697675704956, + "loss": 1.5932, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6900811791419983, + "rewards/margins": 0.718593180179596, + "rewards/rejected": -1.4086742401123047, + "step": 258 + }, + { + "epoch": 0.5535666577611541, + "grad_norm": 4.031703948974609, + "learning_rate": 2.4626014824618413e-07, + "logits/chosen": -1.2272746562957764, + "logits/rejected": -1.2073853015899658, + "logps/chosen": -0.4353184700012207, + "logps/rejected": -0.5070162415504456, + "loss": 1.5153, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0882960557937622, + "rewards/margins": 0.17924460768699646, + "rewards/rejected": -1.2675405740737915, + "step": 259 + }, + { + "epoch": 0.555703980764093, + "grad_norm": 8.027057647705078, + "learning_rate": 2.4439048392604877e-07, + "logits/chosen": -0.953754186630249, + "logits/rejected": -0.9900184869766235, + "logps/chosen": -0.2740909159183502, + "logps/rejected": -0.3458973467350006, + "loss": 1.5291, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6852273344993591, + "rewards/margins": 0.17951609194278717, + "rewards/rejected": -0.8647434711456299, + "step": 260 + }, + { + "epoch": 0.5578413037670318, + "grad_norm": 7.306129455566406, + "learning_rate": 2.4252113345765043e-07, + "logits/chosen": -0.9035928845405579, + "logits/rejected": -0.8614873290061951, + "logps/chosen": -0.2865443229675293, + "logps/rejected": -0.32079729437828064, + "loss": 1.5665, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7163608074188232, + "rewards/margins": 0.0856325626373291, + "rewards/rejected": -0.8019933104515076, + "step": 261 + }, + { + "epoch": 0.5599786267699706, + "grad_norm": 3.2643260955810547, + "learning_rate": 2.406522014309186e-07, + "logits/chosen": -1.1808401346206665, + "logits/rejected": -1.1874431371688843, + "logps/chosen": -0.5122575163841248, + "logps/rejected": -0.8233806490898132, + "loss": 1.5851, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2806435823440552, + "rewards/margins": 0.7778077721595764, + "rewards/rejected": -2.0584514141082764, + "step": 262 + }, + { + "epoch": 0.5621159497729095, + "grad_norm": 3.0484321117401123, + "learning_rate": 2.3878379241237134e-07, + "logits/chosen": -1.1015522480010986, + "logits/rejected": -1.1043397188186646, + "logps/chosen": -0.5216892957687378, + "logps/rejected": -0.5477871298789978, + "loss": 1.4888, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3042232990264893, + "rewards/margins": 0.06524449586868286, + "rewards/rejected": -1.3694677352905273, + "step": 263 + }, + { + "epoch": 0.5642532727758482, + "grad_norm": 8.040013313293457, + "learning_rate": 2.3691601093926402e-07, + "logits/chosen": -1.0679914951324463, + "logits/rejected": -1.041649580001831, + "logps/chosen": -0.4239467978477478, + "logps/rejected": -0.427889347076416, + "loss": 1.6854, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0598669052124023, + "rewards/margins": 0.009856484830379486, + "rewards/rejected": -1.06972336769104, + "step": 264 + }, + { + "epoch": 0.566390595778787, + "grad_norm": 10.049110412597656, + "learning_rate": 2.3504896151374144e-07, + "logits/chosen": -1.1767913103103638, + "logits/rejected": -1.2240692377090454, + "logps/chosen": -0.4159534275531769, + "logps/rejected": -0.5419010519981384, + "loss": 1.5352, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0398834943771362, + "rewards/margins": 0.3148690462112427, + "rewards/rejected": -1.3547526597976685, + "step": 265 + }, + { + "epoch": 0.5685279187817259, + "grad_norm": 8.153444290161133, + "learning_rate": 2.3318274859699008e-07, + "logits/chosen": -1.063308596611023, + "logits/rejected": -1.164639949798584, + "logps/chosen": -0.2907513678073883, + "logps/rejected": -0.5243133306503296, + "loss": 1.6477, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7268784046173096, + "rewards/margins": 0.5839048624038696, + "rewards/rejected": -1.3107832670211792, + "step": 266 + }, + { + "epoch": 0.5706652417846647, + "grad_norm": 8.460691452026367, + "learning_rate": 2.3131747660339394e-07, + "logits/chosen": -1.2165307998657227, + "logits/rejected": -1.1944361925125122, + "logps/chosen": -0.5601080656051636, + "logps/rejected": -0.47026118636131287, + "loss": 1.5634, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4002699851989746, + "rewards/margins": -0.2246171087026596, + "rewards/rejected": -1.1756529808044434, + "step": 267 + }, + { + "epoch": 0.5728025647876035, + "grad_norm": 13.028862953186035, + "learning_rate": 2.2945324989469243e-07, + "logits/chosen": -1.0125137567520142, + "logits/rejected": -0.9787082672119141, + "logps/chosen": -0.3832467794418335, + "logps/rejected": -0.7757288217544556, + "loss": 1.4993, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.958116888999939, + "rewards/margins": 0.9812053442001343, + "rewards/rejected": -1.9393221139907837, + "step": 268 + }, + { + "epoch": 0.5749398877905424, + "grad_norm": 10.404675483703613, + "learning_rate": 2.2759017277414164e-07, + "logits/chosen": -1.1808825731277466, + "logits/rejected": -1.1194167137145996, + "logps/chosen": -0.43756401538848877, + "logps/rejected": -0.3946504294872284, + "loss": 1.6378, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.0939099788665771, + "rewards/margins": -0.10728396475315094, + "rewards/rejected": -0.9866260290145874, + "step": 269 + }, + { + "epoch": 0.5770772107934812, + "grad_norm": 7.459733963012695, + "learning_rate": 2.2572834948067795e-07, + "logits/chosen": -0.9175713062286377, + "logits/rejected": -0.9572230577468872, + "logps/chosen": -0.2940235137939453, + "logps/rejected": -0.3464244604110718, + "loss": 1.6275, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7350587844848633, + "rewards/margins": 0.13100232183933258, + "rewards/rejected": -0.8660610914230347, + "step": 270 + }, + { + "epoch": 0.57921453379642, + "grad_norm": 6.2825493812561035, + "learning_rate": 2.2386788418308665e-07, + "logits/chosen": -1.0154887437820435, + "logits/rejected": -1.0528539419174194, + "logps/chosen": -0.5251376628875732, + "logps/rejected": -0.7547603845596313, + "loss": 1.5214, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3128442764282227, + "rewards/margins": 0.5740568041801453, + "rewards/rejected": -1.8869010210037231, + "step": 271 + }, + { + "epoch": 0.5813518567993589, + "grad_norm": 4.264716148376465, + "learning_rate": 2.2200888097417302e-07, + "logits/chosen": -1.043276071548462, + "logits/rejected": -0.9186975955963135, + "logps/chosen": -0.39481961727142334, + "logps/rejected": -0.5242050886154175, + "loss": 1.5337, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9870489835739136, + "rewards/margins": 0.32346370816230774, + "rewards/rejected": -1.310512661933899, + "step": 272 + }, + { + "epoch": 0.5834891798022976, + "grad_norm": 4.6611199378967285, + "learning_rate": 2.2015144386493895e-07, + "logits/chosen": -0.9979356527328491, + "logits/rejected": -0.9526849985122681, + "logps/chosen": -0.39222562313079834, + "logps/rejected": -0.46204763650894165, + "loss": 1.4999, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9805639982223511, + "rewards/margins": 0.1745550036430359, + "rewards/rejected": -1.1551190614700317, + "step": 273 + }, + { + "epoch": 0.5856265028052364, + "grad_norm": 4.630171298980713, + "learning_rate": 2.1829567677876297e-07, + "logits/chosen": -0.9676195979118347, + "logits/rejected": -0.9666755199432373, + "logps/chosen": -0.35061851143836975, + "logps/rejected": -0.35894879698753357, + "loss": 1.609, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.8765462636947632, + "rewards/margins": 0.020825695246458054, + "rewards/rejected": -0.8973720073699951, + "step": 274 + }, + { + "epoch": 0.5877638258081752, + "grad_norm": 11.871662139892578, + "learning_rate": 2.164416835455862e-07, + "logits/chosen": -0.7467477321624756, + "logits/rejected": -0.6393258571624756, + "logps/chosen": -0.502008318901062, + "logps/rejected": -0.44732266664505005, + "loss": 1.5655, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2550209760665894, + "rewards/margins": -0.13671430945396423, + "rewards/rejected": -1.1183066368103027, + "step": 275 + }, + { + "epoch": 0.5899011488111141, + "grad_norm": 3.6022424697875977, + "learning_rate": 2.1458956789610277e-07, + "logits/chosen": -1.2034939527511597, + "logits/rejected": -1.0202971696853638, + "logps/chosen": -0.3793608248233795, + "logps/rejected": -0.33719444274902344, + "loss": 1.5627, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9484022259712219, + "rewards/margins": -0.10541604459285736, + "rewards/rejected": -0.8429861068725586, + "step": 276 + }, + { + "epoch": 0.5920384718140529, + "grad_norm": 6.574893474578857, + "learning_rate": 2.1273943345595635e-07, + "logits/chosen": -1.2551283836364746, + "logits/rejected": -1.2000017166137695, + "logps/chosen": -0.4082186818122864, + "logps/rejected": -0.6158214807510376, + "loss": 1.5529, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0205466747283936, + "rewards/margins": 0.51900714635849, + "rewards/rejected": -1.5395537614822388, + "step": 277 + }, + { + "epoch": 0.5941757948169917, + "grad_norm": 5.325026035308838, + "learning_rate": 2.1089138373994222e-07, + "logits/chosen": -1.0981683731079102, + "logits/rejected": -1.093741774559021, + "logps/chosen": -0.4150196313858032, + "logps/rejected": -0.5339372754096985, + "loss": 1.5688, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0375490188598633, + "rewards/margins": 0.2972941994667053, + "rewards/rejected": -1.3348432779312134, + "step": 278 + }, + { + "epoch": 0.5963131178199306, + "grad_norm": 17.316631317138672, + "learning_rate": 2.0904552214621556e-07, + "logits/chosen": -1.1414576768875122, + "logits/rejected": -1.1112793684005737, + "logps/chosen": -0.6599245071411133, + "logps/rejected": -0.3380126357078552, + "loss": 1.6195, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6498112678527832, + "rewards/margins": -0.8047796487808228, + "rewards/rejected": -0.8450315594673157, + "step": 279 + }, + { + "epoch": 0.5984504408228694, + "grad_norm": 4.380247116088867, + "learning_rate": 2.072019519505062e-07, + "logits/chosen": -0.9662964940071106, + "logits/rejected": -0.9849826693534851, + "logps/chosen": -0.36699962615966797, + "logps/rejected": -0.3455093204975128, + "loss": 1.5053, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9174990653991699, + "rewards/margins": -0.053725723177194595, + "rewards/rejected": -0.8637734055519104, + "step": 280 + }, + { + "epoch": 0.6005877638258081, + "grad_norm": 7.828254699707031, + "learning_rate": 2.0536077630034085e-07, + "logits/chosen": -0.9694425463676453, + "logits/rejected": -0.8208516240119934, + "logps/chosen": -0.4557761251926422, + "logps/rejected": -0.6782093048095703, + "loss": 1.6184, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1394402980804443, + "rewards/margins": 0.5560829043388367, + "rewards/rejected": -1.6955231428146362, + "step": 281 + }, + { + "epoch": 0.602725086828747, + "grad_norm": 7.952332496643066, + "learning_rate": 2.0352209820927135e-07, + "logits/chosen": -0.9816855192184448, + "logits/rejected": -0.8845440149307251, + "logps/chosen": -0.3230597972869873, + "logps/rejected": -0.4072348475456238, + "loss": 1.5012, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8076494932174683, + "rewards/margins": 0.21043761074543, + "rewards/rejected": -1.0180871486663818, + "step": 282 + }, + { + "epoch": 0.6048624098316858, + "grad_norm": 8.255958557128906, + "learning_rate": 2.0168602055111173e-07, + "logits/chosen": -1.1203254461288452, + "logits/rejected": -1.107031226158142, + "logps/chosen": -0.6907448768615723, + "logps/rejected": -0.5776211023330688, + "loss": 1.6044, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7268621921539307, + "rewards/margins": -0.28280916810035706, + "rewards/rejected": -1.4440529346466064, + "step": 283 + }, + { + "epoch": 0.6069997328346246, + "grad_norm": 11.462747573852539, + "learning_rate": 1.998526460541818e-07, + "logits/chosen": -1.0083472728729248, + "logits/rejected": -0.9932087659835815, + "logps/chosen": -0.5044468641281128, + "logps/rejected": -0.4264739453792572, + "loss": 1.6586, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2611171007156372, + "rewards/margins": -0.19493228197097778, + "rewards/rejected": -1.0661848783493042, + "step": 284 + }, + { + "epoch": 0.6091370558375635, + "grad_norm": 4.765872478485107, + "learning_rate": 1.980220772955602e-07, + "logits/chosen": -1.035547137260437, + "logits/rejected": -1.0857359170913696, + "logps/chosen": -0.41963931918144226, + "logps/rejected": -0.592042863368988, + "loss": 1.5091, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.049098253250122, + "rewards/margins": 0.43100887537002563, + "rewards/rejected": -1.4801071882247925, + "step": 285 + }, + { + "epoch": 0.6112743788405023, + "grad_norm": 7.985474586486816, + "learning_rate": 1.961944166953445e-07, + "logits/chosen": -0.8251385688781738, + "logits/rejected": -0.9071054458618164, + "logps/chosen": -0.3732144236564636, + "logps/rejected": -0.4104728400707245, + "loss": 1.5169, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9330360889434814, + "rewards/margins": 0.09314604848623276, + "rewards/rejected": -1.0261821746826172, + "step": 286 + }, + { + "epoch": 0.6134117018434411, + "grad_norm": 7.316262722015381, + "learning_rate": 1.9436976651092142e-07, + "logits/chosen": -0.9544340372085571, + "logits/rejected": -0.898868203163147, + "logps/chosen": -0.35007724165916443, + "logps/rejected": -0.495257705450058, + "loss": 1.6104, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8751930594444275, + "rewards/margins": 0.3629511594772339, + "rewards/rejected": -1.2381441593170166, + "step": 287 + }, + { + "epoch": 0.6155490248463799, + "grad_norm": 3.980193614959717, + "learning_rate": 1.9254822883124517e-07, + "logits/chosen": -1.2356715202331543, + "logits/rejected": -1.1466394662857056, + "logps/chosen": -0.418282151222229, + "logps/rejected": -0.5082724094390869, + "loss": 1.5123, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0457054376602173, + "rewards/margins": 0.22497567534446716, + "rewards/rejected": -1.2706811428070068, + "step": 288 + }, + { + "epoch": 0.6176863478493188, + "grad_norm": 6.745012283325195, + "learning_rate": 1.9072990557112564e-07, + "logits/chosen": -1.2313592433929443, + "logits/rejected": -1.1524139642715454, + "logps/chosen": -0.34664469957351685, + "logps/rejected": -0.5482650399208069, + "loss": 1.5042, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8666118383407593, + "rewards/margins": 0.504050612449646, + "rewards/rejected": -1.3706625699996948, + "step": 289 + }, + { + "epoch": 0.6198236708522575, + "grad_norm": 9.481700897216797, + "learning_rate": 1.8891489846552644e-07, + "logits/chosen": -1.081266164779663, + "logits/rejected": -1.085394263267517, + "logps/chosen": -0.37148210406303406, + "logps/rejected": -0.5117133855819702, + "loss": 1.5743, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.928705096244812, + "rewards/margins": 0.35057833790779114, + "rewards/rejected": -1.2792835235595703, + "step": 290 + }, + { + "epoch": 0.6219609938551963, + "grad_norm": 5.410580635070801, + "learning_rate": 1.8710330906387286e-07, + "logits/chosen": -1.0288105010986328, + "logits/rejected": -1.0116032361984253, + "logps/chosen": -0.3644832670688629, + "logps/rejected": -0.4690641462802887, + "loss": 1.4934, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9112080931663513, + "rewards/margins": 0.261452317237854, + "rewards/rejected": -1.1726603507995605, + "step": 291 + }, + { + "epoch": 0.6240983168581352, + "grad_norm": 12.723451614379883, + "learning_rate": 1.8529523872436977e-07, + "logits/chosen": -1.0818991661071777, + "logits/rejected": -1.0822491645812988, + "logps/chosen": -0.2900945246219635, + "logps/rejected": -0.44144943356513977, + "loss": 1.6044, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7252362370491028, + "rewards/margins": 0.37838736176490784, + "rewards/rejected": -1.103623628616333, + "step": 292 + }, + { + "epoch": 0.626235639861074, + "grad_norm": 18.391582489013672, + "learning_rate": 1.8349078860833124e-07, + "logits/chosen": -1.2308346033096313, + "logits/rejected": -1.1760648488998413, + "logps/chosen": -0.6205483675003052, + "logps/rejected": -0.3508188724517822, + "loss": 1.6365, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.5513708591461182, + "rewards/margins": -0.6743236780166626, + "rewards/rejected": -0.8770472407341003, + "step": 293 + }, + { + "epoch": 0.6283729628640128, + "grad_norm": 4.619331359863281, + "learning_rate": 1.8169005967452e-07, + "logits/chosen": -1.2816352844238281, + "logits/rejected": -1.2922104597091675, + "logps/chosen": -0.3991982340812683, + "logps/rejected": -0.504127562046051, + "loss": 1.5704, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9979956150054932, + "rewards/margins": 0.2623233199119568, + "rewards/rejected": -1.2603188753128052, + "step": 294 + }, + { + "epoch": 0.6305102858669517, + "grad_norm": 5.379226207733154, + "learning_rate": 1.7989315267349933e-07, + "logits/chosen": -1.0375932455062866, + "logits/rejected": -0.987227737903595, + "logps/chosen": -0.5034650564193726, + "logps/rejected": -0.6317480206489563, + "loss": 1.5085, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2586627006530762, + "rewards/margins": 0.3207073211669922, + "rewards/rejected": -1.5793699026107788, + "step": 295 + }, + { + "epoch": 0.6326476088698905, + "grad_norm": 15.063772201538086, + "learning_rate": 1.781001681419957e-07, + "logits/chosen": -1.0075099468231201, + "logits/rejected": -0.9762495160102844, + "logps/chosen": -0.5012757778167725, + "logps/rejected": -0.4941572844982147, + "loss": 1.4992, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2531894445419312, + "rewards/margins": -0.01779627427458763, + "rewards/rejected": -1.2353932857513428, + "step": 296 + }, + { + "epoch": 0.6347849318728293, + "grad_norm": 6.187438488006592, + "learning_rate": 1.763112063972739e-07, + "logits/chosen": -1.086329460144043, + "logits/rejected": -1.003612995147705, + "logps/chosen": -0.43468421697616577, + "logps/rejected": -0.623386800289154, + "loss": 1.5243, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0867105722427368, + "rewards/margins": 0.4717563986778259, + "rewards/rejected": -1.5584670305252075, + "step": 297 + }, + { + "epoch": 0.6369222548757681, + "grad_norm": 16.858556747436523, + "learning_rate": 1.745263675315245e-07, + "logits/chosen": -0.9435803890228271, + "logits/rejected": -0.9401760697364807, + "logps/chosen": -0.41162610054016113, + "logps/rejected": -0.7601633667945862, + "loss": 1.5119, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0290653705596924, + "rewards/margins": 0.871343195438385, + "rewards/rejected": -1.9004085063934326, + "step": 298 + }, + { + "epoch": 0.6390595778787069, + "grad_norm": 9.424429893493652, + "learning_rate": 1.7274575140626315e-07, + "logits/chosen": -1.1454746723175049, + "logits/rejected": -1.0624852180480957, + "logps/chosen": -0.5017335414886475, + "logps/rejected": -0.5628975629806519, + "loss": 1.6427, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2543339729309082, + "rewards/margins": 0.1529100388288498, + "rewards/rejected": -1.4072438478469849, + "step": 299 + }, + { + "epoch": 0.6411969008816457, + "grad_norm": 5.96793794631958, + "learning_rate": 1.7096945764674398e-07, + "logits/chosen": -0.9207834005355835, + "logits/rejected": -0.9202168583869934, + "logps/chosen": -0.3956447243690491, + "logps/rejected": -0.40050509572029114, + "loss": 1.6262, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9891117811203003, + "rewards/margins": 0.012150941416621208, + "rewards/rejected": -1.0012627840042114, + "step": 300 + }, + { + "epoch": 0.6433342238845846, + "grad_norm": 5.713474750518799, + "learning_rate": 1.6919758563638502e-07, + "logits/chosen": -0.9401556253433228, + "logits/rejected": -0.8605173826217651, + "logps/chosen": -0.4167335331439972, + "logps/rejected": -0.5897922515869141, + "loss": 1.527, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.041833758354187, + "rewards/margins": 0.4326467216014862, + "rewards/rejected": -1.4744806289672852, + "step": 301 + }, + { + "epoch": 0.6454715468875234, + "grad_norm": 11.05538272857666, + "learning_rate": 1.674302345112083e-07, + "logits/chosen": -1.0368751287460327, + "logits/rejected": -1.1599576473236084, + "logps/chosen": -0.42875924706459045, + "logps/rejected": -0.7052878737449646, + "loss": 1.4713, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.071898102760315, + "rewards/margins": 0.6913214325904846, + "rewards/rejected": -1.7632195949554443, + "step": 302 + }, + { + "epoch": 0.6476088698904622, + "grad_norm": 7.134030818939209, + "learning_rate": 1.656675031542925e-07, + "logits/chosen": -1.1293184757232666, + "logits/rejected": -1.1065864562988281, + "logps/chosen": -0.4180205166339874, + "logps/rejected": -0.4501388669013977, + "loss": 1.5318, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.045051097869873, + "rewards/margins": 0.08029599487781525, + "rewards/rejected": -1.1253471374511719, + "step": 303 + }, + { + "epoch": 0.649746192893401, + "grad_norm": 9.838216781616211, + "learning_rate": 1.6390949019024118e-07, + "logits/chosen": -1.2255228757858276, + "logits/rejected": -1.0440919399261475, + "logps/chosen": -0.34637248516082764, + "logps/rejected": -0.332830548286438, + "loss": 1.4995, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8659312725067139, + "rewards/margins": -0.0338548980653286, + "rewards/rejected": -0.8320763111114502, + "step": 304 + }, + { + "epoch": 0.6518835158963399, + "grad_norm": 3.633300542831421, + "learning_rate": 1.621562939796643e-07, + "logits/chosen": -1.0595769882202148, + "logits/rejected": -1.060139536857605, + "logps/chosen": -0.4522903561592102, + "logps/rejected": -0.7426398992538452, + "loss": 1.5239, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1307260990142822, + "rewards/margins": 0.7258738279342651, + "rewards/rejected": -1.8565996885299683, + "step": 305 + }, + { + "epoch": 0.6540208388992786, + "grad_norm": 6.867697715759277, + "learning_rate": 1.6040801261367493e-07, + "logits/chosen": -1.1086713075637817, + "logits/rejected": -1.1967618465423584, + "logps/chosen": -0.42471063137054443, + "logps/rejected": -0.4879637360572815, + "loss": 1.4941, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0617766380310059, + "rewards/margins": 0.15813271701335907, + "rewards/rejected": -1.2199093103408813, + "step": 306 + }, + { + "epoch": 0.6561581619022174, + "grad_norm": 9.760129928588867, + "learning_rate": 1.5866474390840124e-07, + "logits/chosen": -1.0829546451568604, + "logits/rejected": -1.0814926624298096, + "logps/chosen": -0.4132816791534424, + "logps/rejected": -0.586501955986023, + "loss": 1.5785, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.033204197883606, + "rewards/margins": 0.43305063247680664, + "rewards/rejected": -1.466254711151123, + "step": 307 + }, + { + "epoch": 0.6582954849051563, + "grad_norm": 4.1318511962890625, + "learning_rate": 1.569265853995137e-07, + "logits/chosen": -0.9130831360816956, + "logits/rejected": -1.0174808502197266, + "logps/chosen": -0.3561224639415741, + "logps/rejected": -0.47521257400512695, + "loss": 1.4836, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8903061747550964, + "rewards/margins": 0.2977251708507538, + "rewards/rejected": -1.1880314350128174, + "step": 308 + }, + { + "epoch": 0.6604328079080951, + "grad_norm": 5.264586925506592, + "learning_rate": 1.5519363433676791e-07, + "logits/chosen": -1.2580910921096802, + "logits/rejected": -1.2599362134933472, + "logps/chosen": -0.42455989122390747, + "logps/rejected": -0.5628042221069336, + "loss": 1.6118, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0613996982574463, + "rewards/margins": 0.34561091661453247, + "rewards/rejected": -1.4070106744766235, + "step": 309 + }, + { + "epoch": 0.6625701309110339, + "grad_norm": 19.9966983795166, + "learning_rate": 1.5346598767856345e-07, + "logits/chosen": -0.8979520201683044, + "logits/rejected": -0.9155081510543823, + "logps/chosen": -0.335290789604187, + "logps/rejected": -0.43495267629623413, + "loss": 1.6585, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8382269740104675, + "rewards/margins": 0.24915480613708496, + "rewards/rejected": -1.0873818397521973, + "step": 310 + }, + { + "epoch": 0.6647074539139728, + "grad_norm": 10.383206367492676, + "learning_rate": 1.517437420865191e-07, + "logits/chosen": -1.4863961935043335, + "logits/rejected": -1.2771762609481812, + "logps/chosen": -0.33609017729759216, + "logps/rejected": -0.5989465713500977, + "loss": 1.5329, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8402254581451416, + "rewards/margins": 0.6571409106254578, + "rewards/rejected": -1.4973664283752441, + "step": 311 + }, + { + "epoch": 0.6668447769169116, + "grad_norm": 4.85746431350708, + "learning_rate": 1.500269939200648e-07, + "logits/chosen": -1.1300700902938843, + "logits/rejected": -1.1000648736953735, + "logps/chosen": -0.4227021336555481, + "logps/rejected": -0.3756105899810791, + "loss": 1.572, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0567553043365479, + "rewards/margins": -0.1177288144826889, + "rewards/rejected": -0.939026415348053, + "step": 312 + }, + { + "epoch": 0.6689820999198504, + "grad_norm": 15.065166473388672, + "learning_rate": 1.4831583923104998e-07, + "logits/chosen": -1.2687509059906006, + "logits/rejected": -1.2387371063232422, + "logps/chosen": -0.37272506952285767, + "logps/rejected": -0.4628967344760895, + "loss": 1.5731, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9318127632141113, + "rewards/margins": 0.22542911767959595, + "rewards/rejected": -1.157241940498352, + "step": 313 + }, + { + "epoch": 0.6711194229227893, + "grad_norm": 4.742990970611572, + "learning_rate": 1.4661037375836987e-07, + "logits/chosen": -1.1166198253631592, + "logits/rejected": -1.1708546876907349, + "logps/chosen": -0.40323004126548767, + "logps/rejected": -0.49593961238861084, + "loss": 1.5854, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0080751180648804, + "rewards/margins": 0.2317739725112915, + "rewards/rejected": -1.2398490905761719, + "step": 314 + }, + { + "epoch": 0.673256745925728, + "grad_norm": 6.135270595550537, + "learning_rate": 1.4491069292260866e-07, + "logits/chosen": -1.0454214811325073, + "logits/rejected": -0.9797170162200928, + "logps/chosen": -0.4790341258049011, + "logps/rejected": -0.5334821343421936, + "loss": 1.5961, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1975852251052856, + "rewards/margins": 0.13612008094787598, + "rewards/rejected": -1.3337054252624512, + "step": 315 + }, + { + "epoch": 0.6753940689286668, + "grad_norm": 5.529047012329102, + "learning_rate": 1.432168918207009e-07, + "logits/chosen": -0.9548214673995972, + "logits/rejected": -1.037723183631897, + "logps/chosen": -0.34955471754074097, + "logps/rejected": -0.6267892122268677, + "loss": 1.5352, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.87388676404953, + "rewards/margins": 0.6930862069129944, + "rewards/rejected": -1.5669729709625244, + "step": 316 + }, + { + "epoch": 0.6775313919316056, + "grad_norm": 4.692312240600586, + "learning_rate": 1.4152906522061047e-07, + "logits/chosen": -1.0882840156555176, + "logits/rejected": -1.0136744976043701, + "logps/chosen": -0.31677815318107605, + "logps/rejected": -0.42752817273139954, + "loss": 1.522, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7919453978538513, + "rewards/margins": 0.2768751382827759, + "rewards/rejected": -1.0688204765319824, + "step": 317 + }, + { + "epoch": 0.6796687149345445, + "grad_norm": 10.370941162109375, + "learning_rate": 1.3984730755602903e-07, + "logits/chosen": -1.161927580833435, + "logits/rejected": -1.0617276430130005, + "logps/chosen": -0.534487783908844, + "logps/rejected": -0.6021491289138794, + "loss": 1.5201, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3362195491790771, + "rewards/margins": 0.1691533476114273, + "rewards/rejected": -1.5053728818893433, + "step": 318 + }, + { + "epoch": 0.6818060379374833, + "grad_norm": 9.610735893249512, + "learning_rate": 1.381717129210918e-07, + "logits/chosen": -1.1923787593841553, + "logits/rejected": -1.2188538312911987, + "logps/chosen": -0.375224769115448, + "logps/rejected": -0.7198653817176819, + "loss": 1.5779, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9380618929862976, + "rewards/margins": 0.8616017699241638, + "rewards/rejected": -1.7996635437011719, + "step": 319 + }, + { + "epoch": 0.6839433609404221, + "grad_norm": 5.218975067138672, + "learning_rate": 1.365023750651133e-07, + "logits/chosen": -1.1556600332260132, + "logits/rejected": -1.095563530921936, + "logps/chosen": -0.37500467896461487, + "logps/rejected": -0.4297294020652771, + "loss": 1.5315, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9375116229057312, + "rewards/margins": 0.1368117332458496, + "rewards/rejected": -1.0743234157562256, + "step": 320 + }, + { + "epoch": 0.686080683943361, + "grad_norm": 6.690799236297607, + "learning_rate": 1.3483938738734195e-07, + "logits/chosen": -0.8860509395599365, + "logits/rejected": -0.8542050123214722, + "logps/chosen": -0.3055468201637268, + "logps/rejected": -0.3592091202735901, + "loss": 1.5345, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7638670802116394, + "rewards/margins": 0.13415566086769104, + "rewards/rejected": -0.8980227708816528, + "step": 321 + }, + { + "epoch": 0.6882180069462998, + "grad_norm": 9.405998229980469, + "learning_rate": 1.3318284293173449e-07, + "logits/chosen": -0.9992817640304565, + "logits/rejected": -0.9634179472923279, + "logps/chosen": -0.4300232529640198, + "logps/rejected": -0.4082144498825073, + "loss": 1.5484, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.075058102607727, + "rewards/margins": -0.05452210083603859, + "rewards/rejected": -1.0205360651016235, + "step": 322 + }, + { + "epoch": 0.6903553299492385, + "grad_norm": 5.07054328918457, + "learning_rate": 1.3153283438175034e-07, + "logits/chosen": -1.0605663061141968, + "logits/rejected": -1.0907377004623413, + "logps/chosen": -0.3852520287036896, + "logps/rejected": -0.4656079113483429, + "loss": 1.5678, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9631301164627075, + "rewards/margins": 0.2008897364139557, + "rewards/rejected": -1.1640198230743408, + "step": 323 + }, + { + "epoch": 0.6924926529521774, + "grad_norm": 7.885300159454346, + "learning_rate": 1.2988945405516565e-07, + "logits/chosen": -1.0609923601150513, + "logits/rejected": -1.0959070920944214, + "logps/chosen": -0.4137347638607025, + "logps/rejected": -0.5460120439529419, + "loss": 1.5155, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.034337043762207, + "rewards/margins": 0.33069324493408203, + "rewards/rejected": -1.3650301694869995, + "step": 324 + }, + { + "epoch": 0.6946299759551162, + "grad_norm": 5.987451553344727, + "learning_rate": 1.2825279389890818e-07, + "logits/chosen": -0.9947149753570557, + "logits/rejected": -1.0909423828125, + "logps/chosen": -0.4138132333755493, + "logps/rejected": -0.4551887512207031, + "loss": 1.4521, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0345330238342285, + "rewards/margins": 0.10343889147043228, + "rewards/rejected": -1.1379718780517578, + "step": 325 + }, + { + "epoch": 0.696767298958055, + "grad_norm": 5.460362434387207, + "learning_rate": 1.2662294548391328e-07, + "logits/chosen": -1.1488416194915771, + "logits/rejected": -0.9424848556518555, + "logps/chosen": -0.48600658774375916, + "logps/rejected": -0.7618072628974915, + "loss": 1.5489, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.215016484260559, + "rewards/margins": 0.6895018219947815, + "rewards/rejected": -1.9045181274414062, + "step": 326 + }, + { + "epoch": 0.6989046219609939, + "grad_norm": 10.205648422241211, + "learning_rate": 1.2500000000000005e-07, + "logits/chosen": -0.9662617444992065, + "logits/rejected": -1.1120105981826782, + "logps/chosen": -0.40102994441986084, + "logps/rejected": -0.6069145798683167, + "loss": 1.5435, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0025748014450073, + "rewards/margins": 0.5147115588188171, + "rewards/rejected": -1.5172864198684692, + "step": 327 + }, + { + "epoch": 0.7010419449639327, + "grad_norm": 8.57025146484375, + "learning_rate": 1.2338404825076935e-07, + "logits/chosen": -1.1456284523010254, + "logits/rejected": -1.0507254600524902, + "logps/chosen": -0.4316751956939697, + "logps/rejected": -0.44297924637794495, + "loss": 1.5203, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0791878700256348, + "rewards/margins": 0.02826026827096939, + "rewards/rejected": -1.1074482202529907, + "step": 328 + }, + { + "epoch": 0.7031792679668715, + "grad_norm": 9.405667304992676, + "learning_rate": 1.2177518064852345e-07, + "logits/chosen": -1.1795152425765991, + "logits/rejected": -1.0673104524612427, + "logps/chosen": -0.37221118807792664, + "logps/rejected": -0.4177697002887726, + "loss": 1.5138, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.930527925491333, + "rewards/margins": 0.11389636248350143, + "rewards/rejected": -1.044424295425415, + "step": 329 + }, + { + "epoch": 0.7053165909698104, + "grad_norm": 6.466906547546387, + "learning_rate": 1.201734872092077e-07, + "logits/chosen": -1.0346126556396484, + "logits/rejected": -1.0439854860305786, + "logps/chosen": -0.42790815234184265, + "logps/rejected": -1.0795375108718872, + "loss": 1.4512, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0697704553604126, + "rewards/margins": 1.6290733814239502, + "rewards/rejected": -2.6988439559936523, + "step": 330 + }, + { + "epoch": 0.7074539139727491, + "grad_norm": 4.390846252441406, + "learning_rate": 1.185790575473738e-07, + "logits/chosen": -1.1390395164489746, + "logits/rejected": -1.1135765314102173, + "logps/chosen": -0.5658525824546814, + "logps/rejected": -0.5814335942268372, + "loss": 1.4646, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.4146316051483154, + "rewards/margins": 0.03895253688097, + "rewards/rejected": -1.45358407497406, + "step": 331 + }, + { + "epoch": 0.7095912369756879, + "grad_norm": 5.300051212310791, + "learning_rate": 1.1699198087116588e-07, + "logits/chosen": -1.015911340713501, + "logits/rejected": -1.11297607421875, + "logps/chosen": -0.4078459143638611, + "logps/rejected": -0.6018810868263245, + "loss": 1.5814, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.019614815711975, + "rewards/margins": 0.48508787155151367, + "rewards/rejected": -1.5047025680541992, + "step": 332 + }, + { + "epoch": 0.7117285599786267, + "grad_norm": 5.75014066696167, + "learning_rate": 1.1541234597732947e-07, + "logits/chosen": -1.0877783298492432, + "logits/rejected": -1.0997726917266846, + "logps/chosen": -0.3489132523536682, + "logps/rejected": -0.43700623512268066, + "loss": 1.4605, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8722831606864929, + "rewards/margins": 0.22023235261440277, + "rewards/rejected": -1.0925155878067017, + "step": 333 + }, + { + "epoch": 0.7138658829815656, + "grad_norm": 5.7690229415893555, + "learning_rate": 1.1384024124624322e-07, + "logits/chosen": -0.9815250635147095, + "logits/rejected": -0.9534754753112793, + "logps/chosen": -0.42468035221099854, + "logps/rejected": -0.42991912364959717, + "loss": 1.5096, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0617008209228516, + "rewards/margins": 0.013096902519464493, + "rewards/rejected": -1.0747978687286377, + "step": 334 + }, + { + "epoch": 0.7160032059845044, + "grad_norm": 6.530084609985352, + "learning_rate": 1.1227575463697439e-07, + "logits/chosen": -1.0770314931869507, + "logits/rejected": -1.1593232154846191, + "logps/chosen": -0.41134944558143616, + "logps/rejected": -0.46647369861602783, + "loss": 1.5333, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0283737182617188, + "rewards/margins": 0.1378105729818344, + "rewards/rejected": -1.1661843061447144, + "step": 335 + }, + { + "epoch": 0.7181405289874432, + "grad_norm": 6.972837448120117, + "learning_rate": 1.1071897368235694e-07, + "logits/chosen": -1.0634236335754395, + "logits/rejected": -1.1836671829223633, + "logps/chosen": -0.35514572262763977, + "logps/rejected": -0.6676814556121826, + "loss": 1.4464, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8878642320632935, + "rewards/margins": 0.781339168548584, + "rewards/rejected": -1.6692036390304565, + "step": 336 + }, + { + "epoch": 0.7202778519903821, + "grad_norm": 6.366727828979492, + "learning_rate": 1.0916998548409447e-07, + "logits/chosen": -1.0254015922546387, + "logits/rejected": -0.9457456469535828, + "logps/chosen": -0.3946457803249359, + "logps/rejected": -0.4669502377510071, + "loss": 1.617, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.986614465713501, + "rewards/margins": 0.1807611733675003, + "rewards/rejected": -1.1673755645751953, + "step": 337 + }, + { + "epoch": 0.7224151749933209, + "grad_norm": 6.886241436004639, + "learning_rate": 1.0762887670788701e-07, + "logits/chosen": -0.8809665441513062, + "logits/rejected": -0.7807790637016296, + "logps/chosen": -0.32795268297195435, + "logps/rejected": -0.4252588152885437, + "loss": 1.4383, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8198816776275635, + "rewards/margins": 0.2432653307914734, + "rewards/rejected": -1.063146948814392, + "step": 338 + }, + { + "epoch": 0.7245524979962596, + "grad_norm": 4.194058418273926, + "learning_rate": 1.0609573357858165e-07, + "logits/chosen": -1.1363167762756348, + "logits/rejected": -1.1581072807312012, + "logps/chosen": -0.4632134437561035, + "logps/rejected": -0.6518194079399109, + "loss": 1.6039, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1580334901809692, + "rewards/margins": 0.47151511907577515, + "rewards/rejected": -1.6295486688613892, + "step": 339 + }, + { + "epoch": 0.7266898209991985, + "grad_norm": 23.59259033203125, + "learning_rate": 1.0457064187534861e-07, + "logits/chosen": -1.0518946647644043, + "logits/rejected": -0.9888642430305481, + "logps/chosen": -0.407795786857605, + "logps/rejected": -0.45109352469444275, + "loss": 1.6088, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.0194894075393677, + "rewards/margins": 0.1082444041967392, + "rewards/rejected": -1.127733826637268, + "step": 340 + }, + { + "epoch": 0.7288271440021373, + "grad_norm": 5.5164079666137695, + "learning_rate": 1.0305368692688174e-07, + "logits/chosen": -1.0163506269454956, + "logits/rejected": -0.9497014284133911, + "logps/chosen": -0.43188926577568054, + "logps/rejected": -0.5361363291740417, + "loss": 1.5212, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0797233581542969, + "rewards/margins": 0.2606176435947418, + "rewards/rejected": -1.3403409719467163, + "step": 341 + }, + { + "epoch": 0.7309644670050761, + "grad_norm": 10.590493202209473, + "learning_rate": 1.0154495360662463e-07, + "logits/chosen": -0.7582399249076843, + "logits/rejected": -0.7725558280944824, + "logps/chosen": -0.3883350193500519, + "logps/rejected": -0.4608793556690216, + "loss": 1.4818, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9708375930786133, + "rewards/margins": 0.18136097490787506, + "rewards/rejected": -1.152198314666748, + "step": 342 + }, + { + "epoch": 0.733101790008015, + "grad_norm": 5.3863677978515625, + "learning_rate": 1.0004452632802158e-07, + "logits/chosen": -1.0777875185012817, + "logits/rejected": -1.0332427024841309, + "logps/chosen": -0.6136234402656555, + "logps/rejected": -0.6208893656730652, + "loss": 1.4504, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5340585708618164, + "rewards/margins": 0.018164925277233124, + "rewards/rejected": -1.552223563194275, + "step": 343 + }, + { + "epoch": 0.7352391130109538, + "grad_norm": 4.131237506866455, + "learning_rate": 9.855248903979505e-08, + "logits/chosen": -1.1028010845184326, + "logits/rejected": -1.063793420791626, + "logps/chosen": -0.4896419644355774, + "logps/rejected": -0.5126334428787231, + "loss": 1.496, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.224104881286621, + "rewards/margins": 0.05747878551483154, + "rewards/rejected": -1.2815836668014526, + "step": 344 + }, + { + "epoch": 0.7373764360138926, + "grad_norm": 6.84834623336792, + "learning_rate": 9.706892522124838e-08, + "logits/chosen": -1.0494110584259033, + "logits/rejected": -0.994105339050293, + "logps/chosen": -0.5251023173332214, + "logps/rejected": -0.5692360401153564, + "loss": 1.5667, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.312755823135376, + "rewards/margins": 0.11033419519662857, + "rewards/rejected": -1.4230899810791016, + "step": 345 + }, + { + "epoch": 0.7395137590168315, + "grad_norm": 7.503670692443848, + "learning_rate": 9.559391787759554e-08, + "logits/chosen": -1.304071307182312, + "logits/rejected": -1.1836615800857544, + "logps/chosen": -0.5006979703903198, + "logps/rejected": -0.4764450788497925, + "loss": 1.5827, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2517449855804443, + "rewards/margins": -0.06063230335712433, + "rewards/rejected": -1.191112756729126, + "step": 346 + }, + { + "epoch": 0.7416510820197703, + "grad_norm": 7.5821757316589355, + "learning_rate": 9.412754953531663e-08, + "logits/chosen": -1.0126221179962158, + "logits/rejected": -1.0061360597610474, + "logps/chosen": -0.536862850189209, + "logps/rejected": -0.7791385650634766, + "loss": 1.5476, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.342157244682312, + "rewards/margins": 0.6056893467903137, + "rewards/rejected": -1.9478464126586914, + "step": 347 + }, + { + "epoch": 0.743788405022709, + "grad_norm": 6.262937545776367, + "learning_rate": 9.266990223754067e-08, + "logits/chosen": -0.9866530895233154, + "logits/rejected": -1.087868571281433, + "logps/chosen": -0.35664424300193787, + "logps/rejected": -0.7218388915061951, + "loss": 1.5054, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8916106224060059, + "rewards/margins": 0.9129866361618042, + "rewards/rejected": -1.8045971393585205, + "step": 348 + }, + { + "epoch": 0.7459257280256478, + "grad_norm": 4.314223766326904, + "learning_rate": 9.12210575394553e-08, + "logits/chosen": -1.0736174583435059, + "logits/rejected": -1.0914644002914429, + "logps/chosen": -0.4206693172454834, + "logps/rejected": -0.4104337692260742, + "loss": 1.58, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0516732931137085, + "rewards/margins": -0.025588899850845337, + "rewards/rejected": -1.0260844230651855, + "step": 349 + }, + { + "epoch": 0.7480630510285867, + "grad_norm": 12.095142364501953, + "learning_rate": 8.978109650374396e-08, + "logits/chosen": -1.0621310472488403, + "logits/rejected": -1.0466017723083496, + "logps/chosen": -0.4226948618888855, + "logps/rejected": -0.46644341945648193, + "loss": 1.5575, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0567370653152466, + "rewards/margins": 0.10937156528234482, + "rewards/rejected": -1.16610848903656, + "step": 350 + }, + { + "epoch": 0.7502003740315255, + "grad_norm": 6.449507236480713, + "learning_rate": 8.835009969605011e-08, + "logits/chosen": -1.1138099431991577, + "logits/rejected": -1.0220037698745728, + "logps/chosen": -0.3483428359031677, + "logps/rejected": -0.3484461307525635, + "loss": 1.5018, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8708571195602417, + "rewards/margins": 0.0002581886947154999, + "rewards/rejected": -0.8711153268814087, + "step": 351 + }, + { + "epoch": 0.7523376970344643, + "grad_norm": 6.543509483337402, + "learning_rate": 8.692814718046978e-08, + "logits/chosen": -1.1011321544647217, + "logits/rejected": -1.0573300123214722, + "logps/chosen": -0.6217561364173889, + "logps/rejected": -0.5529008507728577, + "loss": 1.6158, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.55439031124115, + "rewards/margins": -0.1721382439136505, + "rewards/rejected": -1.3822520971298218, + "step": 352 + }, + { + "epoch": 0.7544750200374032, + "grad_norm": 6.191348552703857, + "learning_rate": 8.551531851507185e-08, + "logits/chosen": -1.0794535875320435, + "logits/rejected": -0.9287205934524536, + "logps/chosen": -0.3870071768760681, + "logps/rejected": -0.4184566140174866, + "loss": 1.5651, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9675179719924927, + "rewards/margins": 0.07862359285354614, + "rewards/rejected": -1.046141505241394, + "step": 353 + }, + { + "epoch": 0.756612343040342, + "grad_norm": 5.186205863952637, + "learning_rate": 8.411169274744723e-08, + "logits/chosen": -1.0084459781646729, + "logits/rejected": -1.0072932243347168, + "logps/chosen": -0.3501359224319458, + "logps/rejected": -0.4932965040206909, + "loss": 1.4669, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8753398656845093, + "rewards/margins": 0.35790154337882996, + "rewards/rejected": -1.233241319656372, + "step": 354 + }, + { + "epoch": 0.7587496660432808, + "grad_norm": 8.382218360900879, + "learning_rate": 8.271734841028552e-08, + "logits/chosen": -1.0127713680267334, + "logits/rejected": -1.0045888423919678, + "logps/chosen": -0.4923154413700104, + "logps/rejected": -0.4377010464668274, + "loss": 1.5886, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2307885885238647, + "rewards/margins": -0.13653598725795746, + "rewards/rejected": -1.094252586364746, + "step": 355 + }, + { + "epoch": 0.7608869890462197, + "grad_norm": 34.09469985961914, + "learning_rate": 8.133236351698142e-08, + "logits/chosen": -1.191988229751587, + "logits/rejected": -1.0870414972305298, + "logps/chosen": -0.5740368366241455, + "logps/rejected": -0.943623960018158, + "loss": 1.5604, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4350918531417847, + "rewards/margins": 0.9239681959152222, + "rewards/rejected": -2.359060049057007, + "step": 356 + }, + { + "epoch": 0.7630243120491584, + "grad_norm": 15.886775970458984, + "learning_rate": 7.99568155572701e-08, + "logits/chosen": -1.2273132801055908, + "logits/rejected": -1.1660319566726685, + "logps/chosen": -0.5724566578865051, + "logps/rejected": -0.6214060187339783, + "loss": 1.5502, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4311414957046509, + "rewards/margins": 0.12237339466810226, + "rewards/rejected": -1.5535149574279785, + "step": 357 + }, + { + "epoch": 0.7651616350520972, + "grad_norm": 5.216745376586914, + "learning_rate": 7.859078149289144e-08, + "logits/chosen": -1.0447226762771606, + "logits/rejected": -1.0748844146728516, + "logps/chosen": -0.3527478873729706, + "logps/rejected": -0.45652708411216736, + "loss": 1.4919, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8818696737289429, + "rewards/margins": 0.2594480514526367, + "rewards/rejected": -1.1413178443908691, + "step": 358 + }, + { + "epoch": 0.7672989580550361, + "grad_norm": 7.554361820220947, + "learning_rate": 7.723433775328384e-08, + "logits/chosen": -1.0472806692123413, + "logits/rejected": -1.2351243495941162, + "logps/chosen": -0.486628919839859, + "logps/rejected": -0.8915761709213257, + "loss": 1.5042, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2165722846984863, + "rewards/margins": 1.012368083000183, + "rewards/rejected": -2.22894024848938, + "step": 359 + }, + { + "epoch": 0.7694362810579749, + "grad_norm": 5.030363082885742, + "learning_rate": 7.588756023130833e-08, + "logits/chosen": -0.7821986079216003, + "logits/rejected": -0.8699120283126831, + "logps/chosen": -0.4785808324813843, + "logps/rejected": -0.5831509828567505, + "loss": 1.4942, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1964521408081055, + "rewards/margins": 0.261425256729126, + "rewards/rejected": -1.4578773975372314, + "step": 360 + }, + { + "epoch": 0.7715736040609137, + "grad_norm": 7.105344295501709, + "learning_rate": 7.455052427900213e-08, + "logits/chosen": -1.215461254119873, + "logits/rejected": -1.0217854976654053, + "logps/chosen": -0.3912314176559448, + "logps/rejected": -0.3650144636631012, + "loss": 1.6164, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9780785441398621, + "rewards/margins": -0.06554235517978668, + "rewards/rejected": -0.9125362038612366, + "step": 361 + }, + { + "epoch": 0.7737109270638525, + "grad_norm": 5.5884199142456055, + "learning_rate": 7.322330470336313e-08, + "logits/chosen": -1.0982252359390259, + "logits/rejected": -1.0146585702896118, + "logps/chosen": -0.32143884897232056, + "logps/rejected": -0.495330274105072, + "loss": 1.4397, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8035971522331238, + "rewards/margins": 0.4347284734249115, + "rewards/rejected": -1.2383257150650024, + "step": 362 + }, + { + "epoch": 0.7758482500667914, + "grad_norm": 10.065922737121582, + "learning_rate": 7.190597576216384e-08, + "logits/chosen": -0.8641526699066162, + "logits/rejected": -0.8766672015190125, + "logps/chosen": -0.46004733443260193, + "logps/rejected": -0.4904063940048218, + "loss": 1.5026, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.150118350982666, + "rewards/margins": 0.07589760422706604, + "rewards/rejected": -1.2260159254074097, + "step": 363 + }, + { + "epoch": 0.7779855730697302, + "grad_norm": 9.752578735351562, + "learning_rate": 7.059861115979701e-08, + "logits/chosen": -1.0331135988235474, + "logits/rejected": -1.0849862098693848, + "logps/chosen": -0.4356737434864044, + "logps/rejected": -0.4790940582752228, + "loss": 1.6159, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0891844034194946, + "rewards/margins": 0.1085507869720459, + "rewards/rejected": -1.1977351903915405, + "step": 364 + }, + { + "epoch": 0.7801228960726689, + "grad_norm": 6.089657783508301, + "learning_rate": 6.930128404315214e-08, + "logits/chosen": -1.0164854526519775, + "logits/rejected": -1.006756067276001, + "logps/chosen": -0.6724580526351929, + "logps/rejected": -0.6376264095306396, + "loss": 1.4724, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.6811450719833374, + "rewards/margins": -0.08707903325557709, + "rewards/rejected": -1.5940660238265991, + "step": 365 + }, + { + "epoch": 0.7822602190756078, + "grad_norm": 4.81856107711792, + "learning_rate": 6.801406699752229e-08, + "logits/chosen": -1.1662445068359375, + "logits/rejected": -1.0658493041992188, + "logps/chosen": -0.4783725440502167, + "logps/rejected": -0.4546273946762085, + "loss": 1.6376, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1959314346313477, + "rewards/margins": -0.059362899512052536, + "rewards/rejected": -1.1365684270858765, + "step": 366 + }, + { + "epoch": 0.7843975420785466, + "grad_norm": 9.418230056762695, + "learning_rate": 6.673703204254347e-08, + "logits/chosen": -1.2206920385360718, + "logits/rejected": -1.2510498762130737, + "logps/chosen": -0.594007670879364, + "logps/rejected": -0.8274646997451782, + "loss": 1.4951, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4850192070007324, + "rewards/margins": 0.5836424827575684, + "rewards/rejected": -2.068661689758301, + "step": 367 + }, + { + "epoch": 0.7865348650814854, + "grad_norm": 10.291698455810547, + "learning_rate": 6.547025062816486e-08, + "logits/chosen": -0.8960355520248413, + "logits/rejected": -0.9344096779823303, + "logps/chosen": -0.36661607027053833, + "logps/rejected": -0.43485212326049805, + "loss": 1.554, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.916540265083313, + "rewards/margins": 0.17059014737606049, + "rewards/rejected": -1.0871303081512451, + "step": 368 + }, + { + "epoch": 0.7886721880844243, + "grad_norm": 4.118884086608887, + "learning_rate": 6.42137936306514e-08, + "logits/chosen": -1.0395543575286865, + "logits/rejected": -0.9369036555290222, + "logps/chosen": -0.3800487816333771, + "logps/rejected": -0.35273048281669617, + "loss": 1.5453, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9501218795776367, + "rewards/margins": -0.06829574704170227, + "rewards/rejected": -0.8818261623382568, + "step": 369 + }, + { + "epoch": 0.7908095110873631, + "grad_norm": 10.194417953491211, + "learning_rate": 6.296773134861824e-08, + "logits/chosen": -1.06082284450531, + "logits/rejected": -1.0715018510818481, + "logps/chosen": -0.45035964250564575, + "logps/rejected": -0.49578312039375305, + "loss": 1.5399, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.125899076461792, + "rewards/margins": 0.11355876922607422, + "rewards/rejected": -1.2394579648971558, + "step": 370 + }, + { + "epoch": 0.7929468340903019, + "grad_norm": 12.731304168701172, + "learning_rate": 6.173213349909728e-08, + "logits/chosen": -1.1646618843078613, + "logits/rejected": -1.0550154447555542, + "logps/chosen": -0.45987626910209656, + "logps/rejected": -0.6932123303413391, + "loss": 1.4858, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1496906280517578, + "rewards/margins": 0.583340048789978, + "rewards/rejected": -1.7330307960510254, + "step": 371 + }, + { + "epoch": 0.7950841570932408, + "grad_norm": 4.002368927001953, + "learning_rate": 6.050706921363672e-08, + "logits/chosen": -1.1935923099517822, + "logits/rejected": -1.2162138223648071, + "logps/chosen": -0.39056870341300964, + "logps/rejected": -0.5482085943222046, + "loss": 1.472, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9764216542243958, + "rewards/margins": 0.39409980177879333, + "rewards/rejected": -1.3705215454101562, + "step": 372 + }, + { + "epoch": 0.7972214800961795, + "grad_norm": 14.192221641540527, + "learning_rate": 5.929260703443337e-08, + "logits/chosen": -0.7887646555900574, + "logits/rejected": -0.8968151211738586, + "logps/chosen": -0.3553355634212494, + "logps/rejected": -0.43609827756881714, + "loss": 1.5724, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8883388638496399, + "rewards/margins": 0.20190683007240295, + "rewards/rejected": -1.0902457237243652, + "step": 373 + }, + { + "epoch": 0.7993588030991183, + "grad_norm": 14.193215370178223, + "learning_rate": 5.808881491049722e-08, + "logits/chosen": -1.265504240989685, + "logits/rejected": -1.3089518547058105, + "logps/chosen": -0.5325222611427307, + "logps/rejected": -0.530707836151123, + "loss": 1.5907, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.331305742263794, + "rewards/margins": -0.004536189138889313, + "rewards/rejected": -1.3267695903778076, + "step": 374 + }, + { + "epoch": 0.8014961261020572, + "grad_norm": 8.60618782043457, + "learning_rate": 5.6895760193850145e-08, + "logits/chosen": -1.0075315237045288, + "logits/rejected": -1.0355682373046875, + "logps/chosen": -0.4078059792518616, + "logps/rejected": -0.6130856275558472, + "loss": 1.5543, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.019515037536621, + "rewards/margins": 0.5131990313529968, + "rewards/rejected": -1.5327140092849731, + "step": 375 + }, + { + "epoch": 0.803633449104996, + "grad_norm": 8.175145149230957, + "learning_rate": 5.571350963575727e-08, + "logits/chosen": -1.1598341464996338, + "logits/rejected": -0.9834379553794861, + "logps/chosen": -0.4177122414112091, + "logps/rejected": -0.35740387439727783, + "loss": 1.5343, + "rewards/accuracies": 0.1875, + "rewards/chosen": -1.0442806482315063, + "rewards/margins": -0.1507708877325058, + "rewards/rejected": -0.8935096859931946, + "step": 376 + }, + { + "epoch": 0.8057707721079348, + "grad_norm": 20.987957000732422, + "learning_rate": 5.454212938299255e-08, + "logits/chosen": -1.140153169631958, + "logits/rejected": -1.0446147918701172, + "logps/chosen": -0.5767669677734375, + "logps/rejected": -0.4474826455116272, + "loss": 1.5854, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.4419174194335938, + "rewards/margins": -0.32321077585220337, + "rewards/rejected": -1.1187067031860352, + "step": 377 + }, + { + "epoch": 0.8079080951108736, + "grad_norm": 7.252152919769287, + "learning_rate": 5.338168497413756e-08, + "logits/chosen": -1.1144384145736694, + "logits/rejected": -1.2679189443588257, + "logps/chosen": -0.3241596817970276, + "logps/rejected": -0.5525774955749512, + "loss": 1.5213, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8103991150856018, + "rewards/margins": 0.5710446238517761, + "rewards/rejected": -1.3814438581466675, + "step": 378 + }, + { + "epoch": 0.8100454181138125, + "grad_norm": 5.948306083679199, + "learning_rate": 5.223224133591475e-08, + "logits/chosen": -1.212296724319458, + "logits/rejected": -1.0987417697906494, + "logps/chosen": -0.7626843452453613, + "logps/rejected": -1.2524211406707764, + "loss": 1.4664, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9067108631134033, + "rewards/margins": 1.2243417501449585, + "rewards/rejected": -3.1310529708862305, + "step": 379 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 5.803626537322998, + "learning_rate": 5.109386277955477e-08, + "logits/chosen": -1.1813595294952393, + "logits/rejected": -1.1350992918014526, + "logps/chosen": -0.45435211062431335, + "logps/rejected": -0.5456478595733643, + "loss": 1.4602, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1358802318572998, + "rewards/margins": 0.22823941707611084, + "rewards/rejected": -1.364119529724121, + "step": 380 + }, + { + "epoch": 0.81432006411969, + "grad_norm": 4.112217903137207, + "learning_rate": 4.996661299719845e-08, + "logits/chosen": -0.8679373264312744, + "logits/rejected": -0.8792969584465027, + "logps/chosen": -0.4242730736732483, + "logps/rejected": -0.7051547765731812, + "loss": 1.5246, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.060682773590088, + "rewards/margins": 0.7022043466567993, + "rewards/rejected": -1.7628870010375977, + "step": 381 + }, + { + "epoch": 0.8164573871226289, + "grad_norm": 10.623425483703613, + "learning_rate": 4.885055505833291e-08, + "logits/chosen": -1.2642873525619507, + "logits/rejected": -1.189084768295288, + "logps/chosen": -0.4796138405799866, + "logps/rejected": -0.5438456535339355, + "loss": 1.5916, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.199034571647644, + "rewards/margins": 0.16057954728603363, + "rewards/rejected": -1.3596141338348389, + "step": 382 + }, + { + "epoch": 0.8185947101255677, + "grad_norm": 5.901862144470215, + "learning_rate": 4.774575140626316e-08, + "logits/chosen": -1.0387252569198608, + "logits/rejected": -0.9353397488594055, + "logps/chosen": -0.43210557103157043, + "logps/rejected": -0.474994421005249, + "loss": 1.4878, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0802638530731201, + "rewards/margins": 0.1072220653295517, + "rewards/rejected": -1.187485933303833, + "step": 383 + }, + { + "epoch": 0.8207320331285065, + "grad_norm": 10.615392684936523, + "learning_rate": 4.6652263854618016e-08, + "logits/chosen": -1.2227771282196045, + "logits/rejected": -1.2380874156951904, + "logps/chosen": -0.42436325550079346, + "logps/rejected": -0.6415535807609558, + "loss": 1.4763, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0609081983566284, + "rewards/margins": 0.5429757833480835, + "rewards/rejected": -1.6038841009140015, + "step": 384 + }, + { + "epoch": 0.8228693561314454, + "grad_norm": 7.720395565032959, + "learning_rate": 4.557015358389216e-08, + "logits/chosen": -0.9876857995986938, + "logits/rejected": -1.0623700618743896, + "logps/chosen": -0.37730199098587036, + "logps/rejected": -0.587026059627533, + "loss": 1.4647, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9432549476623535, + "rewards/margins": 0.5243102312088013, + "rewards/rejected": -1.4675650596618652, + "step": 385 + }, + { + "epoch": 0.8250066791343842, + "grad_norm": 8.550374984741211, + "learning_rate": 4.449948113802254e-08, + "logits/chosen": -1.203737735748291, + "logits/rejected": -1.1764881610870361, + "logps/chosen": -0.33095237612724304, + "logps/rejected": -0.3797753155231476, + "loss": 1.5067, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.827380895614624, + "rewards/margins": 0.12205736339092255, + "rewards/rejected": -0.9494383335113525, + "step": 386 + }, + { + "epoch": 0.827144002137323, + "grad_norm": 8.158326148986816, + "learning_rate": 4.3440306421001324e-08, + "logits/chosen": -1.0478994846343994, + "logits/rejected": -1.0275698900222778, + "logps/chosen": -0.4138807952404022, + "logps/rejected": -0.5256016254425049, + "loss": 1.4697, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0347020626068115, + "rewards/margins": 0.2793022096157074, + "rewards/rejected": -1.3140041828155518, + "step": 387 + }, + { + "epoch": 0.8292813251402619, + "grad_norm": 4.749565124511719, + "learning_rate": 4.2392688693524055e-08, + "logits/chosen": -1.0925198793411255, + "logits/rejected": -1.053438663482666, + "logps/chosen": -0.4893158972263336, + "logps/rejected": -0.7248774766921997, + "loss": 1.4731, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2232897281646729, + "rewards/margins": 0.5889038443565369, + "rewards/rejected": -1.8121936321258545, + "step": 388 + }, + { + "epoch": 0.8314186481432007, + "grad_norm": 17.479476928710938, + "learning_rate": 4.1356686569674335e-08, + "logits/chosen": -1.237557291984558, + "logits/rejected": -1.1833033561706543, + "logps/chosen": -0.6519225835800171, + "logps/rejected": -0.6998899579048157, + "loss": 1.5504, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6298065185546875, + "rewards/margins": 0.1199183464050293, + "rewards/rejected": -1.7497249841690063, + "step": 389 + }, + { + "epoch": 0.8335559711461394, + "grad_norm": 6.035606384277344, + "learning_rate": 4.0332358013644015e-08, + "logits/chosen": -1.16561758518219, + "logits/rejected": -1.1693965196609497, + "logps/chosen": -0.6768723726272583, + "logps/rejected": -1.003299593925476, + "loss": 1.4768, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.692180871963501, + "rewards/margins": 0.8160682916641235, + "rewards/rejected": -2.508248805999756, + "step": 390 + }, + { + "epoch": 0.8356932941490782, + "grad_norm": 6.278528213500977, + "learning_rate": 3.9319760336490205e-08, + "logits/chosen": -0.7999597191810608, + "logits/rejected": -0.6943086385726929, + "logps/chosen": -0.3784443140029907, + "logps/rejected": -0.4581376612186432, + "loss": 1.5857, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9461109042167664, + "rewards/margins": 0.1992333084344864, + "rewards/rejected": -1.1453441381454468, + "step": 391 + }, + { + "epoch": 0.8378306171520171, + "grad_norm": 4.925440788269043, + "learning_rate": 3.831895019292897e-08, + "logits/chosen": -1.1565301418304443, + "logits/rejected": -1.1625264883041382, + "logps/chosen": -0.5009636878967285, + "logps/rejected": -0.8770073056221008, + "loss": 1.4676, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2524092197418213, + "rewards/margins": 0.9401088953018188, + "rewards/rejected": -2.1925179958343506, + "step": 392 + }, + { + "epoch": 0.8399679401549559, + "grad_norm": 16.391368865966797, + "learning_rate": 3.732998357816514e-08, + "logits/chosen": -1.0616164207458496, + "logits/rejected": -1.0247597694396973, + "logps/chosen": -0.5045540928840637, + "logps/rejected": -0.467951238155365, + "loss": 1.6318, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.261385202407837, + "rewards/margins": -0.09150727093219757, + "rewards/rejected": -1.1698780059814453, + "step": 393 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 8.550324440002441, + "learning_rate": 3.635291582475963e-08, + "logits/chosen": -0.9775318503379822, + "logits/rejected": -0.8906686902046204, + "logps/chosen": -0.5394979119300842, + "logps/rejected": -0.5499922037124634, + "loss": 1.5031, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3487448692321777, + "rewards/margins": 0.02623593807220459, + "rewards/rejected": -1.3749808073043823, + "step": 394 + }, + { + "epoch": 0.8442425861608336, + "grad_norm": 4.05560302734375, + "learning_rate": 3.538780159953347e-08, + "logits/chosen": -1.0048534870147705, + "logits/rejected": -0.839844822883606, + "logps/chosen": -0.4153073728084564, + "logps/rejected": -0.35160398483276367, + "loss": 1.5959, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.0382684469223022, + "rewards/margins": -0.1592584103345871, + "rewards/rejected": -0.8790099620819092, + "step": 395 + }, + { + "epoch": 0.8463799091637724, + "grad_norm": 7.721834182739258, + "learning_rate": 3.4434694900509345e-08, + "logits/chosen": -1.1735496520996094, + "logits/rejected": -1.150696039199829, + "logps/chosen": -0.5112527012825012, + "logps/rejected": -0.6088312864303589, + "loss": 1.5405, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2781318426132202, + "rewards/margins": 0.24394652247428894, + "rewards/rejected": -1.522078275680542, + "step": 396 + }, + { + "epoch": 0.8485172321667112, + "grad_norm": 5.097099781036377, + "learning_rate": 3.349364905389032e-08, + "logits/chosen": -1.2030649185180664, + "logits/rejected": -1.08577561378479, + "logps/chosen": -0.47224241495132446, + "logps/rejected": -0.6983097791671753, + "loss": 1.5699, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1806060075759888, + "rewards/margins": 0.5651683807373047, + "rewards/rejected": -1.745774507522583, + "step": 397 + }, + { + "epoch": 0.85065455516965, + "grad_norm": 9.325733184814453, + "learning_rate": 3.256471671107616e-08, + "logits/chosen": -0.9340042471885681, + "logits/rejected": -0.946205198764801, + "logps/chosen": -0.7264373302459717, + "logps/rejected": -0.6860026121139526, + "loss": 1.5801, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.8160933256149292, + "rewards/margins": -0.1010868027806282, + "rewards/rejected": -1.7150065898895264, + "step": 398 + }, + { + "epoch": 0.8527918781725888, + "grad_norm": 4.532627105712891, + "learning_rate": 3.1647949845717585e-08, + "logits/chosen": -0.8892905712127686, + "logits/rejected": -0.8267837166786194, + "logps/chosen": -0.4368503987789154, + "logps/rejected": -0.5416733026504517, + "loss": 1.4184, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0921260118484497, + "rewards/margins": 0.2620573043823242, + "rewards/rejected": -1.3541834354400635, + "step": 399 + }, + { + "epoch": 0.8549292011755276, + "grad_norm": 3.611345052719116, + "learning_rate": 3.074339975080836e-08, + "logits/chosen": -0.9715834259986877, + "logits/rejected": -0.9480459690093994, + "logps/chosen": -0.685413122177124, + "logps/rejected": -0.7482943534851074, + "loss": 1.4719, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.71353280544281, + "rewards/margins": 0.15720298886299133, + "rewards/rejected": -1.870735764503479, + "step": 400 + }, + { + "epoch": 0.8570665241784665, + "grad_norm": 49.35193634033203, + "learning_rate": 2.98511170358155e-08, + "logits/chosen": -0.959058940410614, + "logits/rejected": -0.9538683295249939, + "logps/chosen": -0.43192583322525024, + "logps/rejected": -0.4624338746070862, + "loss": 1.6139, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0798146724700928, + "rewards/margins": 0.07627001404762268, + "rewards/rejected": -1.156084656715393, + "step": 401 + }, + { + "epoch": 0.8592038471814053, + "grad_norm": 11.570405006408691, + "learning_rate": 2.8971151623847584e-08, + "logits/chosen": -1.0528483390808105, + "logits/rejected": -0.9815914630889893, + "logps/chosen": -0.584007740020752, + "logps/rejected": -0.6078099608421326, + "loss": 1.5858, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4600192308425903, + "rewards/margins": 0.05950555205345154, + "rewards/rejected": -1.5195249319076538, + "step": 402 + }, + { + "epoch": 0.8613411701843441, + "grad_norm": 5.403092861175537, + "learning_rate": 2.8103552748861475e-08, + "logits/chosen": -1.052750825881958, + "logits/rejected": -1.0424790382385254, + "logps/chosen": -0.6250475645065308, + "logps/rejected": -0.6444798111915588, + "loss": 1.6129, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5626189708709717, + "rewards/margins": 0.04858069866895676, + "rewards/rejected": -1.6111997365951538, + "step": 403 + }, + { + "epoch": 0.863478493187283, + "grad_norm": 5.894848346710205, + "learning_rate": 2.724836895290805e-08, + "logits/chosen": -1.0178946256637573, + "logits/rejected": -0.8687437772750854, + "logps/chosen": -0.36898887157440186, + "logps/rejected": -0.7522455453872681, + "loss": 1.5337, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9224721789360046, + "rewards/margins": 0.958141565322876, + "rewards/rejected": -1.8806138038635254, + "step": 404 + }, + { + "epoch": 0.8656158161902218, + "grad_norm": 6.724638938903809, + "learning_rate": 2.6405648083415833e-08, + "logits/chosen": -1.1396384239196777, + "logits/rejected": -1.0131150484085083, + "logps/chosen": -0.6166858673095703, + "logps/rejected": -0.5131340026855469, + "loss": 1.5427, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.5417147874832153, + "rewards/margins": -0.258879691362381, + "rewards/rejected": -1.2828351259231567, + "step": 405 + }, + { + "epoch": 0.8677531391931605, + "grad_norm": 4.763136863708496, + "learning_rate": 2.55754372905142e-08, + "logits/chosen": -1.1291677951812744, + "logits/rejected": -1.0440433025360107, + "logps/chosen": -0.44505226612091064, + "logps/rejected": -0.46291491389274597, + "loss": 1.499, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1126307249069214, + "rewards/margins": 0.04465658590197563, + "rewards/rejected": -1.157287359237671, + "step": 406 + }, + { + "epoch": 0.8698904621960993, + "grad_norm": 5.994953155517578, + "learning_rate": 2.475778302439524e-08, + "logits/chosen": -1.0867843627929688, + "logits/rejected": -1.1085426807403564, + "logps/chosen": -0.6850754022598267, + "logps/rejected": -0.8681538105010986, + "loss": 1.54, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7126885652542114, + "rewards/margins": 0.4576959013938904, + "rewards/rejected": -2.170384645462036, + "step": 407 + }, + { + "epoch": 0.8720277851990382, + "grad_norm": 4.274337291717529, + "learning_rate": 2.3952731032714973e-08, + "logits/chosen": -0.8507957458496094, + "logits/rejected": -0.8216973543167114, + "logps/chosen": -0.352754145860672, + "logps/rejected": -0.6458288431167603, + "loss": 1.4806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8818854093551636, + "rewards/margins": 0.7326868176460266, + "rewards/rejected": -1.6145721673965454, + "step": 408 + }, + { + "epoch": 0.874165108201977, + "grad_norm": 19.687917709350586, + "learning_rate": 2.3160326358033778e-08, + "logits/chosen": -1.0179362297058105, + "logits/rejected": -0.9422796964645386, + "logps/chosen": -0.6053561568260193, + "logps/rejected": -1.013503074645996, + "loss": 1.4768, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5133905410766602, + "rewards/margins": 1.0203672647476196, + "rewards/rejected": -2.5337576866149902, + "step": 409 + }, + { + "epoch": 0.8763024312049158, + "grad_norm": 12.98466968536377, + "learning_rate": 2.2380613335296033e-08, + "logits/chosen": -0.8576774597167969, + "logits/rejected": -0.9601131677627563, + "logps/chosen": -0.42103275656700134, + "logps/rejected": -0.41525495052337646, + "loss": 1.5959, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0525819063186646, + "rewards/margins": -0.014444507658481598, + "rewards/rejected": -1.0381373167037964, + "step": 410 + }, + { + "epoch": 0.8784397542078547, + "grad_norm": 7.204395294189453, + "learning_rate": 2.1613635589349756e-08, + "logits/chosen": -0.9204460978507996, + "logits/rejected": -0.9429717063903809, + "logps/chosen": -0.3754885196685791, + "logps/rejected": -0.4179832935333252, + "loss": 1.5211, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9387211799621582, + "rewards/margins": 0.10623697191476822, + "rewards/rejected": -1.0449581146240234, + "step": 411 + }, + { + "epoch": 0.8805770772107935, + "grad_norm": 14.507763862609863, + "learning_rate": 2.085943603250595e-08, + "logits/chosen": -0.9056552648544312, + "logits/rejected": -0.8666099309921265, + "logps/chosen": -0.4410094618797302, + "logps/rejected": -0.6149858236312866, + "loss": 1.5065, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1025235652923584, + "rewards/margins": 0.43494072556495667, + "rewards/rejected": -1.5374643802642822, + "step": 412 + }, + { + "epoch": 0.8827144002137323, + "grad_norm": 4.710812568664551, + "learning_rate": 2.0118056862137354e-08, + "logits/chosen": -0.9734061360359192, + "logits/rejected": -0.8919450044631958, + "logps/chosen": -0.4355347156524658, + "logps/rejected": -0.4121510982513428, + "loss": 1.5909, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0888367891311646, + "rewards/margins": -0.058458905667066574, + "rewards/rejected": -1.030377745628357, + "step": 413 + }, + { + "epoch": 0.8848517232166712, + "grad_norm": 6.681394100189209, + "learning_rate": 1.938953955831771e-08, + "logits/chosen": -1.089871883392334, + "logits/rejected": -1.0652118921279907, + "logps/chosen": -0.45682665705680847, + "logps/rejected": -0.5210414528846741, + "loss": 1.4889, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1420665979385376, + "rewards/margins": 0.1605370044708252, + "rewards/rejected": -1.3026037216186523, + "step": 414 + }, + { + "epoch": 0.88698904621961, + "grad_norm": 10.179585456848145, + "learning_rate": 1.8673924881500823e-08, + "logits/chosen": -0.9976410269737244, + "logits/rejected": -1.0120102167129517, + "logps/chosen": -0.7260380387306213, + "logps/rejected": -0.9380686283111572, + "loss": 1.5325, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8150951862335205, + "rewards/margins": 0.5300765037536621, + "rewards/rejected": -2.3451716899871826, + "step": 415 + }, + { + "epoch": 0.8891263692225487, + "grad_norm": 6.529333591461182, + "learning_rate": 1.797125287024029e-08, + "logits/chosen": -1.0898345708847046, + "logits/rejected": -1.1249827146530151, + "logps/chosen": -0.5659042596817017, + "logps/rejected": -0.8098978996276855, + "loss": 1.4598, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.414760947227478, + "rewards/margins": 0.6099839210510254, + "rewards/rejected": -2.024744749069214, + "step": 416 + }, + { + "epoch": 0.8912636922254876, + "grad_norm": 18.293628692626953, + "learning_rate": 1.7281562838948966e-08, + "logits/chosen": -0.9206041693687439, + "logits/rejected": -0.9363532066345215, + "logps/chosen": -0.6177800893783569, + "logps/rejected": -0.5964070558547974, + "loss": 1.6686, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.5444501638412476, + "rewards/margins": -0.05343271791934967, + "rewards/rejected": -1.4910173416137695, + "step": 417 + }, + { + "epoch": 0.8934010152284264, + "grad_norm": 6.754497528076172, + "learning_rate": 1.6604893375699592e-08, + "logits/chosen": -1.1073287725448608, + "logits/rejected": -0.9858848452568054, + "logps/chosen": -0.4562823176383972, + "logps/rejected": -0.5044858455657959, + "loss": 1.5667, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1407058238983154, + "rewards/margins": 0.12050891667604446, + "rewards/rejected": -1.2612147331237793, + "step": 418 + }, + { + "epoch": 0.8955383382313652, + "grad_norm": 4.217410087585449, + "learning_rate": 1.5941282340065697e-08, + "logits/chosen": -1.1437349319458008, + "logits/rejected": -1.2322208881378174, + "logps/chosen": -0.4406435191631317, + "logps/rejected": -0.6784199476242065, + "loss": 1.4618, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1016088724136353, + "rewards/margins": 0.5944410562515259, + "rewards/rejected": -1.6960498094558716, + "step": 419 + }, + { + "epoch": 0.897675661234304, + "grad_norm": 16.80787467956543, + "learning_rate": 1.5290766861003475e-08, + "logits/chosen": -0.9160170555114746, + "logits/rejected": -0.8729619383811951, + "logps/chosen": -0.3433056175708771, + "logps/rejected": -0.37045544385910034, + "loss": 1.6206, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.8582640290260315, + "rewards/margins": 0.06787460297346115, + "rewards/rejected": -0.926138699054718, + "step": 420 + }, + { + "epoch": 0.8998129842372429, + "grad_norm": 13.28708553314209, + "learning_rate": 1.4653383334774228e-08, + "logits/chosen": -1.0449622869491577, + "logits/rejected": -1.0788357257843018, + "logps/chosen": -0.5669997930526733, + "logps/rejected": -0.7596959471702576, + "loss": 1.5144, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4174995422363281, + "rewards/margins": 0.4817403554916382, + "rewards/rejected": -1.8992400169372559, + "step": 421 + }, + { + "epoch": 0.9019503072401817, + "grad_norm": 7.049681663513184, + "learning_rate": 1.4029167422908105e-08, + "logits/chosen": -1.148837924003601, + "logits/rejected": -1.0951889753341675, + "logps/chosen": -0.4990730583667755, + "logps/rejected": -0.607469916343689, + "loss": 1.5152, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2476826906204224, + "rewards/margins": 0.2709922194480896, + "rewards/rejected": -1.5186748504638672, + "step": 422 + }, + { + "epoch": 0.9040876302431204, + "grad_norm": 6.772598743438721, + "learning_rate": 1.3418154050208936e-08, + "logits/chosen": -0.9771215319633484, + "logits/rejected": -0.9951186776161194, + "logps/chosen": -0.5036316514015198, + "logps/rejected": -0.6173194646835327, + "loss": 1.5277, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2590792179107666, + "rewards/margins": 0.28421932458877563, + "rewards/rejected": -1.543298602104187, + "step": 423 + }, + { + "epoch": 0.9062249532460593, + "grad_norm": 5.654405117034912, + "learning_rate": 1.2820377402800064e-08, + "logits/chosen": -0.8762426972389221, + "logits/rejected": -0.6818346381187439, + "logps/chosen": -0.4185902774333954, + "logps/rejected": -0.9549139738082886, + "loss": 1.4405, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0464756488800049, + "rewards/margins": 1.3408091068267822, + "rewards/rejected": -2.387284755706787, + "step": 424 + }, + { + "epoch": 0.9083622762489981, + "grad_norm": 9.220845222473145, + "learning_rate": 1.2235870926211616e-08, + "logits/chosen": -0.9336291551589966, + "logits/rejected": -0.9068048596382141, + "logps/chosen": -0.4764101505279541, + "logps/rejected": -0.6432890295982361, + "loss": 1.5286, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1910252571105957, + "rewards/margins": 0.41719722747802734, + "rewards/rejected": -1.608222484588623, + "step": 425 + }, + { + "epoch": 0.9104995992519369, + "grad_norm": 6.728222846984863, + "learning_rate": 1.1664667323509347e-08, + "logits/chosen": -1.0641913414001465, + "logits/rejected": -0.9187748432159424, + "logps/chosen": -0.3940516710281372, + "logps/rejected": -0.4065035283565521, + "loss": 1.5201, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.985129177570343, + "rewards/margins": 0.031129609793424606, + "rewards/rejected": -1.0162588357925415, + "step": 426 + }, + { + "epoch": 0.9126369222548758, + "grad_norm": 7.619234561920166, + "learning_rate": 1.1106798553464802e-08, + "logits/chosen": -0.954318642616272, + "logits/rejected": -0.908359706401825, + "logps/chosen": -0.4075399935245514, + "logps/rejected": -0.4785918891429901, + "loss": 1.5005, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0188499689102173, + "rewards/margins": 0.17762985825538635, + "rewards/rejected": -1.1964799165725708, + "step": 427 + }, + { + "epoch": 0.9147742452578146, + "grad_norm": 6.409261226654053, + "learning_rate": 1.0562295828767387e-08, + "logits/chosen": -1.0399943590164185, + "logits/rejected": -1.0340969562530518, + "logps/chosen": -0.3984678387641907, + "logps/rejected": -0.5552449226379395, + "loss": 1.4632, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9961696863174438, + "rewards/margins": 0.39194267988204956, + "rewards/rejected": -1.3881123065948486, + "step": 428 + }, + { + "epoch": 0.9169115682607534, + "grad_norm": 10.060012817382812, + "learning_rate": 1.0031189614277763e-08, + "logits/chosen": -0.9579813480377197, + "logits/rejected": -0.9383722543716431, + "logps/chosen": -0.5487989187240601, + "logps/rejected": -0.5818829536437988, + "loss": 1.5268, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.371997356414795, + "rewards/margins": 0.0827101320028305, + "rewards/rejected": -1.4547075033187866, + "step": 429 + }, + { + "epoch": 0.9190488912636923, + "grad_norm": 6.2424139976501465, + "learning_rate": 9.513509625323518e-09, + "logits/chosen": -0.9296804666519165, + "logits/rejected": -0.9260187745094299, + "logps/chosen": -0.39111167192459106, + "logps/rejected": -0.45873600244522095, + "loss": 1.4814, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9777792096138, + "rewards/margins": 0.16906076669692993, + "rewards/rejected": -1.14683997631073, + "step": 430 + }, + { + "epoch": 0.921186214266631, + "grad_norm": 10.799667358398438, + "learning_rate": 9.009284826036689e-09, + "logits/chosen": -0.892406702041626, + "logits/rejected": -0.9241263270378113, + "logps/chosen": -0.5018429160118103, + "logps/rejected": -0.6716207265853882, + "loss": 1.4844, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2546073198318481, + "rewards/margins": 0.42444440722465515, + "rewards/rejected": -1.6790517568588257, + "step": 431 + }, + { + "epoch": 0.9233235372695698, + "grad_norm": 4.4624104499816895, + "learning_rate": 8.518543427732949e-09, + "logits/chosen": -0.9802000522613525, + "logits/rejected": -0.9559367895126343, + "logps/chosen": -0.38735339045524597, + "logps/rejected": -0.6876262426376343, + "loss": 1.481, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9683833718299866, + "rewards/margins": 0.7506821155548096, + "rewards/rejected": -1.7190656661987305, + "step": 432 + }, + { + "epoch": 0.9254608602725087, + "grad_norm": 7.951170444488525, + "learning_rate": 8.041312887333396e-09, + "logits/chosen": -0.9957330226898193, + "logits/rejected": -0.9468004703521729, + "logps/chosen": -0.4312437176704407, + "logps/rejected": -0.5361820459365845, + "loss": 1.4571, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0781091451644897, + "rewards/margins": 0.26234593987464905, + "rewards/rejected": -1.3404550552368164, + "step": 433 + }, + { + "epoch": 0.9275981832754475, + "grad_norm": 12.679637908935547, + "learning_rate": 7.577619905828281e-09, + "logits/chosen": -1.0287508964538574, + "logits/rejected": -0.9482178688049316, + "logps/chosen": -0.40496230125427246, + "logps/rejected": -0.38896051049232483, + "loss": 1.4673, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0124057531356812, + "rewards/margins": -0.040004514157772064, + "rewards/rejected": -0.9724011421203613, + "step": 434 + }, + { + "epoch": 0.9297355062783863, + "grad_norm": 6.805285930633545, + "learning_rate": 7.127490426783123e-09, + "logits/chosen": -1.1413686275482178, + "logits/rejected": -1.09022057056427, + "logps/chosen": -0.5725710391998291, + "logps/rejected": -0.6725842952728271, + "loss": 1.5341, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.4314275979995728, + "rewards/margins": 0.2500333786010742, + "rewards/rejected": -1.6814608573913574, + "step": 435 + }, + { + "epoch": 0.9318728292813251, + "grad_norm": 5.759010314941406, + "learning_rate": 6.6909496348871445e-09, + "logits/chosen": -1.1474663019180298, + "logits/rejected": -1.1745803356170654, + "logps/chosen": -0.6815481185913086, + "logps/rejected": -0.7699655294418335, + "loss": 1.5144, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7038702964782715, + "rewards/margins": 0.2210434526205063, + "rewards/rejected": -1.9249136447906494, + "step": 436 + }, + { + "epoch": 0.934010152284264, + "grad_norm": 4.481965065002441, + "learning_rate": 6.268021954544095e-09, + "logits/chosen": -0.9412963390350342, + "logits/rejected": -0.9516785144805908, + "logps/chosen": -0.3796120285987854, + "logps/rejected": -0.37879857420921326, + "loss": 1.5547, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9490300416946411, + "rewards/margins": -0.002033662050962448, + "rewards/rejected": -0.9469964504241943, + "step": 437 + }, + { + "epoch": 0.9361474752872028, + "grad_norm": 6.431197166442871, + "learning_rate": 5.858731048505927e-09, + "logits/chosen": -1.076103687286377, + "logits/rejected": -1.0976781845092773, + "logps/chosen": -0.4042336046695709, + "logps/rejected": -0.612984836101532, + "loss": 1.4394, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0105839967727661, + "rewards/margins": 0.5218778848648071, + "rewards/rejected": -1.5324620008468628, + "step": 438 + }, + { + "epoch": 0.9382847982901416, + "grad_norm": 11.778403282165527, + "learning_rate": 5.463099816548577e-09, + "logits/chosen": -1.085112452507019, + "logits/rejected": -1.024551272392273, + "logps/chosen": -0.3431437611579895, + "logps/rejected": -0.5487081408500671, + "loss": 1.5601, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8578594923019409, + "rewards/margins": 0.5139108896255493, + "rewards/rejected": -1.3717702627182007, + "step": 439 + }, + { + "epoch": 0.9404221212930804, + "grad_norm": 4.792006492614746, + "learning_rate": 5.08115039419113e-09, + "logits/chosen": -0.9755135774612427, + "logits/rejected": -0.8824871778488159, + "logps/chosen": -0.3806512653827667, + "logps/rejected": -0.5343747735023499, + "loss": 1.52, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9516281485557556, + "rewards/margins": 0.38430866599082947, + "rewards/rejected": -1.3359367847442627, + "step": 440 + }, + { + "epoch": 0.9425594442960192, + "grad_norm": 6.083741188049316, + "learning_rate": 4.712904151456864e-09, + "logits/chosen": -0.9559481143951416, + "logits/rejected": -0.8965126872062683, + "logps/chosen": -0.4216436743736267, + "logps/rejected": -0.4634767770767212, + "loss": 1.4695, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0541093349456787, + "rewards/margins": 0.1045827567577362, + "rewards/rejected": -1.1586920022964478, + "step": 441 + }, + { + "epoch": 0.944696767298958, + "grad_norm": 4.866091251373291, + "learning_rate": 4.358381691677931e-09, + "logits/chosen": -0.9368703961372375, + "logits/rejected": -0.8826941251754761, + "logps/chosen": -0.3301496207714081, + "logps/rejected": -0.38065895438194275, + "loss": 1.4942, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8253740072250366, + "rewards/margins": 0.1262734830379486, + "rewards/rejected": -0.9516474008560181, + "step": 442 + }, + { + "epoch": 0.9468340903018969, + "grad_norm": 7.398251056671143, + "learning_rate": 4.0176028503425826e-09, + "logits/chosen": -1.079075574874878, + "logits/rejected": -1.00128972530365, + "logps/chosen": -0.45567309856414795, + "logps/rejected": -0.39884287118911743, + "loss": 1.5388, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.1391828060150146, + "rewards/margins": -0.14207565784454346, + "rewards/rejected": -0.9971071481704712, + "step": 443 + }, + { + "epoch": 0.9489714133048357, + "grad_norm": 9.869709014892578, + "learning_rate": 3.6905866939851983e-09, + "logits/chosen": -1.1050983667373657, + "logits/rejected": -1.0123190879821777, + "logps/chosen": -0.44024163484573364, + "logps/rejected": -0.3926200568675995, + "loss": 1.4883, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1006040573120117, + "rewards/margins": -0.1190539002418518, + "rewards/rejected": -0.9815501570701599, + "step": 444 + }, + { + "epoch": 0.9511087363077745, + "grad_norm": 4.558749198913574, + "learning_rate": 3.3773515191196646e-09, + "logits/chosen": -1.0600441694259644, + "logits/rejected": -1.0637009143829346, + "logps/chosen": -0.4824512004852295, + "logps/rejected": -0.594225287437439, + "loss": 1.5854, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2061281204223633, + "rewards/margins": 0.2794351577758789, + "rewards/rejected": -1.4855631589889526, + "step": 445 + }, + { + "epoch": 0.9532460593107134, + "grad_norm": 7.49786376953125, + "learning_rate": 3.077914851215585e-09, + "logits/chosen": -1.06005859375, + "logits/rejected": -1.0157767534255981, + "logps/chosen": -0.5099815130233765, + "logps/rejected": -0.6151185035705566, + "loss": 1.4851, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.274953842163086, + "rewards/margins": 0.26284244656562805, + "rewards/rejected": -1.5377962589263916, + "step": 446 + }, + { + "epoch": 0.9553833823136522, + "grad_norm": 5.8320746421813965, + "learning_rate": 2.7922934437178692e-09, + "logits/chosen": -0.8938602209091187, + "logits/rejected": -0.9562631249427795, + "logps/chosen": -0.37319353222846985, + "logps/rejected": -0.37881189584732056, + "loss": 1.4395, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9329838752746582, + "rewards/margins": 0.0140459556132555, + "rewards/rejected": -0.9470298290252686, + "step": 447 + }, + { + "epoch": 0.957520705316591, + "grad_norm": 3.9899652004241943, + "learning_rate": 2.5205032771092592e-09, + "logits/chosen": -0.9867920279502869, + "logits/rejected": -0.9630347490310669, + "logps/chosen": -0.39268139004707336, + "logps/rejected": -0.635047435760498, + "loss": 1.5503, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9817034006118774, + "rewards/margins": 0.6059151291847229, + "rewards/rejected": -1.5876185894012451, + "step": 448 + }, + { + "epoch": 0.9596580283195298, + "grad_norm": 7.979334354400635, + "learning_rate": 2.2625595580163247e-09, + "logits/chosen": -1.1002980470657349, + "logits/rejected": -1.0980544090270996, + "logps/chosen": -0.7607989311218262, + "logps/rejected": -0.9997137784957886, + "loss": 1.5401, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9019973278045654, + "rewards/margins": 0.5972872972488403, + "rewards/rejected": -2.4992847442626953, + "step": 449 + }, + { + "epoch": 0.9617953513224686, + "grad_norm": 21.347997665405273, + "learning_rate": 2.0184767183584474e-09, + "logits/chosen": -0.8544862866401672, + "logits/rejected": -0.8984670042991638, + "logps/chosen": -0.5437809228897095, + "logps/rejected": -0.6033496856689453, + "loss": 1.4988, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.359452247619629, + "rewards/margins": 0.14892183244228363, + "rewards/rejected": -1.5083742141723633, + "step": 450 + }, + { + "epoch": 0.9639326743254074, + "grad_norm": 8.341211318969727, + "learning_rate": 1.7882684145406612e-09, + "logits/chosen": -1.0166016817092896, + "logits/rejected": -1.0338623523712158, + "logps/chosen": -0.5460320711135864, + "logps/rejected": -0.5169766545295715, + "loss": 1.5717, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3650802373886108, + "rewards/margins": -0.07263859361410141, + "rewards/rejected": -1.292441725730896, + "step": 451 + }, + { + "epoch": 0.9660699973283462, + "grad_norm": 12.869359970092773, + "learning_rate": 1.5719475266893489e-09, + "logits/chosen": -1.0716265439987183, + "logits/rejected": -1.0736249685287476, + "logps/chosen": -0.4328761696815491, + "logps/rejected": -0.5213409066200256, + "loss": 1.5382, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0821905136108398, + "rewards/margins": 0.22116179764270782, + "rewards/rejected": -1.3033523559570312, + "step": 452 + }, + { + "epoch": 0.9682073203312851, + "grad_norm": 5.998004913330078, + "learning_rate": 1.3695261579316775e-09, + "logits/chosen": -1.1411190032958984, + "logits/rejected": -1.0635725259780884, + "logps/chosen": -0.6178188323974609, + "logps/rejected": -0.592136561870575, + "loss": 1.6393, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.5445469617843628, + "rewards/margins": -0.064205601811409, + "rewards/rejected": -1.4803414344787598, + "step": 453 + }, + { + "epoch": 0.9703446433342239, + "grad_norm": 7.896289348602295, + "learning_rate": 1.1810156337183908e-09, + "logits/chosen": -1.009376883506775, + "logits/rejected": -0.9895673394203186, + "logps/chosen": -0.7131325602531433, + "logps/rejected": -0.6051285266876221, + "loss": 1.5398, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7828314304351807, + "rewards/margins": -0.2700101137161255, + "rewards/rejected": -1.5128213167190552, + "step": 454 + }, + { + "epoch": 0.9724819663371627, + "grad_norm": 6.323320388793945, + "learning_rate": 1.0064265011902328e-09, + "logits/chosen": -1.0121572017669678, + "logits/rejected": -0.9521965980529785, + "logps/chosen": -0.5518695712089539, + "logps/rejected": -0.5250836610794067, + "loss": 1.5786, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.379673957824707, + "rewards/margins": -0.06696499139070511, + "rewards/rejected": -1.312708854675293, + "step": 455 + }, + { + "epoch": 0.9746192893401016, + "grad_norm": 6.093278408050537, + "learning_rate": 8.457685285878091e-10, + "logits/chosen": -0.9040694236755371, + "logits/rejected": -1.0100067853927612, + "logps/chosen": -0.6011037230491638, + "logps/rejected": -0.9459134340286255, + "loss": 1.4765, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.502759337425232, + "rewards/margins": 0.8620242476463318, + "rewards/rejected": -2.364783525466919, + "step": 456 + }, + { + "epoch": 0.9767566123430403, + "grad_norm": 7.644316673278809, + "learning_rate": 6.990507047049676e-10, + "logits/chosen": -1.1642239093780518, + "logits/rejected": -1.327940583229065, + "logps/chosen": -0.7490012645721436, + "logps/rejected": -0.8202410340309143, + "loss": 1.6319, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.8725032806396484, + "rewards/margins": 0.1780991107225418, + "rewards/rejected": -2.0506021976470947, + "step": 457 + }, + { + "epoch": 0.9788939353459791, + "grad_norm": 4.500396728515625, + "learning_rate": 5.662812383859794e-10, + "logits/chosen": -1.057512640953064, + "logits/rejected": -1.0183889865875244, + "logps/chosen": -0.5618507862091064, + "logps/rejected": -0.7458251714706421, + "loss": 1.5519, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4046270847320557, + "rewards/margins": 0.4599360227584839, + "rewards/rejected": -1.86456298828125, + "step": 458 + }, + { + "epoch": 0.981031258348918, + "grad_norm": 5.931386470794678, + "learning_rate": 4.4746755806621126e-10, + "logits/chosen": -1.0571941137313843, + "logits/rejected": -1.1213597059249878, + "logps/chosen": -0.642679750919342, + "logps/rejected": -0.7841815948486328, + "loss": 1.4134, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6066994667053223, + "rewards/margins": 0.3537544906139374, + "rewards/rejected": -1.9604538679122925, + "step": 459 + }, + { + "epoch": 0.9831685813518568, + "grad_norm": 9.15275764465332, + "learning_rate": 3.4261631135654167e-10, + "logits/chosen": -0.8693954348564148, + "logits/rejected": -0.7475937008857727, + "logps/chosen": -0.3651605546474457, + "logps/rejected": -0.42595067620277405, + "loss": 1.4887, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9129014015197754, + "rewards/margins": 0.15197524428367615, + "rewards/rejected": -1.064876675605774, + "step": 460 + }, + { + "epoch": 0.9853059043547956, + "grad_norm": 11.78666877746582, + "learning_rate": 2.5173336467135263e-10, + "logits/chosen": -1.1248339414596558, + "logits/rejected": -1.013832688331604, + "logps/chosen": -0.48437923192977905, + "logps/rejected": -0.5153346657752991, + "loss": 1.487, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2109479904174805, + "rewards/margins": 0.07738865166902542, + "rewards/rejected": -1.2883366346359253, + "step": 461 + }, + { + "epoch": 0.9874432273577345, + "grad_norm": 5.332235336303711, + "learning_rate": 1.7482380290034792e-10, + "logits/chosen": -1.0759484767913818, + "logits/rejected": -0.9958518147468567, + "logps/chosen": -0.4139256775379181, + "logps/rejected": -0.7975391745567322, + "loss": 1.3972, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0348142385482788, + "rewards/margins": 0.9590335488319397, + "rewards/rejected": -1.9938479661941528, + "step": 462 + }, + { + "epoch": 0.9895805503606733, + "grad_norm": 5.443465709686279, + "learning_rate": 1.1189192912416933e-10, + "logits/chosen": -1.0934535264968872, + "logits/rejected": -1.0068289041519165, + "logps/chosen": -0.48808667063713074, + "logps/rejected": -0.6034483313560486, + "loss": 1.4738, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2202166318893433, + "rewards/margins": 0.2884041368961334, + "rewards/rejected": -1.5086207389831543, + "step": 463 + }, + { + "epoch": 0.9917178733636121, + "grad_norm": 5.87878942489624, + "learning_rate": 6.294126437336733e-11, + "logits/chosen": -1.0164133310317993, + "logits/rejected": -0.9699276685714722, + "logps/chosen": -0.4373021125793457, + "logps/rejected": -0.5335453152656555, + "loss": 1.4684, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0932552814483643, + "rewards/margins": 0.24060802161693573, + "rewards/rejected": -1.3338632583618164, + "step": 464 + }, + { + "epoch": 0.9938551963665508, + "grad_norm": 4.464337348937988, + "learning_rate": 2.797454743164174e-11, + "logits/chosen": -1.2042040824890137, + "logits/rejected": -1.055449366569519, + "logps/chosen": -0.3999456763267517, + "logps/rejected": -0.4777381420135498, + "loss": 1.5844, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9998641610145569, + "rewards/margins": 0.19448117911815643, + "rewards/rejected": -1.1943453550338745, + "step": 465 + }, + { + "epoch": 0.9959925193694897, + "grad_norm": 9.290578842163086, + "learning_rate": 6.993734682547714e-12, + "logits/chosen": -0.8978444933891296, + "logits/rejected": -0.8331681489944458, + "logps/chosen": -0.521608293056488, + "logps/rejected": -0.5517727136611938, + "loss": 1.5965, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.304020643234253, + "rewards/margins": 0.07541122287511826, + "rewards/rejected": -1.379431962966919, + "step": 466 + }, + { + "epoch": 0.9981298423724285, + "grad_norm": 12.642468452453613, + "learning_rate": 0.0, + "logits/chosen": -0.8587179183959961, + "logits/rejected": -0.8208239078521729, + "logps/chosen": -0.4356221556663513, + "logps/rejected": -0.39131855964660645, + "loss": 1.4854, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0890554189682007, + "rewards/margins": -0.11075909435749054, + "rewards/rejected": -0.9782962799072266, + "step": 467 + }, + { + "epoch": 0.9981298423724285, + "step": 467, + "total_flos": 0.0, + "train_loss": 0.0, + "train_runtime": 0.0036, + "train_samples_per_second": 16407823.488, + "train_steps_per_second": 127972.035 + } + ], + "logging_steps": 1, + "max_steps": 467, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 32, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}