{ "best_global_step": 804, "best_metric": 7.857303619384766, "best_model_checkpoint": "/tmp/svadugur/39821/length_change_preference-speaker=gemma-listener=pixtral_ft-length_conditioned=True-contexts=medium-39821/checkpoint-804", "epoch": 2.4282752120640905, "eval_steps": 67, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_logits/chosen": -2.1920204162597656, "eval_logits/rejected": -2.0768539905548096, "eval_logps/chosen": -59.404685974121094, "eval_logps/rejected": -86.27163696289062, "eval_loss": 1.0, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0, "eval_runtime": 707.6291, "eval_samples_per_second": 0.547, "eval_steps_per_second": 0.274, "step": 0 }, { "epoch": 0.0030160226201696515, "grad_norm": 3.3755955696105957, "learning_rate": 1e-06, "logits/chosen": -2.2553484439849854, "logits/rejected": -2.1206772327423096, "logps/chosen": -55.25015640258789, "logps/rejected": -79.54145050048828, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.006032045240339303, "grad_norm": 3.4193413257598877, "learning_rate": 9.996978851963746e-07, "logits/chosen": -2.215710163116455, "logits/rejected": -2.1025476455688477, "logps/chosen": -57.01054382324219, "logps/rejected": -78.93753814697266, "loss": 0.9976, "rewards/accuracies": 0.5, "rewards/chosen": -0.012324274517595768, "rewards/margins": 0.009727575816214085, "rewards/rejected": -0.022051848471164703, "step": 2 }, { "epoch": 0.009048067860508954, "grad_norm": 4.439186096191406, "learning_rate": 9.993957703927493e-07, "logits/chosen": -2.310152292251587, "logits/rejected": -2.113879442214966, "logps/chosen": -59.251182556152344, "logps/rejected": -81.71443176269531, "loss": 0.9994, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0029214618261903524, "rewards/margins": 0.002339194994419813, "rewards/rejected": -0.005260657053440809, "step": 3 }, { "epoch": 0.012064090480678606, "grad_norm": 3.5304365158081055, "learning_rate": 9.990936555891238e-07, "logits/chosen": -2.2443408966064453, "logits/rejected": -2.070103645324707, "logps/chosen": -57.5573616027832, "logps/rejected": -80.3204116821289, "loss": 0.998, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0019818777218461037, "rewards/margins": 0.007879888638854027, "rewards/rejected": -0.005898010451346636, "step": 4 }, { "epoch": 0.015080113100848256, "grad_norm": 3.93078875541687, "learning_rate": 9.987915407854984e-07, "logits/chosen": -2.339620351791382, "logits/rejected": -2.1479837894439697, "logps/chosen": -57.142738342285156, "logps/rejected": -84.37262725830078, "loss": 0.9984, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0019825752824544907, "rewards/margins": 0.00641482463106513, "rewards/rejected": -0.008397400379180908, "step": 5 }, { "epoch": 0.018096135721017908, "grad_norm": 3.276793956756592, "learning_rate": 9.98489425981873e-07, "logits/chosen": -2.2446699142456055, "logits/rejected": -2.116443157196045, "logps/chosen": -59.81292724609375, "logps/rejected": -81.53680419921875, "loss": 0.9935, "rewards/accuracies": 0.6875, "rewards/chosen": -0.005816043820232153, "rewards/margins": 0.025974761694669724, "rewards/rejected": -0.031790804117918015, "step": 6 }, { "epoch": 0.02111215834118756, "grad_norm": 4.204862594604492, "learning_rate": 9.981873111782478e-07, "logits/chosen": -2.3174211978912354, "logits/rejected": -2.236790657043457, "logps/chosen": -58.96713638305664, "logps/rejected": -83.80451202392578, "loss": 0.9997, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00938349962234497, "rewards/margins": 0.0012183538638055325, "rewards/rejected": -0.01060185395181179, "step": 7 }, { "epoch": 0.02412818096135721, "grad_norm": 3.800302743911743, "learning_rate": 9.978851963746222e-07, "logits/chosen": -2.32617449760437, "logits/rejected": -2.166128158569336, "logps/chosen": -56.27121353149414, "logps/rejected": -78.00035858154297, "loss": 1.0005, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0072879670187830925, "rewards/margins": -0.0019204637501388788, "rewards/rejected": -0.005367503967136145, "step": 8 }, { "epoch": 0.02714420358152686, "grad_norm": 3.7428598403930664, "learning_rate": 9.97583081570997e-07, "logits/chosen": -2.290562152862549, "logits/rejected": -2.0710337162017822, "logps/chosen": -56.069705963134766, "logps/rejected": -84.53810119628906, "loss": 0.9981, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0025999799836426973, "rewards/margins": 0.007504856679588556, "rewards/rejected": -0.00490487739443779, "step": 9 }, { "epoch": 0.030160226201696512, "grad_norm": 3.480499267578125, "learning_rate": 9.972809667673716e-07, "logits/chosen": -2.313340663909912, "logits/rejected": -2.1386704444885254, "logps/chosen": -57.392974853515625, "logps/rejected": -86.99319458007812, "loss": 0.9961, "rewards/accuracies": 0.53125, "rewards/chosen": -0.020646613091230392, "rewards/margins": 0.015718193724751472, "rewards/rejected": -0.036364804953336716, "step": 10 }, { "epoch": 0.033176248821866164, "grad_norm": 3.4964778423309326, "learning_rate": 9.969788519637462e-07, "logits/chosen": -2.2467234134674072, "logits/rejected": -2.0930910110473633, "logps/chosen": -55.06146240234375, "logps/rejected": -80.63308715820312, "loss": 1.0005, "rewards/accuracies": 0.5, "rewards/chosen": -0.009590602479875088, "rewards/margins": -0.0018975369166582823, "rewards/rejected": -0.007693065330386162, "step": 11 }, { "epoch": 0.036192271442035816, "grad_norm": 3.652778148651123, "learning_rate": 9.966767371601207e-07, "logits/chosen": -2.1944217681884766, "logits/rejected": -2.1146106719970703, "logps/chosen": -57.50394821166992, "logps/rejected": -81.46492004394531, "loss": 0.995, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0033359660301357508, "rewards/margins": 0.019905494526028633, "rewards/rejected": -0.016569532454013824, "step": 12 }, { "epoch": 0.03920829406220547, "grad_norm": 3.2849831581115723, "learning_rate": 9.963746223564954e-07, "logits/chosen": -2.3225655555725098, "logits/rejected": -2.1859824657440186, "logps/chosen": -59.93860626220703, "logps/rejected": -81.9677505493164, "loss": 0.9965, "rewards/accuracies": 0.625, "rewards/chosen": 0.00390021875500679, "rewards/margins": 0.013945544138550758, "rewards/rejected": -0.010045328177511692, "step": 13 }, { "epoch": 0.04222431668237512, "grad_norm": 34.31122970581055, "learning_rate": 9.9607250755287e-07, "logits/chosen": -2.2213492393493652, "logits/rejected": -2.0588982105255127, "logps/chosen": -55.313175201416016, "logps/rejected": -85.84236907958984, "loss": 1.0029, "rewards/accuracies": 0.40625, "rewards/chosen": -0.04658465087413788, "rewards/margins": -0.011807199567556381, "rewards/rejected": -0.0347774513065815, "step": 14 }, { "epoch": 0.04524033930254477, "grad_norm": 3.937756299972534, "learning_rate": 9.957703927492447e-07, "logits/chosen": -2.275630474090576, "logits/rejected": -2.0819785594940186, "logps/chosen": -57.64944839477539, "logps/rejected": -93.6537094116211, "loss": 1.0076, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0587458610534668, "rewards/margins": -0.03031480312347412, "rewards/rejected": -0.028431057929992676, "step": 15 }, { "epoch": 0.04825636192271442, "grad_norm": 3.7424278259277344, "learning_rate": 9.954682779456192e-07, "logits/chosen": -2.299224376678467, "logits/rejected": -2.1267309188842773, "logps/chosen": -51.32215881347656, "logps/rejected": -88.37657928466797, "loss": 0.9889, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03230787441134453, "rewards/margins": 0.04444975405931473, "rewards/rejected": -0.07675763219594955, "step": 16 }, { "epoch": 0.05127238454288407, "grad_norm": 3.4935691356658936, "learning_rate": 9.951661631419938e-07, "logits/chosen": -2.3003880977630615, "logits/rejected": -2.1235735416412354, "logps/chosen": -60.22377395629883, "logps/rejected": -86.99348449707031, "loss": 0.9978, "rewards/accuracies": 0.53125, "rewards/chosen": -0.04892918840050697, "rewards/margins": 0.008812141604721546, "rewards/rejected": -0.05774133652448654, "step": 17 }, { "epoch": 0.05428840716305372, "grad_norm": 3.902526617050171, "learning_rate": 9.948640483383685e-07, "logits/chosen": -2.255798101425171, "logits/rejected": -2.163147449493408, "logps/chosen": -56.51237487792969, "logps/rejected": -88.56672668457031, "loss": 0.9936, "rewards/accuracies": 0.65625, "rewards/chosen": -0.022262586280703545, "rewards/margins": 0.0258499626070261, "rewards/rejected": -0.048112548887729645, "step": 18 }, { "epoch": 0.05730442978322337, "grad_norm": 3.566249132156372, "learning_rate": 9.945619335347432e-07, "logits/chosen": -2.295321226119995, "logits/rejected": -2.150563955307007, "logps/chosen": -54.549652099609375, "logps/rejected": -83.310302734375, "loss": 0.993, "rewards/accuracies": 0.53125, "rewards/chosen": -0.021701108664274216, "rewards/margins": 0.02813052199780941, "rewards/rejected": -0.04983162879943848, "step": 19 }, { "epoch": 0.060320452403393024, "grad_norm": 3.681709051132202, "learning_rate": 9.942598187311179e-07, "logits/chosen": -2.2661325931549072, "logits/rejected": -2.1538147926330566, "logps/chosen": -55.111083984375, "logps/rejected": -82.09957885742188, "loss": 0.9871, "rewards/accuracies": 0.65625, "rewards/chosen": -0.009586215019226074, "rewards/margins": 0.05174358934164047, "rewards/rejected": -0.06132980436086655, "step": 20 }, { "epoch": 0.06333647502356268, "grad_norm": 3.6251463890075684, "learning_rate": 9.939577039274925e-07, "logits/chosen": -2.3157265186309814, "logits/rejected": -2.1768627166748047, "logps/chosen": -55.83133316040039, "logps/rejected": -85.19308471679688, "loss": 0.9934, "rewards/accuracies": 0.5, "rewards/chosen": -0.033530108630657196, "rewards/margins": 0.026398785412311554, "rewards/rejected": -0.05992889404296875, "step": 21 }, { "epoch": 0.06635249764373233, "grad_norm": 3.622174024581909, "learning_rate": 9.93655589123867e-07, "logits/chosen": -2.2395472526550293, "logits/rejected": -2.1148579120635986, "logps/chosen": -53.79273986816406, "logps/rejected": -79.97005462646484, "loss": 0.9816, "rewards/accuracies": 0.71875, "rewards/chosen": 0.003089225385338068, "rewards/margins": 0.07397472858428955, "rewards/rejected": -0.07088550180196762, "step": 22 }, { "epoch": 0.06936852026390197, "grad_norm": 3.6636030673980713, "learning_rate": 9.933534743202417e-07, "logits/chosen": -2.2952771186828613, "logits/rejected": -2.053246259689331, "logps/chosen": -54.40401840209961, "logps/rejected": -81.5823745727539, "loss": 0.9891, "rewards/accuracies": 0.5625, "rewards/chosen": -0.014181423932313919, "rewards/margins": 0.043855272233486176, "rewards/rejected": -0.058036696165800095, "step": 23 }, { "epoch": 0.07238454288407163, "grad_norm": 3.639965295791626, "learning_rate": 9.930513595166163e-07, "logits/chosen": -2.226130485534668, "logits/rejected": -2.133941650390625, "logps/chosen": -52.560455322265625, "logps/rejected": -77.84488677978516, "loss": 0.9885, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0181445125490427, "rewards/margins": 0.04636930674314499, "rewards/rejected": -0.06451381742954254, "step": 24 }, { "epoch": 0.07540056550424128, "grad_norm": 4.291306972503662, "learning_rate": 9.92749244712991e-07, "logits/chosen": -2.268420696258545, "logits/rejected": -2.1234636306762695, "logps/chosen": -56.94636917114258, "logps/rejected": -88.42636108398438, "loss": 0.976, "rewards/accuracies": 0.875, "rewards/chosen": -0.03346466273069382, "rewards/margins": 0.09632758796215057, "rewards/rejected": -0.1297922432422638, "step": 25 }, { "epoch": 0.07841658812441094, "grad_norm": 4.062901973724365, "learning_rate": 9.924471299093655e-07, "logits/chosen": -2.1932077407836914, "logits/rejected": -2.0747928619384766, "logps/chosen": -61.77953338623047, "logps/rejected": -89.220703125, "loss": 0.9923, "rewards/accuracies": 0.59375, "rewards/chosen": -0.049230560660362244, "rewards/margins": 0.030738815665245056, "rewards/rejected": -0.0799693763256073, "step": 26 }, { "epoch": 0.08143261074458058, "grad_norm": 3.1878859996795654, "learning_rate": 9.921450151057401e-07, "logits/chosen": -2.2348599433898926, "logits/rejected": -2.171837329864502, "logps/chosen": -55.70494842529297, "logps/rejected": -80.39353942871094, "loss": 0.9869, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04770302027463913, "rewards/margins": 0.052479151636362076, "rewards/rejected": -0.10018216073513031, "step": 27 }, { "epoch": 0.08444863336475024, "grad_norm": 3.844719648361206, "learning_rate": 9.918429003021148e-07, "logits/chosen": -2.213869333267212, "logits/rejected": -2.117082118988037, "logps/chosen": -56.50190734863281, "logps/rejected": -89.81669616699219, "loss": 0.9752, "rewards/accuracies": 0.875, "rewards/chosen": -0.0251077301800251, "rewards/margins": 0.09957974404096603, "rewards/rejected": -0.12468748539686203, "step": 28 }, { "epoch": 0.08746465598491988, "grad_norm": 3.442169427871704, "learning_rate": 9.915407854984895e-07, "logits/chosen": -2.25830340385437, "logits/rejected": -2.1151771545410156, "logps/chosen": -50.68057632446289, "logps/rejected": -79.68639373779297, "loss": 0.9798, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0043763285502791405, "rewards/margins": 0.08114366233348846, "rewards/rejected": -0.07676732540130615, "step": 29 }, { "epoch": 0.09048067860508954, "grad_norm": 5.208356857299805, "learning_rate": 9.91238670694864e-07, "logits/chosen": -2.23907208442688, "logits/rejected": -2.1594414710998535, "logps/chosen": -54.56939697265625, "logps/rejected": -84.51333618164062, "loss": 0.9724, "rewards/accuracies": 0.78125, "rewards/chosen": -0.01956946961581707, "rewards/margins": 0.11064708232879639, "rewards/rejected": -0.1302165538072586, "step": 30 }, { "epoch": 0.09349670122525919, "grad_norm": 4.8146891593933105, "learning_rate": 9.909365558912386e-07, "logits/chosen": -2.2671127319335938, "logits/rejected": -2.1985244750976562, "logps/chosen": -61.98454284667969, "logps/rejected": -83.38849639892578, "loss": 0.9702, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0063543906435370445, "rewards/margins": 0.11966527998447418, "rewards/rejected": -0.11331088095903397, "step": 31 }, { "epoch": 0.09651272384542885, "grad_norm": 3.7575671672821045, "learning_rate": 9.906344410876133e-07, "logits/chosen": -2.234227180480957, "logits/rejected": -2.0970661640167236, "logps/chosen": -54.73686981201172, "logps/rejected": -82.94193267822266, "loss": 0.9776, "rewards/accuracies": 0.78125, "rewards/chosen": -0.04276373237371445, "rewards/margins": 0.09000205248594284, "rewards/rejected": -0.1327657848596573, "step": 32 }, { "epoch": 0.09952874646559849, "grad_norm": 3.968949794769287, "learning_rate": 9.90332326283988e-07, "logits/chosen": -2.3798561096191406, "logits/rejected": -2.2311830520629883, "logps/chosen": -58.05204391479492, "logps/rejected": -84.93900299072266, "loss": 0.9715, "rewards/accuracies": 0.78125, "rewards/chosen": -0.04501646012067795, "rewards/margins": 0.11494946479797363, "rewards/rejected": -0.1599659025669098, "step": 33 }, { "epoch": 0.10254476908576814, "grad_norm": 4.029476165771484, "learning_rate": 9.900302114803624e-07, "logits/chosen": -2.2183918952941895, "logits/rejected": -2.1517586708068848, "logps/chosen": -58.38212203979492, "logps/rejected": -85.02774810791016, "loss": 0.9613, "rewards/accuracies": 0.875, "rewards/chosen": -0.014175701886415482, "rewards/margins": 0.15577144920825958, "rewards/rejected": -0.16994713246822357, "step": 34 }, { "epoch": 0.1055607917059378, "grad_norm": 4.203932285308838, "learning_rate": 9.89728096676737e-07, "logits/chosen": -2.2803635597229004, "logits/rejected": -2.126408338546753, "logps/chosen": -61.44166946411133, "logps/rejected": -91.71792602539062, "loss": 0.9619, "rewards/accuracies": 0.84375, "rewards/chosen": -0.07070858031511307, "rewards/margins": 0.15369878709316254, "rewards/rejected": -0.224407359957695, "step": 35 }, { "epoch": 0.10857681432610744, "grad_norm": 3.4377007484436035, "learning_rate": 9.894259818731117e-07, "logits/chosen": -2.1788058280944824, "logits/rejected": -2.082000255584717, "logps/chosen": -59.31795120239258, "logps/rejected": -82.4972152709961, "loss": 0.9725, "rewards/accuracies": 0.84375, "rewards/chosen": -0.08165419101715088, "rewards/margins": 0.11069943755865097, "rewards/rejected": -0.19235362112522125, "step": 36 }, { "epoch": 0.1115928369462771, "grad_norm": 3.8397305011749268, "learning_rate": 9.891238670694864e-07, "logits/chosen": -2.308764934539795, "logits/rejected": -2.1735079288482666, "logps/chosen": -57.679542541503906, "logps/rejected": -79.48867797851562, "loss": 0.962, "rewards/accuracies": 0.90625, "rewards/chosen": -0.04306763410568237, "rewards/margins": 0.153228297829628, "rewards/rejected": -0.19629594683647156, "step": 37 }, { "epoch": 0.11460885956644674, "grad_norm": 4.138980865478516, "learning_rate": 9.888217522658609e-07, "logits/chosen": -2.358689546585083, "logits/rejected": -2.23755145072937, "logps/chosen": -57.167694091796875, "logps/rejected": -91.09989166259766, "loss": 0.9553, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04707568138837814, "rewards/margins": 0.18081767857074738, "rewards/rejected": -0.22789333760738373, "step": 38 }, { "epoch": 0.1176248821866164, "grad_norm": 3.8915581703186035, "learning_rate": 9.885196374622357e-07, "logits/chosen": -2.265199899673462, "logits/rejected": -2.1462221145629883, "logps/chosen": -57.124961853027344, "logps/rejected": -85.11444091796875, "loss": 0.9562, "rewards/accuracies": 0.90625, "rewards/chosen": -0.03957351669669151, "rewards/margins": 0.17672106623649597, "rewards/rejected": -0.21629458665847778, "step": 39 }, { "epoch": 0.12064090480678605, "grad_norm": 4.228728294372559, "learning_rate": 9.882175226586102e-07, "logits/chosen": -2.2470297813415527, "logits/rejected": -2.0780773162841797, "logps/chosen": -54.03599548339844, "logps/rejected": -83.22564697265625, "loss": 0.9516, "rewards/accuracies": 0.96875, "rewards/chosen": -0.023431386798620224, "rewards/margins": 0.19534292817115784, "rewards/rejected": -0.21877431869506836, "step": 40 }, { "epoch": 0.1236569274269557, "grad_norm": 4.085625648498535, "learning_rate": 9.879154078549849e-07, "logits/chosen": -2.2722222805023193, "logits/rejected": -2.1488194465637207, "logps/chosen": -57.995323181152344, "logps/rejected": -87.24188232421875, "loss": 0.9405, "rewards/accuracies": 0.96875, "rewards/chosen": -0.03535294532775879, "rewards/margins": 0.24090144038200378, "rewards/rejected": -0.27625441551208496, "step": 41 }, { "epoch": 0.12667295004712537, "grad_norm": 4.2383246421813965, "learning_rate": 9.876132930513593e-07, "logits/chosen": -2.275418758392334, "logits/rejected": -2.1550374031066895, "logps/chosen": -56.30076599121094, "logps/rejected": -89.16744232177734, "loss": 0.9396, "rewards/accuracies": 0.9375, "rewards/chosen": -0.059954285621643066, "rewards/margins": 0.2455308586359024, "rewards/rejected": -0.3054851293563843, "step": 42 }, { "epoch": 0.129688972667295, "grad_norm": 3.818253517150879, "learning_rate": 9.873111782477342e-07, "logits/chosen": -2.1690359115600586, "logits/rejected": -2.093170166015625, "logps/chosen": -52.516727447509766, "logps/rejected": -79.12615966796875, "loss": 0.9463, "rewards/accuracies": 0.96875, "rewards/chosen": -0.033910829573869705, "rewards/margins": 0.21745765209197998, "rewards/rejected": -0.2513684928417206, "step": 43 }, { "epoch": 0.13270499528746466, "grad_norm": 6.75556755065918, "learning_rate": 9.870090634441087e-07, "logits/chosen": -2.2973318099975586, "logits/rejected": -2.1542844772338867, "logps/chosen": -54.42872619628906, "logps/rejected": -93.9161605834961, "loss": 0.9342, "rewards/accuracies": 0.90625, "rewards/chosen": -0.10671578347682953, "rewards/margins": 0.26940932869911194, "rewards/rejected": -0.3761250972747803, "step": 44 }, { "epoch": 0.1357210179076343, "grad_norm": 4.149394989013672, "learning_rate": 9.867069486404833e-07, "logits/chosen": -2.246821403503418, "logits/rejected": -2.1341776847839355, "logps/chosen": -58.09614944458008, "logps/rejected": -86.50494384765625, "loss": 0.9504, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12004183232784271, "rewards/margins": 0.20197732746601105, "rewards/rejected": -0.32201918959617615, "step": 45 }, { "epoch": 0.13873704052780395, "grad_norm": 3.241729736328125, "learning_rate": 9.864048338368578e-07, "logits/chosen": -2.316793918609619, "logits/rejected": -2.139280080795288, "logps/chosen": -55.51958465576172, "logps/rejected": -79.81816101074219, "loss": 0.9486, "rewards/accuracies": 0.90625, "rewards/chosen": -0.05623524263501167, "rewards/margins": 0.20919768512248993, "rewards/rejected": -0.2654329240322113, "step": 46 }, { "epoch": 0.14175306314797362, "grad_norm": 4.158636569976807, "learning_rate": 9.861027190332327e-07, "logits/chosen": -2.2742843627929688, "logits/rejected": -2.156515121459961, "logps/chosen": -58.054256439208984, "logps/rejected": -90.16730499267578, "loss": 0.9288, "rewards/accuracies": 0.90625, "rewards/chosen": -0.07182648777961731, "rewards/margins": 0.2900038957595825, "rewards/rejected": -0.36183038353919983, "step": 47 }, { "epoch": 0.14476908576814326, "grad_norm": 3.9891815185546875, "learning_rate": 9.858006042296071e-07, "logits/chosen": -2.2609877586364746, "logits/rejected": -2.1507513523101807, "logps/chosen": -58.61878204345703, "logps/rejected": -86.70022583007812, "loss": 0.9363, "rewards/accuracies": 0.90625, "rewards/chosen": -0.09076130390167236, "rewards/margins": 0.2597510814666748, "rewards/rejected": -0.35051241517066956, "step": 48 }, { "epoch": 0.1477851083883129, "grad_norm": 4.150099754333496, "learning_rate": 9.854984894259818e-07, "logits/chosen": -2.2460951805114746, "logits/rejected": -2.054863214492798, "logps/chosen": -52.70261764526367, "logps/rejected": -90.6244888305664, "loss": 0.9242, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06902869045734406, "rewards/margins": 0.31043004989624023, "rewards/rejected": -0.3794587254524231, "step": 49 }, { "epoch": 0.15080113100848255, "grad_norm": 3.8503687381744385, "learning_rate": 9.851963746223565e-07, "logits/chosen": -2.2700796127319336, "logits/rejected": -2.231461524963379, "logps/chosen": -60.67612075805664, "logps/rejected": -89.62191772460938, "loss": 0.9355, "rewards/accuracies": 0.90625, "rewards/chosen": -0.12461702525615692, "rewards/margins": 0.26424041390419006, "rewards/rejected": -0.3888574540615082, "step": 50 }, { "epoch": 0.15381715362865223, "grad_norm": 4.701745510101318, "learning_rate": 9.848942598187312e-07, "logits/chosen": -2.2676820755004883, "logits/rejected": -2.1923577785491943, "logps/chosen": -55.61472702026367, "logps/rejected": -95.21495056152344, "loss": 0.902, "rewards/accuracies": 1.0, "rewards/chosen": -0.10065345466136932, "rewards/margins": 0.4048173725605011, "rewards/rejected": -0.5054708123207092, "step": 51 }, { "epoch": 0.15683317624882187, "grad_norm": 3.534820795059204, "learning_rate": 9.845921450151056e-07, "logits/chosen": -2.279330015182495, "logits/rejected": -2.2518463134765625, "logps/chosen": -60.14490509033203, "logps/rejected": -83.98274993896484, "loss": 0.9368, "rewards/accuracies": 0.84375, "rewards/chosen": -0.13593026995658875, "rewards/margins": 0.25904935598373413, "rewards/rejected": -0.3949796259403229, "step": 52 }, { "epoch": 0.15984919886899152, "grad_norm": 4.730432987213135, "learning_rate": 9.842900302114803e-07, "logits/chosen": -2.338526964187622, "logits/rejected": -2.1884193420410156, "logps/chosen": -61.62157440185547, "logps/rejected": -95.10875701904297, "loss": 0.9037, "rewards/accuracies": 0.90625, "rewards/chosen": -0.10432429611682892, "rewards/margins": 0.39881494641304016, "rewards/rejected": -0.5031391978263855, "step": 53 }, { "epoch": 0.16286522148916116, "grad_norm": 3.6380181312561035, "learning_rate": 9.83987915407855e-07, "logits/chosen": -2.337097406387329, "logits/rejected": -2.237159490585327, "logps/chosen": -63.557064056396484, "logps/rejected": -90.04523468017578, "loss": 0.9261, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20270851254463196, "rewards/margins": 0.30580317974090576, "rewards/rejected": -0.5085116624832153, "step": 54 }, { "epoch": 0.16588124410933083, "grad_norm": 4.088222503662109, "learning_rate": 9.836858006042296e-07, "logits/chosen": -2.34084415435791, "logits/rejected": -2.2009730339050293, "logps/chosen": -59.226600646972656, "logps/rejected": -90.01954650878906, "loss": 0.8953, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05887800455093384, "rewards/margins": 0.43374162912368774, "rewards/rejected": -0.4926196336746216, "step": 55 }, { "epoch": 0.16889726672950048, "grad_norm": 4.826124668121338, "learning_rate": 9.83383685800604e-07, "logits/chosen": -2.3022305965423584, "logits/rejected": -2.1939494609832764, "logps/chosen": -59.41986083984375, "logps/rejected": -95.28821563720703, "loss": 0.8949, "rewards/accuracies": 1.0, "rewards/chosen": -0.12728393077850342, "rewards/margins": 0.43793752789497375, "rewards/rejected": -0.5652214288711548, "step": 56 }, { "epoch": 0.17191328934967012, "grad_norm": 4.908831596374512, "learning_rate": 9.830815709969788e-07, "logits/chosen": -2.2557103633880615, "logits/rejected": -2.1875264644622803, "logps/chosen": -61.17342758178711, "logps/rejected": -93.0698471069336, "loss": 0.9123, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1717473566532135, "rewards/margins": 0.36359184980392456, "rewards/rejected": -0.5353392362594604, "step": 57 }, { "epoch": 0.17492931196983977, "grad_norm": 4.027940273284912, "learning_rate": 9.827794561933534e-07, "logits/chosen": -2.3626837730407715, "logits/rejected": -2.2270796298980713, "logps/chosen": -58.915279388427734, "logps/rejected": -92.39025115966797, "loss": 0.9058, "rewards/accuracies": 0.90625, "rewards/chosen": -0.13496284186840057, "rewards/margins": 0.3935774862766266, "rewards/rejected": -0.5285402536392212, "step": 58 }, { "epoch": 0.1779453345900094, "grad_norm": 4.060853958129883, "learning_rate": 9.82477341389728e-07, "logits/chosen": -2.3412177562713623, "logits/rejected": -2.164668560028076, "logps/chosen": -57.50774383544922, "logps/rejected": -90.56875610351562, "loss": 0.8968, "rewards/accuracies": 1.0, "rewards/chosen": -0.1172608882188797, "rewards/margins": 0.429917573928833, "rewards/rejected": -0.5471784472465515, "step": 59 }, { "epoch": 0.18096135721017909, "grad_norm": 4.018914222717285, "learning_rate": 9.821752265861026e-07, "logits/chosen": -2.2560245990753174, "logits/rejected": -2.129998207092285, "logps/chosen": -56.92362594604492, "logps/rejected": -86.41961669921875, "loss": 0.9015, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09702590107917786, "rewards/margins": 0.41105878353118896, "rewards/rejected": -0.5080847144126892, "step": 60 }, { "epoch": 0.18397737983034873, "grad_norm": 4.286045551300049, "learning_rate": 9.818731117824774e-07, "logits/chosen": -2.2191572189331055, "logits/rejected": -2.151203155517578, "logps/chosen": -53.64063262939453, "logps/rejected": -85.89777374267578, "loss": 0.8706, "rewards/accuracies": 0.96875, "rewards/chosen": -0.025279652327299118, "rewards/margins": 0.5354368686676025, "rewards/rejected": -0.5607165098190308, "step": 61 }, { "epoch": 0.18699340245051838, "grad_norm": 3.489643096923828, "learning_rate": 9.81570996978852e-07, "logits/chosen": -2.2757174968719482, "logits/rejected": -2.1964213848114014, "logps/chosen": -62.72797393798828, "logps/rejected": -87.94793701171875, "loss": 0.9067, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19786116480827332, "rewards/margins": 0.3952963352203369, "rewards/rejected": -0.5931575298309326, "step": 62 }, { "epoch": 0.19000942507068802, "grad_norm": 4.45051908493042, "learning_rate": 9.812688821752266e-07, "logits/chosen": -2.317206382751465, "logits/rejected": -2.1925652027130127, "logps/chosen": -57.72392654418945, "logps/rejected": -90.60494995117188, "loss": 0.8958, "rewards/accuracies": 0.96875, "rewards/chosen": -0.11160965263843536, "rewards/margins": 0.4374327063560486, "rewards/rejected": -0.5490423440933228, "step": 63 }, { "epoch": 0.1930254476908577, "grad_norm": 3.7237088680267334, "learning_rate": 9.80966767371601e-07, "logits/chosen": -2.3555734157562256, "logits/rejected": -2.270644187927246, "logps/chosen": -58.9019775390625, "logps/rejected": -94.68910217285156, "loss": 0.9162, "rewards/accuracies": 0.875, "rewards/chosen": -0.2365657240152359, "rewards/margins": 0.3560475707054138, "rewards/rejected": -0.5926133394241333, "step": 64 }, { "epoch": 0.19604147031102734, "grad_norm": 4.155421733856201, "learning_rate": 9.80664652567976e-07, "logits/chosen": -2.353782892227173, "logits/rejected": -2.1645355224609375, "logps/chosen": -57.29245376586914, "logps/rejected": -87.25802612304688, "loss": 0.8795, "rewards/accuracies": 0.90625, "rewards/chosen": -0.114751435816288, "rewards/margins": 0.5109544396400452, "rewards/rejected": -0.6257058382034302, "step": 65 }, { "epoch": 0.19905749293119698, "grad_norm": 4.311178684234619, "learning_rate": 9.803625377643504e-07, "logits/chosen": -2.2907989025115967, "logits/rejected": -2.165325164794922, "logps/chosen": -55.48672866821289, "logps/rejected": -89.24369049072266, "loss": 0.8542, "rewards/accuracies": 0.96875, "rewards/chosen": -0.057222820818424225, "rewards/margins": 0.6178129315376282, "rewards/rejected": -0.6750357151031494, "step": 66 }, { "epoch": 0.20207351555136663, "grad_norm": 3.8485631942749023, "learning_rate": 9.80060422960725e-07, "logits/chosen": -2.2706613540649414, "logits/rejected": -2.2136471271514893, "logps/chosen": -57.71736145019531, "logps/rejected": -84.98796081542969, "loss": 0.8578, "rewards/accuracies": 0.90625, "rewards/chosen": -0.09612338244915009, "rewards/margins": 0.6059404611587524, "rewards/rejected": -0.7020638585090637, "step": 67 }, { "epoch": 0.20207351555136663, "eval_logits/chosen": -2.256058931350708, "eval_logits/rejected": -2.1461942195892334, "eval_logps/chosen": -60.77305603027344, "eval_logps/rejected": -93.71211242675781, "eval_loss": 0.8593654632568359, "eval_rewards/accuracies": 0.9613401889801025, "eval_rewards/chosen": -0.13683760166168213, "eval_rewards/margins": 0.6072103977203369, "eval_rewards/rejected": -0.744047999382019, "eval_runtime": 700.7399, "eval_samples_per_second": 0.552, "eval_steps_per_second": 0.277, "step": 67 }, { "epoch": 0.20508953817153627, "grad_norm": 4.39597225189209, "learning_rate": 9.797583081570997e-07, "logits/chosen": -2.309138298034668, "logits/rejected": -2.1709914207458496, "logps/chosen": -54.235130310058594, "logps/rejected": -80.96202087402344, "loss": 0.8454, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02097412198781967, "rewards/margins": 0.6590630412101746, "rewards/rejected": -0.6800371408462524, "step": 68 }, { "epoch": 0.20810556079170595, "grad_norm": 4.042968273162842, "learning_rate": 9.794561933534744e-07, "logits/chosen": -2.332859754562378, "logits/rejected": -2.2572295665740967, "logps/chosen": -60.10956954956055, "logps/rejected": -90.36128997802734, "loss": 0.8477, "rewards/accuracies": 0.96875, "rewards/chosen": -0.16183176636695862, "rewards/margins": 0.6645244359970093, "rewards/rejected": -0.8263561725616455, "step": 69 }, { "epoch": 0.2111215834118756, "grad_norm": 4.335555553436279, "learning_rate": 9.791540785498488e-07, "logits/chosen": -2.269598960876465, "logits/rejected": -2.2028770446777344, "logps/chosen": -54.9040412902832, "logps/rejected": -91.29539489746094, "loss": 0.8251, "rewards/accuracies": 1.0, "rewards/chosen": -0.057094328105449677, "rewards/margins": 0.7625002861022949, "rewards/rejected": -0.8195945620536804, "step": 70 }, { "epoch": 0.21413760603204524, "grad_norm": 3.9935100078582764, "learning_rate": 9.788519637462235e-07, "logits/chosen": -2.293064594268799, "logits/rejected": -2.1525468826293945, "logps/chosen": -59.934478759765625, "logps/rejected": -97.45828247070312, "loss": 0.8391, "rewards/accuracies": 1.0, "rewards/chosen": -0.1883009523153305, "rewards/margins": 0.7076339721679688, "rewards/rejected": -0.8959349393844604, "step": 71 }, { "epoch": 0.21715362865221488, "grad_norm": 4.258722305297852, "learning_rate": 9.785498489425982e-07, "logits/chosen": -2.3550429344177246, "logits/rejected": -2.156381607055664, "logps/chosen": -59.12317657470703, "logps/rejected": -96.22732543945312, "loss": 0.8512, "rewards/accuracies": 0.96875, "rewards/chosen": -0.14126363396644592, "rewards/margins": 0.6466884613037109, "rewards/rejected": -0.7879520654678345, "step": 72 }, { "epoch": 0.22016965127238455, "grad_norm": 4.541208744049072, "learning_rate": 9.782477341389729e-07, "logits/chosen": -2.3447816371917725, "logits/rejected": -2.1907031536102295, "logps/chosen": -51.723060607910156, "logps/rejected": -92.24698638916016, "loss": 0.7955, "rewards/accuracies": 0.96875, "rewards/chosen": 0.021348880603909492, "rewards/margins": 0.8902764916419983, "rewards/rejected": -0.868927538394928, "step": 73 }, { "epoch": 0.2231856738925542, "grad_norm": 3.900068998336792, "learning_rate": 9.779456193353473e-07, "logits/chosen": -2.3160040378570557, "logits/rejected": -2.1790428161621094, "logps/chosen": -54.08250045776367, "logps/rejected": -84.6230239868164, "loss": 0.831, "rewards/accuracies": 1.0, "rewards/chosen": -0.06140289828181267, "rewards/margins": 0.7294041514396667, "rewards/rejected": -0.7908070683479309, "step": 74 }, { "epoch": 0.22620169651272384, "grad_norm": 3.836801528930664, "learning_rate": 9.77643504531722e-07, "logits/chosen": -2.3177502155303955, "logits/rejected": -2.2172493934631348, "logps/chosen": -58.34757614135742, "logps/rejected": -97.44451141357422, "loss": 0.8282, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18776001036167145, "rewards/margins": 0.7667455673217773, "rewards/rejected": -0.9545055627822876, "step": 75 }, { "epoch": 0.2292177191328935, "grad_norm": 4.09732723236084, "learning_rate": 9.773413897280967e-07, "logits/chosen": -2.2663087844848633, "logits/rejected": -2.166821002960205, "logps/chosen": -53.36769104003906, "logps/rejected": -92.57527160644531, "loss": 0.8116, "rewards/accuracies": 0.96875, "rewards/chosen": -0.08415424823760986, "rewards/margins": 0.8397340774536133, "rewards/rejected": -0.9238882660865784, "step": 76 }, { "epoch": 0.23223374175306316, "grad_norm": 4.079648494720459, "learning_rate": 9.770392749244713e-07, "logits/chosen": -2.3139312267303467, "logits/rejected": -2.240048408508301, "logps/chosen": -55.0101203918457, "logps/rejected": -90.34867095947266, "loss": 0.8001, "rewards/accuracies": 1.0, "rewards/chosen": -0.06664510816335678, "rewards/margins": 0.8980607390403748, "rewards/rejected": -0.9647058248519897, "step": 77 }, { "epoch": 0.2352497643732328, "grad_norm": 3.8493144512176514, "learning_rate": 9.767371601208458e-07, "logits/chosen": -2.356860876083374, "logits/rejected": -2.177619457244873, "logps/chosen": -56.75716781616211, "logps/rejected": -93.15266418457031, "loss": 0.7919, "rewards/accuracies": 1.0, "rewards/chosen": -0.05263868346810341, "rewards/margins": 0.9236819744110107, "rewards/rejected": -0.9763206243515015, "step": 78 }, { "epoch": 0.23826578699340245, "grad_norm": 4.044839859008789, "learning_rate": 9.764350453172205e-07, "logits/chosen": -2.4059882164001465, "logits/rejected": -2.1930527687072754, "logps/chosen": -54.19709014892578, "logps/rejected": -87.22486114501953, "loss": 0.7908, "rewards/accuracies": 1.0, "rewards/chosen": 0.003855302929878235, "rewards/margins": 0.926905632019043, "rewards/rejected": -0.9230502843856812, "step": 79 }, { "epoch": 0.2412818096135721, "grad_norm": 3.628810405731201, "learning_rate": 9.761329305135951e-07, "logits/chosen": -2.361284017562866, "logits/rejected": -2.2092297077178955, "logps/chosen": -56.89401626586914, "logps/rejected": -94.58021545410156, "loss": 0.8013, "rewards/accuracies": 0.96875, "rewards/chosen": -0.14218401908874512, "rewards/margins": 0.9267742037773132, "rewards/rejected": -1.0689582824707031, "step": 80 }, { "epoch": 0.24429783223374174, "grad_norm": 3.9417264461517334, "learning_rate": 9.758308157099698e-07, "logits/chosen": -2.386610984802246, "logits/rejected": -2.21774959564209, "logps/chosen": -54.376251220703125, "logps/rejected": -95.96426391601562, "loss": 0.7802, "rewards/accuracies": 0.9375, "rewards/chosen": -0.032562606036663055, "rewards/margins": 1.012057900428772, "rewards/rejected": -1.044620394706726, "step": 81 }, { "epoch": 0.2473138548539114, "grad_norm": 4.09711217880249, "learning_rate": 9.755287009063443e-07, "logits/chosen": -2.366408586502075, "logits/rejected": -2.253300666809082, "logps/chosen": -56.43834686279297, "logps/rejected": -99.0872802734375, "loss": 0.7565, "rewards/accuracies": 1.0, "rewards/chosen": -0.06177884340286255, "rewards/margins": 1.1336709260940552, "rewards/rejected": -1.1954498291015625, "step": 82 }, { "epoch": 0.25032987747408103, "grad_norm": 3.78019118309021, "learning_rate": 9.75226586102719e-07, "logits/chosen": -2.350933790206909, "logits/rejected": -2.1586055755615234, "logps/chosen": -57.053558349609375, "logps/rejected": -94.29698944091797, "loss": 0.7686, "rewards/accuracies": 0.96875, "rewards/chosen": -0.014418184757232666, "rewards/margins": 1.0392497777938843, "rewards/rejected": -1.0536679029464722, "step": 83 }, { "epoch": 0.25334590009425073, "grad_norm": 3.7551021575927734, "learning_rate": 9.749244712990936e-07, "logits/chosen": -2.2991275787353516, "logits/rejected": -2.2209792137145996, "logps/chosen": -60.498992919921875, "logps/rejected": -103.00932312011719, "loss": 0.767, "rewards/accuracies": 0.96875, "rewards/chosen": -0.14052177965641022, "rewards/margins": 1.0952491760253906, "rewards/rejected": -1.23577082157135, "step": 84 }, { "epoch": 0.2563619227144204, "grad_norm": 3.638862133026123, "learning_rate": 9.746223564954683e-07, "logits/chosen": -2.379275321960449, "logits/rejected": -2.1952693462371826, "logps/chosen": -55.996429443359375, "logps/rejected": -97.78118896484375, "loss": 0.7704, "rewards/accuracies": 1.0, "rewards/chosen": -0.05160846188664436, "rewards/margins": 1.0679454803466797, "rewards/rejected": -1.1195539236068726, "step": 85 }, { "epoch": 0.25937794533459, "grad_norm": 4.084123134613037, "learning_rate": 9.74320241691843e-07, "logits/chosen": -2.3157029151916504, "logits/rejected": -2.2285702228546143, "logps/chosen": -58.06369400024414, "logps/rejected": -101.6407470703125, "loss": 0.7497, "rewards/accuracies": 1.0, "rewards/chosen": -0.11285893619060516, "rewards/margins": 1.191612720489502, "rewards/rejected": -1.3044716119766235, "step": 86 }, { "epoch": 0.26239396795475967, "grad_norm": 4.58299446105957, "learning_rate": 9.740181268882174e-07, "logits/chosen": -2.312237024307251, "logits/rejected": -2.226510524749756, "logps/chosen": -58.88618087768555, "logps/rejected": -104.65438079833984, "loss": 0.7072, "rewards/accuracies": 1.0, "rewards/chosen": 0.020559703931212425, "rewards/margins": 1.39864981174469, "rewards/rejected": -1.3780900239944458, "step": 87 }, { "epoch": 0.2654099905749293, "grad_norm": 3.735440254211426, "learning_rate": 9.73716012084592e-07, "logits/chosen": -2.4173567295074463, "logits/rejected": -2.288651943206787, "logps/chosen": -59.24122619628906, "logps/rejected": -92.02457427978516, "loss": 0.7617, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08667103946208954, "rewards/margins": 1.1716630458831787, "rewards/rejected": -1.0849920511245728, "step": 88 }, { "epoch": 0.26842601319509896, "grad_norm": 3.8996169567108154, "learning_rate": 9.734138972809667e-07, "logits/chosen": -2.4403464794158936, "logits/rejected": -2.2609128952026367, "logps/chosen": -52.22582244873047, "logps/rejected": -99.91914367675781, "loss": 0.6954, "rewards/accuracies": 1.0, "rewards/chosen": 0.0870881974697113, "rewards/margins": 1.441041350364685, "rewards/rejected": -1.3539530038833618, "step": 89 }, { "epoch": 0.2714420358152686, "grad_norm": 4.103704452514648, "learning_rate": 9.731117824773414e-07, "logits/chosen": -2.3222804069519043, "logits/rejected": -2.224429130554199, "logps/chosen": -57.48881530761719, "logps/rejected": -101.90032196044922, "loss": 0.7229, "rewards/accuracies": 0.96875, "rewards/chosen": 0.000397704541683197, "rewards/margins": 1.343838095664978, "rewards/rejected": -1.3434404134750366, "step": 90 }, { "epoch": 0.27445805843543825, "grad_norm": 3.5731470584869385, "learning_rate": 9.728096676737159e-07, "logits/chosen": -2.437788248062134, "logits/rejected": -2.282911777496338, "logps/chosen": -55.0865364074707, "logps/rejected": -93.16254425048828, "loss": 0.7587, "rewards/accuracies": 0.96875, "rewards/chosen": -0.026715070009231567, "rewards/margins": 1.1369513273239136, "rewards/rejected": -1.1636664867401123, "step": 91 }, { "epoch": 0.2774740810556079, "grad_norm": 3.2819085121154785, "learning_rate": 9.725075528700905e-07, "logits/chosen": -2.4049124717712402, "logits/rejected": -2.238008499145508, "logps/chosen": -55.837867736816406, "logps/rejected": -95.04115295410156, "loss": 0.7589, "rewards/accuracies": 0.9375, "rewards/chosen": 0.030945371836423874, "rewards/margins": 1.156395435333252, "rewards/rejected": -1.1254501342773438, "step": 92 }, { "epoch": 0.2804901036757776, "grad_norm": 3.890367269515991, "learning_rate": 9.722054380664652e-07, "logits/chosen": -2.3425214290618896, "logits/rejected": -2.27998423576355, "logps/chosen": -57.33137893676758, "logps/rejected": -94.94901275634766, "loss": 0.7253, "rewards/accuracies": 0.875, "rewards/chosen": -0.021207205951213837, "rewards/margins": 1.312483787536621, "rewards/rejected": -1.3336910009384155, "step": 93 }, { "epoch": 0.28350612629594724, "grad_norm": 3.824268341064453, "learning_rate": 9.719033232628399e-07, "logits/chosen": -2.398597478866577, "logits/rejected": -2.277431011199951, "logps/chosen": -55.294166564941406, "logps/rejected": -88.46250915527344, "loss": 0.7397, "rewards/accuracies": 0.96875, "rewards/chosen": 0.15162764489650726, "rewards/margins": 1.2239660024642944, "rewards/rejected": -1.0723384618759155, "step": 94 }, { "epoch": 0.2865221489161169, "grad_norm": 3.7927963733673096, "learning_rate": 9.716012084592146e-07, "logits/chosen": -2.2935163974761963, "logits/rejected": -2.2833595275878906, "logps/chosen": -58.810142517089844, "logps/rejected": -100.87290954589844, "loss": 0.6958, "rewards/accuracies": 0.96875, "rewards/chosen": 0.06340834498405457, "rewards/margins": 1.465827465057373, "rewards/rejected": -1.4024192094802856, "step": 95 }, { "epoch": 0.2895381715362865, "grad_norm": 3.4132139682769775, "learning_rate": 9.71299093655589e-07, "logits/chosen": -2.390028953552246, "logits/rejected": -2.1896331310272217, "logps/chosen": -58.797481536865234, "logps/rejected": -96.88456726074219, "loss": 0.7247, "rewards/accuracies": 1.0, "rewards/chosen": 0.14563514292240143, "rewards/margins": 1.3148603439331055, "rewards/rejected": -1.1692252159118652, "step": 96 }, { "epoch": 0.29255419415645617, "grad_norm": 3.6910643577575684, "learning_rate": 9.709969788519637e-07, "logits/chosen": -2.3618290424346924, "logits/rejected": -2.2120091915130615, "logps/chosen": -57.57598114013672, "logps/rejected": -100.84781646728516, "loss": 0.7221, "rewards/accuracies": 0.90625, "rewards/chosen": 0.07303296774625778, "rewards/margins": 1.3497439622879028, "rewards/rejected": -1.2767109870910645, "step": 97 }, { "epoch": 0.2955702167766258, "grad_norm": 3.426823139190674, "learning_rate": 9.706948640483384e-07, "logits/chosen": -2.3751161098480225, "logits/rejected": -2.275686740875244, "logps/chosen": -56.40439224243164, "logps/rejected": -100.8688735961914, "loss": 0.6901, "rewards/accuracies": 0.9375, "rewards/chosen": 0.06014891713857651, "rewards/margins": 1.5400049686431885, "rewards/rejected": -1.4798561334609985, "step": 98 }, { "epoch": 0.29858623939679546, "grad_norm": 3.6953494548797607, "learning_rate": 9.70392749244713e-07, "logits/chosen": -2.3937087059020996, "logits/rejected": -2.2441577911376953, "logps/chosen": -53.454078674316406, "logps/rejected": -95.21162414550781, "loss": 0.6936, "rewards/accuracies": 0.96875, "rewards/chosen": 0.21240216493606567, "rewards/margins": 1.4729377031326294, "rewards/rejected": -1.260535478591919, "step": 99 }, { "epoch": 0.3016022620169651, "grad_norm": 3.6052329540252686, "learning_rate": 9.700906344410875e-07, "logits/chosen": -2.4559030532836914, "logits/rejected": -2.321957588195801, "logps/chosen": -53.66700744628906, "logps/rejected": -88.87232971191406, "loss": 0.6973, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3188741207122803, "rewards/margins": 1.4090721607208252, "rewards/rejected": -1.0901981592178345, "step": 100 }, { "epoch": 0.30461828463713475, "grad_norm": 4.0523681640625, "learning_rate": 9.697885196374622e-07, "logits/chosen": -2.3945209980010986, "logits/rejected": -2.2941877841949463, "logps/chosen": -55.3075065612793, "logps/rejected": -88.5333480834961, "loss": 0.6929, "rewards/accuracies": 0.96875, "rewards/chosen": 0.22522836923599243, "rewards/margins": 1.4524867534637451, "rewards/rejected": -1.2272584438323975, "step": 101 }, { "epoch": 0.30763430725730445, "grad_norm": 3.1073591709136963, "learning_rate": 9.694864048338368e-07, "logits/chosen": -2.4063148498535156, "logits/rejected": -2.3307251930236816, "logps/chosen": -54.69801330566406, "logps/rejected": -93.97372436523438, "loss": 0.7022, "rewards/accuracies": 1.0, "rewards/chosen": 0.08517104387283325, "rewards/margins": 1.497436285018921, "rewards/rejected": -1.4122650623321533, "step": 102 }, { "epoch": 0.3106503298774741, "grad_norm": 3.664252281188965, "learning_rate": 9.691842900302115e-07, "logits/chosen": -2.4857800006866455, "logits/rejected": -2.350292444229126, "logps/chosen": -53.50385665893555, "logps/rejected": -85.10662078857422, "loss": 0.7203, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1730692982673645, "rewards/margins": 1.3157007694244385, "rewards/rejected": -1.1426315307617188, "step": 103 }, { "epoch": 0.31366635249764374, "grad_norm": 4.617639541625977, "learning_rate": 9.688821752265862e-07, "logits/chosen": -2.4646334648132324, "logits/rejected": -2.310265064239502, "logps/chosen": -49.809391021728516, "logps/rejected": -82.89166259765625, "loss": 0.7495, "rewards/accuracies": 0.90625, "rewards/chosen": 0.18061178922653198, "rewards/margins": 1.2156604528427124, "rewards/rejected": -1.0350486040115356, "step": 104 }, { "epoch": 0.3166823751178134, "grad_norm": 3.5545756816864014, "learning_rate": 9.685800604229606e-07, "logits/chosen": -2.3763692378997803, "logits/rejected": -2.223851203918457, "logps/chosen": -57.659629821777344, "logps/rejected": -103.27882385253906, "loss": 0.6735, "rewards/accuracies": 0.96875, "rewards/chosen": 0.15960198640823364, "rewards/margins": 1.5702650547027588, "rewards/rejected": -1.4106628894805908, "step": 105 }, { "epoch": 0.31969839773798303, "grad_norm": 3.440394163131714, "learning_rate": 9.682779456193353e-07, "logits/chosen": -2.3196053504943848, "logits/rejected": -2.2389020919799805, "logps/chosen": -51.8743896484375, "logps/rejected": -92.71244812011719, "loss": 0.6252, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4698944389820099, "rewards/margins": 1.8390076160430908, "rewards/rejected": -1.3691132068634033, "step": 106 }, { "epoch": 0.3227144203581527, "grad_norm": 3.488429069519043, "learning_rate": 9.6797583081571e-07, "logits/chosen": -2.3653664588928223, "logits/rejected": -2.256066083908081, "logps/chosen": -53.34688949584961, "logps/rejected": -99.76814270019531, "loss": 0.6807, "rewards/accuracies": 0.96875, "rewards/chosen": 0.13588032126426697, "rewards/margins": 1.6301050186157227, "rewards/rejected": -1.4942247867584229, "step": 107 }, { "epoch": 0.3257304429783223, "grad_norm": 3.4738399982452393, "learning_rate": 9.676737160120846e-07, "logits/chosen": -2.3859143257141113, "logits/rejected": -2.312906265258789, "logps/chosen": -51.894691467285156, "logps/rejected": -96.33088684082031, "loss": 0.6366, "rewards/accuracies": 0.96875, "rewards/chosen": 0.45619040727615356, "rewards/margins": 1.8896691799163818, "rewards/rejected": -1.433478832244873, "step": 108 }, { "epoch": 0.32874646559849197, "grad_norm": 3.0532305240631104, "learning_rate": 9.67371601208459e-07, "logits/chosen": -2.3544921875, "logits/rejected": -2.2640507221221924, "logps/chosen": -50.92245864868164, "logps/rejected": -96.62598419189453, "loss": 0.6419, "rewards/accuracies": 1.0, "rewards/chosen": 0.44735750555992126, "rewards/margins": 1.8198423385620117, "rewards/rejected": -1.3724849224090576, "step": 109 }, { "epoch": 0.33176248821866167, "grad_norm": 3.8054895401000977, "learning_rate": 9.670694864048338e-07, "logits/chosen": -2.3163139820098877, "logits/rejected": -2.2540411949157715, "logps/chosen": -58.48188781738281, "logps/rejected": -100.45158386230469, "loss": 0.7041, "rewards/accuracies": 1.0, "rewards/chosen": 0.07748913764953613, "rewards/margins": 1.5173516273498535, "rewards/rejected": -1.4398627281188965, "step": 110 }, { "epoch": 0.3347785108388313, "grad_norm": 3.551115036010742, "learning_rate": 9.667673716012084e-07, "logits/chosen": -2.430685520172119, "logits/rejected": -2.294037103652954, "logps/chosen": -51.739349365234375, "logps/rejected": -99.87422943115234, "loss": 0.6167, "rewards/accuracies": 1.0, "rewards/chosen": 0.43015584349632263, "rewards/margins": 1.9568288326263428, "rewards/rejected": -1.5266729593276978, "step": 111 }, { "epoch": 0.33779453345900096, "grad_norm": 3.530579090118408, "learning_rate": 9.66465256797583e-07, "logits/chosen": -2.4519522190093994, "logits/rejected": -2.28436279296875, "logps/chosen": -49.41943359375, "logps/rejected": -95.55694580078125, "loss": 0.6211, "rewards/accuracies": 0.9375, "rewards/chosen": 0.48828911781311035, "rewards/margins": 1.8927828073501587, "rewards/rejected": -1.4044936895370483, "step": 112 }, { "epoch": 0.3408105560791706, "grad_norm": 3.6994285583496094, "learning_rate": 9.661631419939576e-07, "logits/chosen": -2.4131217002868652, "logits/rejected": -2.238986015319824, "logps/chosen": -48.64258575439453, "logps/rejected": -100.00653839111328, "loss": 0.5991, "rewards/accuracies": 0.96875, "rewards/chosen": 0.536800742149353, "rewards/margins": 2.0364904403686523, "rewards/rejected": -1.4996898174285889, "step": 113 }, { "epoch": 0.34382657869934025, "grad_norm": 2.8024401664733887, "learning_rate": 9.658610271903322e-07, "logits/chosen": -2.3564605712890625, "logits/rejected": -2.2974112033843994, "logps/chosen": -52.8522834777832, "logps/rejected": -89.34681701660156, "loss": 0.7004, "rewards/accuracies": 0.96875, "rewards/chosen": 0.31507396697998047, "rewards/margins": 1.581256628036499, "rewards/rejected": -1.2661826610565186, "step": 114 }, { "epoch": 0.3468426013195099, "grad_norm": 3.320683002471924, "learning_rate": 9.65558912386707e-07, "logits/chosen": -2.4018330574035645, "logits/rejected": -2.317176103591919, "logps/chosen": -57.07184600830078, "logps/rejected": -94.7877197265625, "loss": 0.623, "rewards/accuracies": 0.96875, "rewards/chosen": 0.43911415338516235, "rewards/margins": 1.951378583908081, "rewards/rejected": -1.5122644901275635, "step": 115 }, { "epoch": 0.34985862393967954, "grad_norm": 3.6765835285186768, "learning_rate": 9.652567975830816e-07, "logits/chosen": -2.42112135887146, "logits/rejected": -2.3074705600738525, "logps/chosen": -48.24287414550781, "logps/rejected": -92.77735137939453, "loss": 0.5953, "rewards/accuracies": 0.96875, "rewards/chosen": 0.707151472568512, "rewards/margins": 2.064208984375, "rewards/rejected": -1.3570573329925537, "step": 116 }, { "epoch": 0.3528746465598492, "grad_norm": 3.1525673866271973, "learning_rate": 9.64954682779456e-07, "logits/chosen": -2.394606828689575, "logits/rejected": -2.3334810733795166, "logps/chosen": -49.841468811035156, "logps/rejected": -95.40814208984375, "loss": 0.6049, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6286394596099854, "rewards/margins": 2.140758514404297, "rewards/rejected": -1.512118935585022, "step": 117 }, { "epoch": 0.3558906691800188, "grad_norm": 3.320131778717041, "learning_rate": 9.646525679758307e-07, "logits/chosen": -2.401702642440796, "logits/rejected": -2.34861159324646, "logps/chosen": -47.99945068359375, "logps/rejected": -97.310791015625, "loss": 0.579, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6937269568443298, "rewards/margins": 2.16561222076416, "rewards/rejected": -1.471885323524475, "step": 118 }, { "epoch": 0.3589066918001885, "grad_norm": 4.925288200378418, "learning_rate": 9.643504531722054e-07, "logits/chosen": -2.3785617351531982, "logits/rejected": -2.3331246376037598, "logps/chosen": -56.01031494140625, "logps/rejected": -95.61845397949219, "loss": 0.646, "rewards/accuracies": 0.9375, "rewards/chosen": 0.22489067912101746, "rewards/margins": 1.895341396331787, "rewards/rejected": -1.6704506874084473, "step": 119 }, { "epoch": 0.36192271442035817, "grad_norm": 4.198221206665039, "learning_rate": 9.6404833836858e-07, "logits/chosen": -2.394716739654541, "logits/rejected": -2.3344593048095703, "logps/chosen": -46.576873779296875, "logps/rejected": -86.79048156738281, "loss": 0.6144, "rewards/accuracies": 1.0, "rewards/chosen": 0.7756027579307556, "rewards/margins": 1.957823634147644, "rewards/rejected": -1.1822208166122437, "step": 120 }, { "epoch": 0.3649387370405278, "grad_norm": 2.7662036418914795, "learning_rate": 9.637462235649545e-07, "logits/chosen": -2.3949804306030273, "logits/rejected": -2.2852303981781006, "logps/chosen": -51.07152557373047, "logps/rejected": -100.06791687011719, "loss": 0.5637, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5699265003204346, "rewards/margins": 2.3447365760803223, "rewards/rejected": -1.7748099565505981, "step": 121 }, { "epoch": 0.36795475966069746, "grad_norm": 3.1530256271362305, "learning_rate": 9.634441087613294e-07, "logits/chosen": -2.4206151962280273, "logits/rejected": -2.3803343772888184, "logps/chosen": -49.02263641357422, "logps/rejected": -99.75690460205078, "loss": 0.5557, "rewards/accuracies": 1.0, "rewards/chosen": 0.650560736656189, "rewards/margins": 2.3806731700897217, "rewards/rejected": -1.7301126718521118, "step": 122 }, { "epoch": 0.3709707822808671, "grad_norm": 3.2499616146087646, "learning_rate": 9.631419939577038e-07, "logits/chosen": -2.3936641216278076, "logits/rejected": -2.3159050941467285, "logps/chosen": -52.29252624511719, "logps/rejected": -91.70088958740234, "loss": 0.6478, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6409454345703125, "rewards/margins": 2.0162758827209473, "rewards/rejected": -1.3753303289413452, "step": 123 }, { "epoch": 0.37398680490103675, "grad_norm": 3.991055965423584, "learning_rate": 9.628398791540785e-07, "logits/chosen": -2.4456489086151123, "logits/rejected": -2.3128232955932617, "logps/chosen": -47.88347625732422, "logps/rejected": -109.89127349853516, "loss": 0.5451, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5823150873184204, "rewards/margins": 2.644470691680908, "rewards/rejected": -2.0621557235717773, "step": 124 }, { "epoch": 0.3770028275212064, "grad_norm": 3.415842056274414, "learning_rate": 9.625377643504532e-07, "logits/chosen": -2.396725654602051, "logits/rejected": -2.299048900604248, "logps/chosen": -48.3135871887207, "logps/rejected": -96.44157409667969, "loss": 0.5743, "rewards/accuracies": 1.0, "rewards/chosen": 0.6667695641517639, "rewards/margins": 2.3224430084228516, "rewards/rejected": -1.6556733846664429, "step": 125 }, { "epoch": 0.38001885014137604, "grad_norm": 3.475870370864868, "learning_rate": 9.622356495468279e-07, "logits/chosen": -2.439563512802124, "logits/rejected": -2.332169532775879, "logps/chosen": -47.23240661621094, "logps/rejected": -100.5015869140625, "loss": 0.5557, "rewards/accuracies": 1.0, "rewards/chosen": 0.8449040055274963, "rewards/margins": 2.534376382827759, "rewards/rejected": -1.6894724369049072, "step": 126 }, { "epoch": 0.3830348727615457, "grad_norm": 3.069298505783081, "learning_rate": 9.619335347432023e-07, "logits/chosen": -2.465283155441284, "logits/rejected": -2.4149601459503174, "logps/chosen": -47.588951110839844, "logps/rejected": -99.19448852539062, "loss": 0.5569, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8125267624855042, "rewards/margins": 2.522831678390503, "rewards/rejected": -1.7103047370910645, "step": 127 }, { "epoch": 0.3860508953817154, "grad_norm": 3.024656295776367, "learning_rate": 9.61631419939577e-07, "logits/chosen": -2.4475765228271484, "logits/rejected": -2.2920360565185547, "logps/chosen": -50.95102310180664, "logps/rejected": -94.44181823730469, "loss": 0.6372, "rewards/accuracies": 0.9375, "rewards/chosen": 0.44171658158302307, "rewards/margins": 1.9985623359680176, "rewards/rejected": -1.5568456649780273, "step": 128 }, { "epoch": 0.38906691800188503, "grad_norm": 3.3394720554351807, "learning_rate": 9.613293051359517e-07, "logits/chosen": -2.399834156036377, "logits/rejected": -2.3115453720092773, "logps/chosen": -54.25136184692383, "logps/rejected": -111.6705322265625, "loss": 0.5442, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4682779312133789, "rewards/margins": 2.624988079071045, "rewards/rejected": -2.156709671020508, "step": 129 }, { "epoch": 0.3920829406220547, "grad_norm": 3.3385941982269287, "learning_rate": 9.610271903323263e-07, "logits/chosen": -2.407618522644043, "logits/rejected": -2.3547213077545166, "logps/chosen": -52.97495651245117, "logps/rejected": -101.54936218261719, "loss": 0.6002, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3211698532104492, "rewards/margins": 2.1538426876068115, "rewards/rejected": -1.8326728343963623, "step": 130 }, { "epoch": 0.3950989632422243, "grad_norm": 3.0187158584594727, "learning_rate": 9.607250755287008e-07, "logits/chosen": -2.433638572692871, "logits/rejected": -2.3458149433135986, "logps/chosen": -46.749717712402344, "logps/rejected": -103.74430084228516, "loss": 0.5629, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6988454461097717, "rewards/margins": 2.60517954826355, "rewards/rejected": -1.9063341617584229, "step": 131 }, { "epoch": 0.39811498586239397, "grad_norm": 2.8365941047668457, "learning_rate": 9.604229607250755e-07, "logits/chosen": -2.3843164443969727, "logits/rejected": -2.3205275535583496, "logps/chosen": -43.20816421508789, "logps/rejected": -99.18968963623047, "loss": 0.5007, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0262800455093384, "rewards/margins": 2.97636342048645, "rewards/rejected": -1.950083613395691, "step": 132 }, { "epoch": 0.4011310084825636, "grad_norm": 3.7539408206939697, "learning_rate": 9.601208459214501e-07, "logits/chosen": -2.4711897373199463, "logits/rejected": -2.336076259613037, "logps/chosen": -46.432167053222656, "logps/rejected": -92.49319458007812, "loss": 0.6068, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9411394000053406, "rewards/margins": 2.2827515602111816, "rewards/rejected": -1.3416123390197754, "step": 133 }, { "epoch": 0.40414703110273326, "grad_norm": 3.2211766242980957, "learning_rate": 9.598187311178248e-07, "logits/chosen": -2.486934185028076, "logits/rejected": -2.35622239112854, "logps/chosen": -43.284210205078125, "logps/rejected": -99.23542022705078, "loss": 0.484, "rewards/accuracies": 1.0, "rewards/chosen": 0.9607818126678467, "rewards/margins": 2.907580852508545, "rewards/rejected": -1.9467989206314087, "step": 134 }, { "epoch": 0.40414703110273326, "eval_logits/chosen": -2.3462183475494385, "eval_logits/rejected": -2.289701223373413, "eval_logps/chosen": -51.815826416015625, "eval_logps/rejected": -106.7496566772461, "eval_loss": 0.5177380442619324, "eval_rewards/accuracies": 0.9716494679450989, "eval_rewards/chosen": 0.7588855028152466, "eval_rewards/margins": 2.8066885471343994, "eval_rewards/rejected": -2.047802686691284, "eval_runtime": 699.0284, "eval_samples_per_second": 0.554, "eval_steps_per_second": 0.278, "step": 134 }, { "epoch": 0.4071630537229029, "grad_norm": 2.8945586681365967, "learning_rate": 9.595166163141993e-07, "logits/chosen": -2.4125022888183594, "logits/rejected": -2.373410701751709, "logps/chosen": -55.002750396728516, "logps/rejected": -108.00238800048828, "loss": 0.5145, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7097357511520386, "rewards/margins": 2.7913334369659424, "rewards/rejected": -2.0815975666046143, "step": 135 }, { "epoch": 0.41017907634307255, "grad_norm": 3.2503156661987305, "learning_rate": 9.59214501510574e-07, "logits/chosen": -2.438366651535034, "logits/rejected": -2.3551950454711914, "logps/chosen": -43.08580780029297, "logps/rejected": -87.43081665039062, "loss": 0.6257, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9585327506065369, "rewards/margins": 2.13162899017334, "rewards/rejected": -1.1730962991714478, "step": 136 }, { "epoch": 0.41319509896324225, "grad_norm": 3.366487741470337, "learning_rate": 9.589123867069486e-07, "logits/chosen": -2.4088425636291504, "logits/rejected": -2.305723190307617, "logps/chosen": -47.95932388305664, "logps/rejected": -88.77972412109375, "loss": 0.5703, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8151925206184387, "rewards/margins": 2.42730975151062, "rewards/rejected": -1.6121171712875366, "step": 137 }, { "epoch": 0.4162111215834119, "grad_norm": 3.1258950233459473, "learning_rate": 9.586102719033233e-07, "logits/chosen": -2.4366936683654785, "logits/rejected": -2.38826584815979, "logps/chosen": -45.457672119140625, "logps/rejected": -107.68341827392578, "loss": 0.5144, "rewards/accuracies": 1.0, "rewards/chosen": 0.7303955554962158, "rewards/margins": 3.0167503356933594, "rewards/rejected": -2.2863545417785645, "step": 138 }, { "epoch": 0.41922714420358154, "grad_norm": 4.1397318840026855, "learning_rate": 9.583081570996977e-07, "logits/chosen": -2.3999521732330322, "logits/rejected": -2.2967116832733154, "logps/chosen": -45.49467086791992, "logps/rejected": -99.62410736083984, "loss": 0.5351, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8054246306419373, "rewards/margins": 2.6443309783935547, "rewards/rejected": -1.8389062881469727, "step": 139 }, { "epoch": 0.4222431668237512, "grad_norm": 2.920639991760254, "learning_rate": 9.580060422960726e-07, "logits/chosen": -2.3775758743286133, "logits/rejected": -2.3858542442321777, "logps/chosen": -54.12731170654297, "logps/rejected": -118.37203216552734, "loss": 0.4801, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5076711177825928, "rewards/margins": 3.320169687271118, "rewards/rejected": -2.8124985694885254, "step": 140 }, { "epoch": 0.4252591894439208, "grad_norm": 3.0556817054748535, "learning_rate": 9.57703927492447e-07, "logits/chosen": -2.4840822219848633, "logits/rejected": -2.420844316482544, "logps/chosen": -47.71942901611328, "logps/rejected": -95.11703491210938, "loss": 0.5357, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9232643842697144, "rewards/margins": 2.7516181468963623, "rewards/rejected": -1.8283535242080688, "step": 141 }, { "epoch": 0.42827521206409047, "grad_norm": 3.576336145401001, "learning_rate": 9.574018126888217e-07, "logits/chosen": -2.4726476669311523, "logits/rejected": -2.366382598876953, "logps/chosen": -44.456336975097656, "logps/rejected": -107.17925262451172, "loss": 0.5217, "rewards/accuracies": 1.0, "rewards/chosen": 0.9360989928245544, "rewards/margins": 3.0075464248657227, "rewards/rejected": -2.0714473724365234, "step": 142 }, { "epoch": 0.4312912346842601, "grad_norm": 2.8876328468322754, "learning_rate": 9.570996978851962e-07, "logits/chosen": -2.430821418762207, "logits/rejected": -2.373368978500366, "logps/chosen": -46.816856384277344, "logps/rejected": -109.01123046875, "loss": 0.451, "rewards/accuracies": 1.0, "rewards/chosen": 1.1665408611297607, "rewards/margins": 3.543220281600952, "rewards/rejected": -2.3766794204711914, "step": 143 }, { "epoch": 0.43430725730442976, "grad_norm": 3.157383441925049, "learning_rate": 9.56797583081571e-07, "logits/chosen": -2.4283783435821533, "logits/rejected": -2.341928243637085, "logps/chosen": -46.67904281616211, "logps/rejected": -107.7857894897461, "loss": 0.5065, "rewards/accuracies": 0.96875, "rewards/chosen": 1.061798095703125, "rewards/margins": 3.2583236694335938, "rewards/rejected": -2.1965255737304688, "step": 144 }, { "epoch": 0.43732327992459946, "grad_norm": 3.5322461128234863, "learning_rate": 9.564954682779455e-07, "logits/chosen": -2.4884989261627197, "logits/rejected": -2.452521324157715, "logps/chosen": -42.12776565551758, "logps/rejected": -95.7225570678711, "loss": 0.5065, "rewards/accuracies": 1.0, "rewards/chosen": 1.1209968328475952, "rewards/margins": 3.0145444869995117, "rewards/rejected": -1.893547534942627, "step": 145 }, { "epoch": 0.4403393025447691, "grad_norm": 3.0425126552581787, "learning_rate": 9.561933534743202e-07, "logits/chosen": -2.501253843307495, "logits/rejected": -2.4187629222869873, "logps/chosen": -47.32474899291992, "logps/rejected": -98.88484191894531, "loss": 0.5507, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9015798568725586, "rewards/margins": 2.7704453468322754, "rewards/rejected": -1.8688654899597168, "step": 146 }, { "epoch": 0.44335532516493875, "grad_norm": 2.717756509780884, "learning_rate": 9.558912386706947e-07, "logits/chosen": -2.4850387573242188, "logits/rejected": -2.380521297454834, "logps/chosen": -38.25093460083008, "logps/rejected": -108.55880737304688, "loss": 0.3999, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5106443166732788, "rewards/margins": 3.9453272819519043, "rewards/rejected": -2.434683084487915, "step": 147 }, { "epoch": 0.4463713477851084, "grad_norm": 2.8533811569213867, "learning_rate": 9.555891238670696e-07, "logits/chosen": -2.500243663787842, "logits/rejected": -2.347334861755371, "logps/chosen": -41.01979064941406, "logps/rejected": -100.00245666503906, "loss": 0.4715, "rewards/accuracies": 0.96875, "rewards/chosen": 1.136362075805664, "rewards/margins": 3.171830415725708, "rewards/rejected": -2.035468578338623, "step": 148 }, { "epoch": 0.44938737040527804, "grad_norm": 2.9460959434509277, "learning_rate": 9.55287009063444e-07, "logits/chosen": -2.5108442306518555, "logits/rejected": -2.380781650543213, "logps/chosen": -48.18000411987305, "logps/rejected": -104.49465942382812, "loss": 0.5159, "rewards/accuracies": 0.96875, "rewards/chosen": 1.057487964630127, "rewards/margins": 3.0496649742126465, "rewards/rejected": -1.9921767711639404, "step": 149 }, { "epoch": 0.4524033930254477, "grad_norm": 3.7788751125335693, "learning_rate": 9.549848942598187e-07, "logits/chosen": -2.465268611907959, "logits/rejected": -2.4488425254821777, "logps/chosen": -41.435394287109375, "logps/rejected": -96.38487243652344, "loss": 0.5315, "rewards/accuracies": 1.0, "rewards/chosen": 1.2514019012451172, "rewards/margins": 3.1160950660705566, "rewards/rejected": -1.8646928071975708, "step": 150 }, { "epoch": 0.45541941564561733, "grad_norm": 3.352653980255127, "learning_rate": 9.546827794561934e-07, "logits/chosen": -2.411245822906494, "logits/rejected": -2.3888721466064453, "logps/chosen": -42.86935043334961, "logps/rejected": -104.19487762451172, "loss": 0.5003, "rewards/accuracies": 1.0, "rewards/chosen": 1.1224102973937988, "rewards/margins": 3.2327561378479004, "rewards/rejected": -2.1103458404541016, "step": 151 }, { "epoch": 0.458435438265787, "grad_norm": 2.675901174545288, "learning_rate": 9.54380664652568e-07, "logits/chosen": -2.570946455001831, "logits/rejected": -2.5275285243988037, "logps/chosen": -51.159847259521484, "logps/rejected": -113.65765380859375, "loss": 0.4918, "rewards/accuracies": 1.0, "rewards/chosen": 0.9454026222229004, "rewards/margins": 3.3354265689849854, "rewards/rejected": -2.390023946762085, "step": 152 }, { "epoch": 0.4614514608859566, "grad_norm": 3.26721453666687, "learning_rate": 9.540785498489425e-07, "logits/chosen": -2.444430351257324, "logits/rejected": -2.3740570545196533, "logps/chosen": -47.954627990722656, "logps/rejected": -109.1871109008789, "loss": 0.4945, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9491790533065796, "rewards/margins": 3.244446277618408, "rewards/rejected": -2.295267105102539, "step": 153 }, { "epoch": 0.4644674835061263, "grad_norm": 3.0409932136535645, "learning_rate": 9.537764350453172e-07, "logits/chosen": -2.491486072540283, "logits/rejected": -2.468309164047241, "logps/chosen": -53.23114776611328, "logps/rejected": -103.76690673828125, "loss": 0.5333, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0606061220169067, "rewards/margins": 3.0199012756347656, "rewards/rejected": -1.9592949151992798, "step": 154 }, { "epoch": 0.46748350612629597, "grad_norm": 2.8308212757110596, "learning_rate": 9.534743202416918e-07, "logits/chosen": -2.4708778858184814, "logits/rejected": -2.3897531032562256, "logps/chosen": -51.073883056640625, "logps/rejected": -105.58587646484375, "loss": 0.5327, "rewards/accuracies": 1.0, "rewards/chosen": 0.8508787751197815, "rewards/margins": 3.098712921142578, "rewards/rejected": -2.2478344440460205, "step": 155 }, { "epoch": 0.4704995287464656, "grad_norm": 2.886406660079956, "learning_rate": 9.531722054380664e-07, "logits/chosen": -2.4929769039154053, "logits/rejected": -2.3712172508239746, "logps/chosen": -45.730350494384766, "logps/rejected": -102.17105865478516, "loss": 0.4767, "rewards/accuracies": 1.0, "rewards/chosen": 1.0949374437332153, "rewards/margins": 3.2826714515686035, "rewards/rejected": -2.1877341270446777, "step": 156 }, { "epoch": 0.47351555136663526, "grad_norm": 3.226893186569214, "learning_rate": 9.528700906344411e-07, "logits/chosen": -2.4280951023101807, "logits/rejected": -2.3605728149414062, "logps/chosen": -41.06540298461914, "logps/rejected": -101.1469497680664, "loss": 0.44, "rewards/accuracies": 0.9375, "rewards/chosen": 1.425709843635559, "rewards/margins": 3.4719643592834473, "rewards/rejected": -2.0462543964385986, "step": 157 }, { "epoch": 0.4765315739868049, "grad_norm": 2.9261481761932373, "learning_rate": 9.525679758308157e-07, "logits/chosen": -2.4820215702056885, "logits/rejected": -2.4482932090759277, "logps/chosen": -45.636802673339844, "logps/rejected": -108.00924682617188, "loss": 0.4717, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9977685809135437, "rewards/margins": 3.4875214099884033, "rewards/rejected": -2.489752769470215, "step": 158 }, { "epoch": 0.47954759660697455, "grad_norm": 3.185485601425171, "learning_rate": 9.522658610271903e-07, "logits/chosen": -2.504692792892456, "logits/rejected": -2.4654762744903564, "logps/chosen": -47.52208709716797, "logps/rejected": -103.66485595703125, "loss": 0.5146, "rewards/accuracies": 1.0, "rewards/chosen": 1.1185333728790283, "rewards/margins": 3.065340042114258, "rewards/rejected": -1.946806788444519, "step": 159 }, { "epoch": 0.4825636192271442, "grad_norm": 3.2019269466400146, "learning_rate": 9.519637462235649e-07, "logits/chosen": -2.453336238861084, "logits/rejected": -2.3655736446380615, "logps/chosen": -41.131927490234375, "logps/rejected": -105.68437957763672, "loss": 0.4216, "rewards/accuracies": 0.90625, "rewards/chosen": 1.464385986328125, "rewards/margins": 3.6301326751708984, "rewards/rejected": -2.1657466888427734, "step": 160 }, { "epoch": 0.48557964184731384, "grad_norm": 2.5315115451812744, "learning_rate": 9.516616314199395e-07, "logits/chosen": -2.471682548522949, "logits/rejected": -2.430450439453125, "logps/chosen": -41.69236755371094, "logps/rejected": -108.7340087890625, "loss": 0.389, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5401560068130493, "rewards/margins": 3.934865951538086, "rewards/rejected": -2.394709587097168, "step": 161 }, { "epoch": 0.4885956644674835, "grad_norm": 3.230886220932007, "learning_rate": 9.513595166163142e-07, "logits/chosen": -2.4212896823883057, "logits/rejected": -2.406191349029541, "logps/chosen": -42.35012435913086, "logps/rejected": -105.93392181396484, "loss": 0.4345, "rewards/accuracies": 1.0, "rewards/chosen": 1.3938714265823364, "rewards/margins": 3.7294986248016357, "rewards/rejected": -2.335627317428589, "step": 162 }, { "epoch": 0.4916116870876532, "grad_norm": 3.1707656383514404, "learning_rate": 9.510574018126888e-07, "logits/chosen": -2.4515676498413086, "logits/rejected": -2.3654489517211914, "logps/chosen": -43.2747917175293, "logps/rejected": -114.25218200683594, "loss": 0.3972, "rewards/accuracies": 1.0, "rewards/chosen": 1.340664029121399, "rewards/margins": 4.189483165740967, "rewards/rejected": -2.8488192558288574, "step": 163 }, { "epoch": 0.4946277097078228, "grad_norm": 3.0487866401672363, "learning_rate": 9.507552870090633e-07, "logits/chosen": -2.383202075958252, "logits/rejected": -2.373002767562866, "logps/chosen": -51.087459564208984, "logps/rejected": -107.933837890625, "loss": 0.4465, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0060715675354004, "rewards/margins": 3.700961112976074, "rewards/rejected": -2.694890022277832, "step": 164 }, { "epoch": 0.49764373232799247, "grad_norm": 2.5185070037841797, "learning_rate": 9.50453172205438e-07, "logits/chosen": -2.489682197570801, "logits/rejected": -2.479102373123169, "logps/chosen": -48.45993423461914, "logps/rejected": -107.07740020751953, "loss": 0.523, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9523181915283203, "rewards/margins": 3.31866455078125, "rewards/rejected": -2.366346836090088, "step": 165 }, { "epoch": 0.5006597549481621, "grad_norm": 2.8383677005767822, "learning_rate": 9.501510574018127e-07, "logits/chosen": -2.412262439727783, "logits/rejected": -2.397218704223633, "logps/chosen": -44.192256927490234, "logps/rejected": -107.91879272460938, "loss": 0.4069, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5941723585128784, "rewards/margins": 3.9439404010772705, "rewards/rejected": -2.3497679233551025, "step": 166 }, { "epoch": 0.5036757775683318, "grad_norm": 3.2368154525756836, "learning_rate": 9.498489425981872e-07, "logits/chosen": -2.494245767593384, "logits/rejected": -2.4164395332336426, "logps/chosen": -43.32606887817383, "logps/rejected": -107.0510025024414, "loss": 0.4371, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3780171871185303, "rewards/margins": 3.7125890254974365, "rewards/rejected": -2.334571599960327, "step": 167 }, { "epoch": 0.5066918001885015, "grad_norm": 2.4795756340026855, "learning_rate": 9.495468277945618e-07, "logits/chosen": -2.558892250061035, "logits/rejected": -2.449047327041626, "logps/chosen": -41.5377082824707, "logps/rejected": -106.25282287597656, "loss": 0.4778, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2756589651107788, "rewards/margins": 3.8087430000305176, "rewards/rejected": -2.533083915710449, "step": 168 }, { "epoch": 0.5097078228086711, "grad_norm": 3.0371768474578857, "learning_rate": 9.492447129909366e-07, "logits/chosen": -2.625248432159424, "logits/rejected": -2.4507904052734375, "logps/chosen": -38.73438262939453, "logps/rejected": -97.71344757080078, "loss": 0.5039, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6785345077514648, "rewards/margins": 3.7040154933929443, "rewards/rejected": -2.0254809856414795, "step": 169 }, { "epoch": 0.5127238454288408, "grad_norm": 3.1497883796691895, "learning_rate": 9.489425981873111e-07, "logits/chosen": -2.4273760318756104, "logits/rejected": -2.3670473098754883, "logps/chosen": -45.11663055419922, "logps/rejected": -103.30660247802734, "loss": 0.4871, "rewards/accuracies": 1.0, "rewards/chosen": 1.5786757469177246, "rewards/margins": 3.4950828552246094, "rewards/rejected": -1.9164074659347534, "step": 170 }, { "epoch": 0.5157398680490104, "grad_norm": 2.569179058074951, "learning_rate": 9.486404833836857e-07, "logits/chosen": -2.528972625732422, "logits/rejected": -2.476407527923584, "logps/chosen": -40.597633361816406, "logps/rejected": -117.24687957763672, "loss": 0.4017, "rewards/accuracies": 1.0, "rewards/chosen": 1.6059541702270508, "rewards/margins": 4.201093673706055, "rewards/rejected": -2.595139265060425, "step": 171 }, { "epoch": 0.51875589066918, "grad_norm": 3.298476219177246, "learning_rate": 9.483383685800604e-07, "logits/chosen": -2.5101094245910645, "logits/rejected": -2.46801495552063, "logps/chosen": -38.84668731689453, "logps/rejected": -100.85050964355469, "loss": 0.4641, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6255462169647217, "rewards/margins": 3.7474749088287354, "rewards/rejected": -2.1219286918640137, "step": 172 }, { "epoch": 0.5217719132893497, "grad_norm": 2.740663766860962, "learning_rate": 9.48036253776435e-07, "logits/chosen": -2.5557868480682373, "logits/rejected": -2.5482282638549805, "logps/chosen": -38.40345001220703, "logps/rejected": -113.72470092773438, "loss": 0.3813, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8201353549957275, "rewards/margins": 4.482444763183594, "rewards/rejected": -2.6623098850250244, "step": 173 }, { "epoch": 0.5247879359095193, "grad_norm": 2.6578476428985596, "learning_rate": 9.477341389728096e-07, "logits/chosen": -2.394879102706909, "logits/rejected": -2.3738369941711426, "logps/chosen": -39.594730377197266, "logps/rejected": -99.6746826171875, "loss": 0.4672, "rewards/accuracies": 0.96875, "rewards/chosen": 1.622254490852356, "rewards/margins": 3.7094287872314453, "rewards/rejected": -2.0871741771698, "step": 174 }, { "epoch": 0.527803958529689, "grad_norm": 4.641773700714111, "learning_rate": 9.474320241691842e-07, "logits/chosen": -2.551459312438965, "logits/rejected": -2.5326833724975586, "logps/chosen": -40.32202911376953, "logps/rejected": -96.36719512939453, "loss": 0.4513, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7307772636413574, "rewards/margins": 3.79290509223938, "rewards/rejected": -2.0621280670166016, "step": 175 }, { "epoch": 0.5308199811498586, "grad_norm": 2.73991322517395, "learning_rate": 9.47129909365559e-07, "logits/chosen": -2.473818063735962, "logits/rejected": -2.476036548614502, "logps/chosen": -44.080833435058594, "logps/rejected": -107.69650268554688, "loss": 0.4712, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2420481443405151, "rewards/margins": 3.7099921703338623, "rewards/rejected": -2.467944383621216, "step": 176 }, { "epoch": 0.5338360037700283, "grad_norm": 3.0487911701202393, "learning_rate": 9.468277945619335e-07, "logits/chosen": -2.509071111679077, "logits/rejected": -2.493241786956787, "logps/chosen": -43.95523452758789, "logps/rejected": -107.52015686035156, "loss": 0.4506, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4583337306976318, "rewards/margins": 4.211301803588867, "rewards/rejected": -2.7529680728912354, "step": 177 }, { "epoch": 0.5368520263901979, "grad_norm": 3.19305682182312, "learning_rate": 9.465256797583081e-07, "logits/chosen": -2.596350908279419, "logits/rejected": -2.4609453678131104, "logps/chosen": -45.365413665771484, "logps/rejected": -114.09317779541016, "loss": 0.4735, "rewards/accuracies": 1.0, "rewards/chosen": 1.206078290939331, "rewards/margins": 4.047790050506592, "rewards/rejected": -2.8417115211486816, "step": 178 }, { "epoch": 0.5398680490103676, "grad_norm": 2.9536051750183105, "learning_rate": 9.462235649546827e-07, "logits/chosen": -2.5648868083953857, "logits/rejected": -2.4730498790740967, "logps/chosen": -36.005374908447266, "logps/rejected": -114.17948150634766, "loss": 0.3566, "rewards/accuracies": 1.0, "rewards/chosen": 1.848429560661316, "rewards/margins": 4.644636154174805, "rewards/rejected": -2.7962067127227783, "step": 179 }, { "epoch": 0.5428840716305372, "grad_norm": 3.0292370319366455, "learning_rate": 9.459214501510574e-07, "logits/chosen": -2.450333595275879, "logits/rejected": -2.421992540359497, "logps/chosen": -42.926605224609375, "logps/rejected": -110.36381530761719, "loss": 0.4326, "rewards/accuracies": 1.0, "rewards/chosen": 1.2996759414672852, "rewards/margins": 4.044602394104004, "rewards/rejected": -2.7449262142181396, "step": 180 }, { "epoch": 0.5459000942507068, "grad_norm": 4.027276515960693, "learning_rate": 9.45619335347432e-07, "logits/chosen": -2.4857282638549805, "logits/rejected": -2.4616804122924805, "logps/chosen": -44.17033386230469, "logps/rejected": -102.02568054199219, "loss": 0.4788, "rewards/accuracies": 1.0, "rewards/chosen": 1.5687494277954102, "rewards/margins": 3.801888942718506, "rewards/rejected": -2.2331395149230957, "step": 181 }, { "epoch": 0.5489161168708765, "grad_norm": 2.751026153564453, "learning_rate": 9.453172205438066e-07, "logits/chosen": -2.3872833251953125, "logits/rejected": -2.4980804920196533, "logps/chosen": -47.877323150634766, "logps/rejected": -113.20404815673828, "loss": 0.4465, "rewards/accuracies": 1.0, "rewards/chosen": 1.0844634771347046, "rewards/margins": 4.065066337585449, "rewards/rejected": -2.9806032180786133, "step": 182 }, { "epoch": 0.5519321394910461, "grad_norm": 2.8381264209747314, "learning_rate": 9.450151057401811e-07, "logits/chosen": -2.4985737800598145, "logits/rejected": -2.542813777923584, "logps/chosen": -36.570064544677734, "logps/rejected": -114.79473876953125, "loss": 0.3937, "rewards/accuracies": 1.0, "rewards/chosen": 1.844085931777954, "rewards/margins": 4.85472297668457, "rewards/rejected": -3.010636806488037, "step": 183 }, { "epoch": 0.5549481621112158, "grad_norm": 3.111541509628296, "learning_rate": 9.447129909365559e-07, "logits/chosen": -2.6185097694396973, "logits/rejected": -2.4865152835845947, "logps/chosen": -43.62076950073242, "logps/rejected": -96.4963150024414, "loss": 0.5465, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1520557403564453, "rewards/margins": 3.397226333618164, "rewards/rejected": -2.2451703548431396, "step": 184 }, { "epoch": 0.5579641847313855, "grad_norm": 4.228057384490967, "learning_rate": 9.444108761329305e-07, "logits/chosen": -2.4929261207580566, "logits/rejected": -2.437589645385742, "logps/chosen": -53.58972930908203, "logps/rejected": -117.24298095703125, "loss": 0.5327, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6708846092224121, "rewards/margins": 3.8305180072784424, "rewards/rejected": -3.1596333980560303, "step": 185 }, { "epoch": 0.5609802073515552, "grad_norm": 3.5287318229675293, "learning_rate": 9.44108761329305e-07, "logits/chosen": -2.5920920372009277, "logits/rejected": -2.5298116207122803, "logps/chosen": -43.10822296142578, "logps/rejected": -108.42880249023438, "loss": 0.4441, "rewards/accuracies": 1.0, "rewards/chosen": 1.605562686920166, "rewards/margins": 4.308238983154297, "rewards/rejected": -2.702676296234131, "step": 186 }, { "epoch": 0.5639962299717248, "grad_norm": 3.6963717937469482, "learning_rate": 9.438066465256798e-07, "logits/chosen": -2.4724624156951904, "logits/rejected": -2.439478874206543, "logps/chosen": -47.05333709716797, "logps/rejected": -128.06170654296875, "loss": 0.4267, "rewards/accuracies": 1.0, "rewards/chosen": 1.0649569034576416, "rewards/margins": 4.831277847290039, "rewards/rejected": -3.7663214206695557, "step": 187 }, { "epoch": 0.5670122525918945, "grad_norm": 3.7042627334594727, "learning_rate": 9.435045317220544e-07, "logits/chosen": -2.528198003768921, "logits/rejected": -2.4471139907836914, "logps/chosen": -44.76411437988281, "logps/rejected": -122.27941131591797, "loss": 0.4289, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1784056425094604, "rewards/margins": 4.4914231300354, "rewards/rejected": -3.3130173683166504, "step": 188 }, { "epoch": 0.5700282752120641, "grad_norm": 2.813136339187622, "learning_rate": 9.432024169184289e-07, "logits/chosen": -2.5065114498138428, "logits/rejected": -2.4803357124328613, "logps/chosen": -34.106781005859375, "logps/rejected": -109.144775390625, "loss": 0.3901, "rewards/accuracies": 1.0, "rewards/chosen": 2.0575740337371826, "rewards/margins": 4.6773200035095215, "rewards/rejected": -2.6197457313537598, "step": 189 }, { "epoch": 0.5730442978322338, "grad_norm": 2.5253374576568604, "learning_rate": 9.429003021148035e-07, "logits/chosen": -2.59246826171875, "logits/rejected": -2.578223466873169, "logps/chosen": -39.89678955078125, "logps/rejected": -121.27310180664062, "loss": 0.3595, "rewards/accuracies": 0.96875, "rewards/chosen": 1.767324686050415, "rewards/margins": 5.274317741394043, "rewards/rejected": -3.506993293762207, "step": 190 }, { "epoch": 0.5760603204524034, "grad_norm": 3.3022563457489014, "learning_rate": 9.425981873111783e-07, "logits/chosen": -2.5145890712738037, "logits/rejected": -2.5075860023498535, "logps/chosen": -38.64506530761719, "logps/rejected": -108.06781005859375, "loss": 0.4471, "rewards/accuracies": 0.9375, "rewards/chosen": 1.487909197807312, "rewards/margins": 4.370731830596924, "rewards/rejected": -2.8828227519989014, "step": 191 }, { "epoch": 0.579076343072573, "grad_norm": 3.55273175239563, "learning_rate": 9.422960725075528e-07, "logits/chosen": -2.5053658485412598, "logits/rejected": -2.4451355934143066, "logps/chosen": -40.56679916381836, "logps/rejected": -114.86473083496094, "loss": 0.344, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5682069063186646, "rewards/margins": 4.845963001251221, "rewards/rejected": -3.2777559757232666, "step": 192 }, { "epoch": 0.5820923656927427, "grad_norm": 2.1104962825775146, "learning_rate": 9.419939577039274e-07, "logits/chosen": -2.5640060901641846, "logits/rejected": -2.5431745052337646, "logps/chosen": -45.92088317871094, "logps/rejected": -113.9984359741211, "loss": 0.5321, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2570180892944336, "rewards/margins": 4.364346504211426, "rewards/rejected": -3.107328176498413, "step": 193 }, { "epoch": 0.5851083883129123, "grad_norm": 3.604384660720825, "learning_rate": 9.416918429003021e-07, "logits/chosen": -2.5753297805786133, "logits/rejected": -2.511643886566162, "logps/chosen": -43.294410705566406, "logps/rejected": -96.47563171386719, "loss": 0.4821, "rewards/accuracies": 0.90625, "rewards/chosen": 1.435579776763916, "rewards/margins": 3.497666835784912, "rewards/rejected": -2.062087059020996, "step": 194 }, { "epoch": 0.588124410933082, "grad_norm": 2.859342575073242, "learning_rate": 9.413897280966767e-07, "logits/chosen": -2.4420089721679688, "logits/rejected": -2.486297607421875, "logps/chosen": -39.58531951904297, "logps/rejected": -119.28495025634766, "loss": 0.3802, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7735278606414795, "rewards/margins": 4.894996166229248, "rewards/rejected": -3.1214680671691895, "step": 195 }, { "epoch": 0.5911404335532516, "grad_norm": 3.3549516201019287, "learning_rate": 9.410876132930513e-07, "logits/chosen": -2.543569326400757, "logits/rejected": -2.560467004776001, "logps/chosen": -37.090850830078125, "logps/rejected": -113.89998626708984, "loss": 0.3551, "rewards/accuracies": 0.96875, "rewards/chosen": 1.882619857788086, "rewards/margins": 4.993287086486816, "rewards/rejected": -3.1106672286987305, "step": 196 }, { "epoch": 0.5941564561734213, "grad_norm": 3.1135528087615967, "learning_rate": 9.407854984894259e-07, "logits/chosen": -2.569265604019165, "logits/rejected": -2.5644798278808594, "logps/chosen": -40.7438850402832, "logps/rejected": -108.18468475341797, "loss": 0.4827, "rewards/accuracies": 0.9375, "rewards/chosen": 1.379897117614746, "rewards/margins": 4.3375959396362305, "rewards/rejected": -2.9576988220214844, "step": 197 }, { "epoch": 0.5971724787935909, "grad_norm": 3.195584774017334, "learning_rate": 9.404833836858007e-07, "logits/chosen": -2.6019363403320312, "logits/rejected": -2.5627987384796143, "logps/chosen": -48.53130340576172, "logps/rejected": -118.37577819824219, "loss": 0.4234, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3805427551269531, "rewards/margins": 4.596936225891113, "rewards/rejected": -3.21639347076416, "step": 198 }, { "epoch": 0.6001885014137606, "grad_norm": 3.1400671005249023, "learning_rate": 9.401812688821752e-07, "logits/chosen": -2.600717067718506, "logits/rejected": -2.54109525680542, "logps/chosen": -34.611534118652344, "logps/rejected": -117.01653289794922, "loss": 0.3662, "rewards/accuracies": 1.0, "rewards/chosen": 2.0948171615600586, "rewards/margins": 5.051046848297119, "rewards/rejected": -2.9562296867370605, "step": 199 }, { "epoch": 0.6032045240339302, "grad_norm": 2.842038154602051, "learning_rate": 9.398791540785498e-07, "logits/chosen": -2.5685036182403564, "logits/rejected": -2.5538830757141113, "logps/chosen": -43.53082275390625, "logps/rejected": -120.28840637207031, "loss": 0.4551, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4135371446609497, "rewards/margins": 4.677258014678955, "rewards/rejected": -3.263720750808716, "step": 200 }, { "epoch": 0.6062205466540999, "grad_norm": 2.8361129760742188, "learning_rate": 9.395770392749243e-07, "logits/chosen": -2.480046510696411, "logits/rejected": -2.4787354469299316, "logps/chosen": -44.396385192871094, "logps/rejected": -111.39060974121094, "loss": 0.4904, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6001253128051758, "rewards/margins": 4.254070281982422, "rewards/rejected": -2.653944730758667, "step": 201 }, { "epoch": 0.6062205466540999, "eval_logits/chosen": -2.4587020874023438, "eval_logits/rejected": -2.447326898574829, "eval_logps/chosen": -41.46421432495117, "eval_logps/rejected": -116.4273910522461, "eval_loss": 0.3714239299297333, "eval_rewards/accuracies": 0.9742268323898315, "eval_rewards/chosen": 1.794047236442566, "eval_rewards/margins": 4.809622764587402, "eval_rewards/rejected": -3.015575647354126, "eval_runtime": 695.728, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.279, "step": 201 }, { "epoch": 0.6092365692742695, "grad_norm": 2.72497296333313, "learning_rate": 9.392749244712991e-07, "logits/chosen": -2.5545237064361572, "logits/rejected": -2.4591872692108154, "logps/chosen": -41.36058044433594, "logps/rejected": -115.9559555053711, "loss": 0.4441, "rewards/accuracies": 0.9375, "rewards/chosen": 1.765066146850586, "rewards/margins": 4.654053688049316, "rewards/rejected": -2.8889873027801514, "step": 202 }, { "epoch": 0.6122525918944393, "grad_norm": 3.672527551651001, "learning_rate": 9.389728096676737e-07, "logits/chosen": -2.557554006576538, "logits/rejected": -2.5814192295074463, "logps/chosen": -30.13414192199707, "logps/rejected": -101.52084350585938, "loss": 0.404, "rewards/accuracies": 1.0, "rewards/chosen": 2.256441593170166, "rewards/margins": 4.667902946472168, "rewards/rejected": -2.41146183013916, "step": 203 }, { "epoch": 0.6152686145146089, "grad_norm": 5.102949619293213, "learning_rate": 9.386706948640483e-07, "logits/chosen": -2.5270469188690186, "logits/rejected": -2.5078041553497314, "logps/chosen": -39.220909118652344, "logps/rejected": -100.90287780761719, "loss": 0.5026, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7768185138702393, "rewards/margins": 3.9804043769836426, "rewards/rejected": -2.2035858631134033, "step": 204 }, { "epoch": 0.6182846371347785, "grad_norm": 3.228632926940918, "learning_rate": 9.383685800604229e-07, "logits/chosen": -2.4692537784576416, "logits/rejected": -2.4839539527893066, "logps/chosen": -32.259735107421875, "logps/rejected": -114.87855529785156, "loss": 0.3012, "rewards/accuracies": 0.96875, "rewards/chosen": 2.349250555038452, "rewards/margins": 5.31973934173584, "rewards/rejected": -2.9704885482788086, "step": 205 }, { "epoch": 0.6213006597549482, "grad_norm": 2.660219669342041, "learning_rate": 9.380664652567976e-07, "logits/chosen": -2.6417996883392334, "logits/rejected": -2.492039918899536, "logps/chosen": -36.417911529541016, "logps/rejected": -115.51174926757812, "loss": 0.3572, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6556541919708252, "rewards/margins": 5.072952747344971, "rewards/rejected": -3.4172987937927246, "step": 206 }, { "epoch": 0.6243166823751178, "grad_norm": 2.969820976257324, "learning_rate": 9.377643504531722e-07, "logits/chosen": -2.5599656105041504, "logits/rejected": -2.539199113845825, "logps/chosen": -41.16075134277344, "logps/rejected": -99.18949127197266, "loss": 0.5081, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4302526712417603, "rewards/margins": 3.6164474487304688, "rewards/rejected": -2.186194896697998, "step": 207 }, { "epoch": 0.6273327049952875, "grad_norm": 2.892348051071167, "learning_rate": 9.374622356495467e-07, "logits/chosen": -2.59818172454834, "logits/rejected": -2.521881341934204, "logps/chosen": -35.64634704589844, "logps/rejected": -100.41941833496094, "loss": 0.4538, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8788514137268066, "rewards/margins": 4.487547874450684, "rewards/rejected": -2.608696222305298, "step": 208 }, { "epoch": 0.6303487276154571, "grad_norm": 3.85239577293396, "learning_rate": 9.371601208459214e-07, "logits/chosen": -2.4886929988861084, "logits/rejected": -2.4921436309814453, "logps/chosen": -49.0555419921875, "logps/rejected": -125.49264526367188, "loss": 0.4567, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0974922180175781, "rewards/margins": 4.746394157409668, "rewards/rejected": -3.6489017009735107, "step": 209 }, { "epoch": 0.6333647502356268, "grad_norm": 3.1301214694976807, "learning_rate": 9.368580060422961e-07, "logits/chosen": -2.723863363265991, "logits/rejected": -2.6549947261810303, "logps/chosen": -33.420562744140625, "logps/rejected": -100.75640106201172, "loss": 0.4706, "rewards/accuracies": 1.0, "rewards/chosen": 2.1280367374420166, "rewards/margins": 4.277506351470947, "rewards/rejected": -2.1494698524475098, "step": 210 }, { "epoch": 0.6363807728557964, "grad_norm": 3.213770627975464, "learning_rate": 9.365558912386706e-07, "logits/chosen": -2.5833592414855957, "logits/rejected": -2.5561084747314453, "logps/chosen": -39.86476135253906, "logps/rejected": -111.26737976074219, "loss": 0.4345, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6711885929107666, "rewards/margins": 4.6933274269104, "rewards/rejected": -3.0221385955810547, "step": 211 }, { "epoch": 0.6393967954759661, "grad_norm": 3.1153016090393066, "learning_rate": 9.362537764350453e-07, "logits/chosen": -2.539299249649048, "logits/rejected": -2.455317735671997, "logps/chosen": -33.95166778564453, "logps/rejected": -115.59747314453125, "loss": 0.3505, "rewards/accuracies": 0.96875, "rewards/chosen": 2.086231231689453, "rewards/margins": 5.144569396972656, "rewards/rejected": -3.058338165283203, "step": 212 }, { "epoch": 0.6424128180961357, "grad_norm": 4.045174598693848, "learning_rate": 9.3595166163142e-07, "logits/chosen": -2.6256237030029297, "logits/rejected": -2.6153364181518555, "logps/chosen": -34.748558044433594, "logps/rejected": -108.75479888916016, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 1.8345978260040283, "rewards/margins": 4.899069786071777, "rewards/rejected": -3.064471960067749, "step": 213 }, { "epoch": 0.6454288407163054, "grad_norm": 2.883575916290283, "learning_rate": 9.356495468277945e-07, "logits/chosen": -2.5392794609069824, "logits/rejected": -2.510118246078491, "logps/chosen": -34.21550369262695, "logps/rejected": -118.05947875976562, "loss": 0.3286, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1137540340423584, "rewards/margins": 5.67784309387207, "rewards/rejected": -3.564089298248291, "step": 214 }, { "epoch": 0.648444863336475, "grad_norm": 2.9538323879241943, "learning_rate": 9.353474320241691e-07, "logits/chosen": -2.5055723190307617, "logits/rejected": -2.571354627609253, "logps/chosen": -38.52871322631836, "logps/rejected": -125.37238311767578, "loss": 0.3591, "rewards/accuracies": 0.96875, "rewards/chosen": 1.982905387878418, "rewards/margins": 5.486116409301758, "rewards/rejected": -3.50321102142334, "step": 215 }, { "epoch": 0.6514608859566446, "grad_norm": 2.3118367195129395, "learning_rate": 9.350453172205438e-07, "logits/chosen": -2.523685932159424, "logits/rejected": -2.5439326763153076, "logps/chosen": -35.64182662963867, "logps/rejected": -122.46370697021484, "loss": 0.3575, "rewards/accuracies": 1.0, "rewards/chosen": 2.0632736682891846, "rewards/margins": 5.911718845367432, "rewards/rejected": -3.848444938659668, "step": 216 }, { "epoch": 0.6544769085768143, "grad_norm": 3.458827018737793, "learning_rate": 9.347432024169184e-07, "logits/chosen": -2.605689287185669, "logits/rejected": -2.6558425426483154, "logps/chosen": -42.8387451171875, "logps/rejected": -127.13153076171875, "loss": 0.398, "rewards/accuracies": 1.0, "rewards/chosen": 1.3147345781326294, "rewards/margins": 5.499625205993652, "rewards/rejected": -4.184891700744629, "step": 217 }, { "epoch": 0.6574929311969839, "grad_norm": 4.24025821685791, "learning_rate": 9.34441087613293e-07, "logits/chosen": -2.5257070064544678, "logits/rejected": -2.5608949661254883, "logps/chosen": -45.01060104370117, "logps/rejected": -124.1464614868164, "loss": 0.4669, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3069195747375488, "rewards/margins": 4.921163082122803, "rewards/rejected": -3.614243507385254, "step": 218 }, { "epoch": 0.6605089538171536, "grad_norm": 2.45644474029541, "learning_rate": 9.341389728096676e-07, "logits/chosen": -2.6214380264282227, "logits/rejected": -2.5460987091064453, "logps/chosen": -39.72566604614258, "logps/rejected": -122.95550537109375, "loss": 0.3971, "rewards/accuracies": 1.0, "rewards/chosen": 1.8456761837005615, "rewards/margins": 5.551750659942627, "rewards/rejected": -3.7060742378234863, "step": 219 }, { "epoch": 0.6635249764373233, "grad_norm": 2.668877363204956, "learning_rate": 9.338368580060422e-07, "logits/chosen": -2.600541114807129, "logits/rejected": -2.62526273727417, "logps/chosen": -46.30439758300781, "logps/rejected": -128.52171325683594, "loss": 0.4058, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3817323446273804, "rewards/margins": 5.712912559509277, "rewards/rejected": -4.331180095672607, "step": 220 }, { "epoch": 0.666540999057493, "grad_norm": 3.4571309089660645, "learning_rate": 9.335347432024169e-07, "logits/chosen": -2.6293575763702393, "logits/rejected": -2.6295526027679443, "logps/chosen": -45.162471771240234, "logps/rejected": -107.9952621459961, "loss": 0.5621, "rewards/accuracies": 0.875, "rewards/chosen": 1.2251129150390625, "rewards/margins": 3.8903064727783203, "rewards/rejected": -2.665194034576416, "step": 221 }, { "epoch": 0.6695570216776626, "grad_norm": 3.955749273300171, "learning_rate": 9.332326283987915e-07, "logits/chosen": -2.5594310760498047, "logits/rejected": -2.5151758193969727, "logps/chosen": -38.92884063720703, "logps/rejected": -113.77078247070312, "loss": 0.4541, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8062596321105957, "rewards/margins": 4.749693393707275, "rewards/rejected": -2.9434337615966797, "step": 222 }, { "epoch": 0.6725730442978323, "grad_norm": 3.507124900817871, "learning_rate": 9.329305135951662e-07, "logits/chosen": -2.587022542953491, "logits/rejected": -2.5507874488830566, "logps/chosen": -45.8718376159668, "logps/rejected": -122.17427825927734, "loss": 0.4711, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1252496242523193, "rewards/margins": 4.657262325286865, "rewards/rejected": -3.532012462615967, "step": 223 }, { "epoch": 0.6755890669180019, "grad_norm": 4.290508270263672, "learning_rate": 9.326283987915407e-07, "logits/chosen": -2.5388906002044678, "logits/rejected": -2.5558817386627197, "logps/chosen": -44.745811462402344, "logps/rejected": -126.95327758789062, "loss": 0.3892, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2931606769561768, "rewards/margins": 5.321786403656006, "rewards/rejected": -4.028625965118408, "step": 224 }, { "epoch": 0.6786050895381716, "grad_norm": 3.5446488857269287, "learning_rate": 9.323262839879154e-07, "logits/chosen": -2.5871896743774414, "logits/rejected": -2.51324200630188, "logps/chosen": -46.21542739868164, "logps/rejected": -121.04305267333984, "loss": 0.4038, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4898450374603271, "rewards/margins": 5.239337921142578, "rewards/rejected": -3.74949312210083, "step": 225 }, { "epoch": 0.6816211121583412, "grad_norm": 3.225987195968628, "learning_rate": 9.3202416918429e-07, "logits/chosen": -2.5909602642059326, "logits/rejected": -2.5612897872924805, "logps/chosen": -39.541259765625, "logps/rejected": -132.66867065429688, "loss": 0.366, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6090188026428223, "rewards/margins": 5.809476852416992, "rewards/rejected": -4.20045804977417, "step": 226 }, { "epoch": 0.6846371347785108, "grad_norm": 3.7820210456848145, "learning_rate": 9.317220543806646e-07, "logits/chosen": -2.595675468444824, "logits/rejected": -2.5841095447540283, "logps/chosen": -29.43842887878418, "logps/rejected": -114.17214965820312, "loss": 0.3381, "rewards/accuracies": 1.0, "rewards/chosen": 2.5392863750457764, "rewards/margins": 5.519532203674316, "rewards/rejected": -2.980246067047119, "step": 227 }, { "epoch": 0.6876531573986805, "grad_norm": 2.6373958587646484, "learning_rate": 9.314199395770392e-07, "logits/chosen": -2.59651517868042, "logits/rejected": -2.5863919258117676, "logps/chosen": -33.12738037109375, "logps/rejected": -112.67724609375, "loss": 0.388, "rewards/accuracies": 0.9375, "rewards/chosen": 2.004739284515381, "rewards/margins": 5.028158664703369, "rewards/rejected": -3.023418664932251, "step": 228 }, { "epoch": 0.6906691800188501, "grad_norm": 4.060349464416504, "learning_rate": 9.311178247734139e-07, "logits/chosen": -2.5484423637390137, "logits/rejected": -2.505861759185791, "logps/chosen": -29.438764572143555, "logps/rejected": -111.35077667236328, "loss": 0.3647, "rewards/accuracies": 0.96875, "rewards/chosen": 2.4725983142852783, "rewards/margins": 5.253214359283447, "rewards/rejected": -2.78061580657959, "step": 229 }, { "epoch": 0.6936852026390198, "grad_norm": 2.6436564922332764, "learning_rate": 9.308157099697885e-07, "logits/chosen": -2.586247682571411, "logits/rejected": -2.5790719985961914, "logps/chosen": -50.61700439453125, "logps/rejected": -127.357177734375, "loss": 0.4516, "rewards/accuracies": 0.90625, "rewards/chosen": 0.97505784034729, "rewards/margins": 4.779726505279541, "rewards/rejected": -3.80466890335083, "step": 230 }, { "epoch": 0.6967012252591894, "grad_norm": 3.1802210807800293, "learning_rate": 9.305135951661631e-07, "logits/chosen": -2.5894775390625, "logits/rejected": -2.5564637184143066, "logps/chosen": -43.489383697509766, "logps/rejected": -124.87005615234375, "loss": 0.449, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4347997903823853, "rewards/margins": 5.044289588928223, "rewards/rejected": -3.609489917755127, "step": 231 }, { "epoch": 0.6997172478793591, "grad_norm": 4.292280197143555, "learning_rate": 9.302114803625378e-07, "logits/chosen": -2.5941646099090576, "logits/rejected": -2.5814361572265625, "logps/chosen": -35.786643981933594, "logps/rejected": -111.88148498535156, "loss": 0.3996, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9945216178894043, "rewards/margins": 4.855676651000977, "rewards/rejected": -2.8611550331115723, "step": 232 }, { "epoch": 0.7027332704995287, "grad_norm": 3.747868299484253, "learning_rate": 9.299093655589123e-07, "logits/chosen": -2.563495397567749, "logits/rejected": -2.530437469482422, "logps/chosen": -39.660247802734375, "logps/rejected": -115.4126205444336, "loss": 0.3995, "rewards/accuracies": 1.0, "rewards/chosen": 1.7041140794754028, "rewards/margins": 4.994756698608398, "rewards/rejected": -3.290642261505127, "step": 233 }, { "epoch": 0.7057492931196984, "grad_norm": 2.2661526203155518, "learning_rate": 9.29607250755287e-07, "logits/chosen": -2.577228307723999, "logits/rejected": -2.5285112857818604, "logps/chosen": -39.4295539855957, "logps/rejected": -119.42034149169922, "loss": 0.3714, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9258683919906616, "rewards/margins": 5.129205703735352, "rewards/rejected": -3.203336715698242, "step": 234 }, { "epoch": 0.708765315739868, "grad_norm": 3.6377451419830322, "learning_rate": 9.293051359516616e-07, "logits/chosen": -2.6388206481933594, "logits/rejected": -2.6016805171966553, "logps/chosen": -29.03183364868164, "logps/rejected": -98.73515319824219, "loss": 0.4414, "rewards/accuracies": 0.96875, "rewards/chosen": 2.4472811222076416, "rewards/margins": 4.918148040771484, "rewards/rejected": -2.4708666801452637, "step": 235 }, { "epoch": 0.7117813383600377, "grad_norm": 2.8402254581451416, "learning_rate": 9.290030211480362e-07, "logits/chosen": -2.5979740619659424, "logits/rejected": -2.5924417972564697, "logps/chosen": -29.869029998779297, "logps/rejected": -115.23387145996094, "loss": 0.3544, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2251148223876953, "rewards/margins": 5.393055438995361, "rewards/rejected": -3.167940139770508, "step": 236 }, { "epoch": 0.7147973609802073, "grad_norm": 3.079211711883545, "learning_rate": 9.287009063444108e-07, "logits/chosen": -2.6110637187957764, "logits/rejected": -2.5514702796936035, "logps/chosen": -48.70405578613281, "logps/rejected": -132.5725860595703, "loss": 0.3677, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3250224590301514, "rewards/margins": 5.493358612060547, "rewards/rejected": -4.168335914611816, "step": 237 }, { "epoch": 0.717813383600377, "grad_norm": 4.331564903259277, "learning_rate": 9.283987915407855e-07, "logits/chosen": -2.6753108501434326, "logits/rejected": -2.601111650466919, "logps/chosen": -29.945850372314453, "logps/rejected": -104.74176788330078, "loss": 0.3495, "rewards/accuracies": 1.0, "rewards/chosen": 2.415761947631836, "rewards/margins": 5.04657506942749, "rewards/rejected": -2.6308131217956543, "step": 238 }, { "epoch": 0.7208294062205467, "grad_norm": 2.457183599472046, "learning_rate": 9.2809667673716e-07, "logits/chosen": -2.583000421524048, "logits/rejected": -2.601790189743042, "logps/chosen": -30.04819679260254, "logps/rejected": -116.79561614990234, "loss": 0.3822, "rewards/accuracies": 1.0, "rewards/chosen": 2.4156789779663086, "rewards/margins": 5.807142734527588, "rewards/rejected": -3.3914637565612793, "step": 239 }, { "epoch": 0.7238454288407163, "grad_norm": 4.739843368530273, "learning_rate": 9.277945619335347e-07, "logits/chosen": -2.60076642036438, "logits/rejected": -2.60536527633667, "logps/chosen": -25.146087646484375, "logps/rejected": -102.53579711914062, "loss": 0.369, "rewards/accuracies": 1.0, "rewards/chosen": 2.8303050994873047, "rewards/margins": 5.5055928230285645, "rewards/rejected": -2.6752877235412598, "step": 240 }, { "epoch": 0.726861451460886, "grad_norm": 3.4170758724212646, "learning_rate": 9.274924471299094e-07, "logits/chosen": -2.6498258113861084, "logits/rejected": -2.579085350036621, "logps/chosen": -36.530250549316406, "logps/rejected": -120.65702056884766, "loss": 0.3855, "rewards/accuracies": 0.90625, "rewards/chosen": 2.176609992980957, "rewards/margins": 5.58742618560791, "rewards/rejected": -3.410815715789795, "step": 241 }, { "epoch": 0.7298774740810556, "grad_norm": 3.619391441345215, "learning_rate": 9.271903323262839e-07, "logits/chosen": -2.5764858722686768, "logits/rejected": -2.5838727951049805, "logps/chosen": -35.19911575317383, "logps/rejected": -120.56159973144531, "loss": 0.318, "rewards/accuracies": 1.0, "rewards/chosen": 2.1524922847747803, "rewards/margins": 5.501767635345459, "rewards/rejected": -3.349275588989258, "step": 242 }, { "epoch": 0.7328934967012253, "grad_norm": 3.937147378921509, "learning_rate": 9.268882175226585e-07, "logits/chosen": -2.6204299926757812, "logits/rejected": -2.5958380699157715, "logps/chosen": -39.52198791503906, "logps/rejected": -118.6393814086914, "loss": 0.4069, "rewards/accuracies": 1.0, "rewards/chosen": 1.7933323383331299, "rewards/margins": 5.241230010986328, "rewards/rejected": -3.447896957397461, "step": 243 }, { "epoch": 0.7359095193213949, "grad_norm": 3.012561321258545, "learning_rate": 9.265861027190332e-07, "logits/chosen": -2.568047046661377, "logits/rejected": -2.5912139415740967, "logps/chosen": -29.952239990234375, "logps/rejected": -126.51023864746094, "loss": 0.2872, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5558278560638428, "rewards/margins": 6.674716949462891, "rewards/rejected": -4.118888854980469, "step": 244 }, { "epoch": 0.7389255419415646, "grad_norm": 3.2441844940185547, "learning_rate": 9.262839879154078e-07, "logits/chosen": -2.6309638023376465, "logits/rejected": -2.567746162414551, "logps/chosen": -44.57331085205078, "logps/rejected": -109.73836517333984, "loss": 0.586, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3172574043273926, "rewards/margins": 4.120973587036133, "rewards/rejected": -2.8037164211273193, "step": 245 }, { "epoch": 0.7419415645617342, "grad_norm": 3.4256911277770996, "learning_rate": 9.259818731117824e-07, "logits/chosen": -2.5593221187591553, "logits/rejected": -2.5294525623321533, "logps/chosen": -31.438255310058594, "logps/rejected": -117.08734130859375, "loss": 0.4053, "rewards/accuracies": 0.96875, "rewards/chosen": 2.116673469543457, "rewards/margins": 5.3221635818481445, "rewards/rejected": -3.2054901123046875, "step": 246 }, { "epoch": 0.7449575871819039, "grad_norm": 3.6133604049682617, "learning_rate": 9.256797583081571e-07, "logits/chosen": -2.6037075519561768, "logits/rejected": -2.5960612297058105, "logps/chosen": -39.11980438232422, "logps/rejected": -113.00597381591797, "loss": 0.4131, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9413148164749146, "rewards/margins": 5.222982883453369, "rewards/rejected": -3.2816686630249023, "step": 247 }, { "epoch": 0.7479736098020735, "grad_norm": 3.6548099517822266, "learning_rate": 9.253776435045318e-07, "logits/chosen": -2.7215776443481445, "logits/rejected": -2.6383023262023926, "logps/chosen": -32.90032958984375, "logps/rejected": -131.9541778564453, "loss": 0.2771, "rewards/accuracies": 1.0, "rewards/chosen": 2.330225706100464, "rewards/margins": 6.939253330230713, "rewards/rejected": -4.609026908874512, "step": 248 }, { "epoch": 0.7509896324222431, "grad_norm": 3.150853395462036, "learning_rate": 9.250755287009063e-07, "logits/chosen": -2.605530023574829, "logits/rejected": -2.6564950942993164, "logps/chosen": -29.44303321838379, "logps/rejected": -105.7828598022461, "loss": 0.4028, "rewards/accuracies": 1.0, "rewards/chosen": 2.2524430751800537, "rewards/margins": 5.082771301269531, "rewards/rejected": -2.8303279876708984, "step": 249 }, { "epoch": 0.7540056550424128, "grad_norm": 3.8560574054718018, "learning_rate": 9.247734138972809e-07, "logits/chosen": -2.5981404781341553, "logits/rejected": -2.592167615890503, "logps/chosen": -30.017745971679688, "logps/rejected": -113.75418853759766, "loss": 0.3278, "rewards/accuracies": 1.0, "rewards/chosen": 2.5786385536193848, "rewards/margins": 5.571887969970703, "rewards/rejected": -2.9932494163513184, "step": 250 }, { "epoch": 0.7570216776625824, "grad_norm": 4.038232803344727, "learning_rate": 9.244712990936556e-07, "logits/chosen": -2.580045461654663, "logits/rejected": -2.6540842056274414, "logps/chosen": -39.275291442871094, "logps/rejected": -103.70063781738281, "loss": 0.4956, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7314910888671875, "rewards/margins": 4.711933135986328, "rewards/rejected": -2.9804422855377197, "step": 251 }, { "epoch": 0.7600377002827521, "grad_norm": 3.5616915225982666, "learning_rate": 9.241691842900302e-07, "logits/chosen": -2.5762414932250977, "logits/rejected": -2.5918076038360596, "logps/chosen": -33.03755569458008, "logps/rejected": -122.45536804199219, "loss": 0.3485, "rewards/accuracies": 1.0, "rewards/chosen": 1.9837690591812134, "rewards/margins": 5.851516246795654, "rewards/rejected": -3.8677470684051514, "step": 252 }, { "epoch": 0.7630537229029217, "grad_norm": 3.987177610397339, "learning_rate": 9.238670694864048e-07, "logits/chosen": -2.6714608669281006, "logits/rejected": -2.660938262939453, "logps/chosen": -34.358375549316406, "logps/rejected": -123.58916473388672, "loss": 0.3337, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0097320079803467, "rewards/margins": 6.273241996765137, "rewards/rejected": -4.263509750366211, "step": 253 }, { "epoch": 0.7660697455230914, "grad_norm": 3.7350010871887207, "learning_rate": 9.235649546827794e-07, "logits/chosen": -2.5972955226898193, "logits/rejected": -2.57731556892395, "logps/chosen": -38.69502258300781, "logps/rejected": -135.0662384033203, "loss": 0.3219, "rewards/accuracies": 1.0, "rewards/chosen": 1.9963241815567017, "rewards/margins": 6.66257905960083, "rewards/rejected": -4.666254997253418, "step": 254 }, { "epoch": 0.7690857681432611, "grad_norm": 2.821082592010498, "learning_rate": 9.23262839879154e-07, "logits/chosen": -2.614346504211426, "logits/rejected": -2.594996690750122, "logps/chosen": -35.114112854003906, "logps/rejected": -120.6429672241211, "loss": 0.371, "rewards/accuracies": 0.96875, "rewards/chosen": 1.988102912902832, "rewards/margins": 5.811692237854004, "rewards/rejected": -3.823589324951172, "step": 255 }, { "epoch": 0.7721017907634308, "grad_norm": 4.038802146911621, "learning_rate": 9.229607250755287e-07, "logits/chosen": -2.6689906120300293, "logits/rejected": -2.6125645637512207, "logps/chosen": -38.38271713256836, "logps/rejected": -125.96461486816406, "loss": 0.4034, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6309963464736938, "rewards/margins": 5.580226898193359, "rewards/rejected": -3.949230670928955, "step": 256 }, { "epoch": 0.7751178133836004, "grad_norm": 3.9164252281188965, "learning_rate": 9.226586102719033e-07, "logits/chosen": -2.5730504989624023, "logits/rejected": -2.6265528202056885, "logps/chosen": -25.98594093322754, "logps/rejected": -113.01664733886719, "loss": 0.2976, "rewards/accuracies": 0.96875, "rewards/chosen": 2.7786152362823486, "rewards/margins": 6.1476640701293945, "rewards/rejected": -3.369048595428467, "step": 257 }, { "epoch": 0.7781338360037701, "grad_norm": 3.4912374019622803, "learning_rate": 9.223564954682778e-07, "logits/chosen": -2.5742175579071045, "logits/rejected": -2.6411008834838867, "logps/chosen": -40.944297790527344, "logps/rejected": -118.7137451171875, "loss": 0.4514, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4100043773651123, "rewards/margins": 5.053955078125, "rewards/rejected": -3.6439507007598877, "step": 258 }, { "epoch": 0.7811498586239397, "grad_norm": 3.2159502506256104, "learning_rate": 9.220543806646526e-07, "logits/chosen": -2.632023334503174, "logits/rejected": -2.5708353519439697, "logps/chosen": -37.052406311035156, "logps/rejected": -124.02879333496094, "loss": 0.3666, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8699228763580322, "rewards/margins": 5.901445388793945, "rewards/rejected": -4.031522750854492, "step": 259 }, { "epoch": 0.7841658812441094, "grad_norm": 4.635215759277344, "learning_rate": 9.217522658610272e-07, "logits/chosen": -2.6835224628448486, "logits/rejected": -2.62738037109375, "logps/chosen": -39.410675048828125, "logps/rejected": -118.2173843383789, "loss": 0.3536, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6182273626327515, "rewards/margins": 5.526690483093262, "rewards/rejected": -3.9084632396698, "step": 260 }, { "epoch": 0.787181903864279, "grad_norm": 3.2065494060516357, "learning_rate": 9.214501510574017e-07, "logits/chosen": -2.7262136936187744, "logits/rejected": -2.62113881111145, "logps/chosen": -35.45907211303711, "logps/rejected": -140.76806640625, "loss": 0.3282, "rewards/accuracies": 1.0, "rewards/chosen": 2.117213487625122, "rewards/margins": 7.050081729888916, "rewards/rejected": -4.932868957519531, "step": 261 }, { "epoch": 0.7901979264844486, "grad_norm": 3.015484571456909, "learning_rate": 9.211480362537764e-07, "logits/chosen": -2.654388427734375, "logits/rejected": -2.655649185180664, "logps/chosen": -38.784767150878906, "logps/rejected": -121.3598861694336, "loss": 0.4639, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6048314571380615, "rewards/margins": 5.530261993408203, "rewards/rejected": -3.9254300594329834, "step": 262 }, { "epoch": 0.7932139491046183, "grad_norm": 3.342820405960083, "learning_rate": 9.208459214501511e-07, "logits/chosen": -2.622694730758667, "logits/rejected": -2.6513233184814453, "logps/chosen": -33.486289978027344, "logps/rejected": -120.56976318359375, "loss": 0.3371, "rewards/accuracies": 1.0, "rewards/chosen": 1.9383151531219482, "rewards/margins": 5.831571578979492, "rewards/rejected": -3.893256902694702, "step": 263 }, { "epoch": 0.7962299717247879, "grad_norm": 3.2330336570739746, "learning_rate": 9.205438066465256e-07, "logits/chosen": -2.6366162300109863, "logits/rejected": -2.5557148456573486, "logps/chosen": -45.4503059387207, "logps/rejected": -123.9853515625, "loss": 0.4591, "rewards/accuracies": 1.0, "rewards/chosen": 1.2411240339279175, "rewards/margins": 5.590984344482422, "rewards/rejected": -4.349859714508057, "step": 264 }, { "epoch": 0.7992459943449576, "grad_norm": 2.9766252040863037, "learning_rate": 9.202416918429002e-07, "logits/chosen": -2.643658399581909, "logits/rejected": -2.584506034851074, "logps/chosen": -42.834495544433594, "logps/rejected": -145.93019104003906, "loss": 0.3183, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5243803262710571, "rewards/margins": 6.88662576675415, "rewards/rejected": -5.362246036529541, "step": 265 }, { "epoch": 0.8022620169651272, "grad_norm": 3.1385953426361084, "learning_rate": 9.199395770392749e-07, "logits/chosen": -2.7025763988494873, "logits/rejected": -2.6055755615234375, "logps/chosen": -35.83489990234375, "logps/rejected": -123.45831298828125, "loss": 0.4217, "rewards/accuracies": 0.9375, "rewards/chosen": 2.2272486686706543, "rewards/margins": 5.9584150314331055, "rewards/rejected": -3.731165885925293, "step": 266 }, { "epoch": 0.8052780395852969, "grad_norm": 4.01106071472168, "learning_rate": 9.196374622356495e-07, "logits/chosen": -2.6735634803771973, "logits/rejected": -2.62910795211792, "logps/chosen": -37.3286247253418, "logps/rejected": -110.36415100097656, "loss": 0.4753, "rewards/accuracies": 1.0, "rewards/chosen": 1.8849306106567383, "rewards/margins": 4.661683559417725, "rewards/rejected": -2.7767531871795654, "step": 267 }, { "epoch": 0.8082940622054665, "grad_norm": 3.2596595287323, "learning_rate": 9.193353474320241e-07, "logits/chosen": -2.63472580909729, "logits/rejected": -2.67694091796875, "logps/chosen": -35.92645263671875, "logps/rejected": -122.6493148803711, "loss": 0.3457, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9283381700515747, "rewards/margins": 5.9604363441467285, "rewards/rejected": -4.032098293304443, "step": 268 }, { "epoch": 0.8082940622054665, "eval_logits/chosen": -2.550990343093872, "eval_logits/rejected": -2.5519466400146484, "eval_logps/chosen": -39.258182525634766, "eval_logps/rejected": -126.3482666015625, "eval_loss": 0.3394588530063629, "eval_rewards/accuracies": 0.9639175534248352, "eval_rewards/chosen": 2.014650344848633, "eval_rewards/margins": 6.022314548492432, "eval_rewards/rejected": -4.007664203643799, "eval_runtime": 694.4443, "eval_samples_per_second": 0.557, "eval_steps_per_second": 0.279, "step": 268 }, { "epoch": 0.8113100848256362, "grad_norm": 3.9166812896728516, "learning_rate": 9.190332326283987e-07, "logits/chosen": -2.571134567260742, "logits/rejected": -2.5836033821105957, "logps/chosen": -34.11045837402344, "logps/rejected": -107.47348022460938, "loss": 0.4555, "rewards/accuracies": 1.0, "rewards/chosen": 2.237888813018799, "rewards/margins": 4.882692337036133, "rewards/rejected": -2.6448028087615967, "step": 269 }, { "epoch": 0.8143261074458058, "grad_norm": 3.8389511108398438, "learning_rate": 9.187311178247734e-07, "logits/chosen": -2.614830732345581, "logits/rejected": -2.6504979133605957, "logps/chosen": -30.70520782470703, "logps/rejected": -115.54693603515625, "loss": 0.3359, "rewards/accuracies": 1.0, "rewards/chosen": 2.3481802940368652, "rewards/margins": 5.803779125213623, "rewards/rejected": -3.455598831176758, "step": 270 }, { "epoch": 0.8173421300659754, "grad_norm": 3.5696234703063965, "learning_rate": 9.18429003021148e-07, "logits/chosen": -2.619418144226074, "logits/rejected": -2.632321357727051, "logps/chosen": -30.47220802307129, "logps/rejected": -109.5301742553711, "loss": 0.3433, "rewards/accuracies": 1.0, "rewards/chosen": 2.580420970916748, "rewards/margins": 5.797632694244385, "rewards/rejected": -3.217212438583374, "step": 271 }, { "epoch": 0.8203581526861451, "grad_norm": 2.698505401611328, "learning_rate": 9.181268882175226e-07, "logits/chosen": -2.6173641681671143, "logits/rejected": -2.579453945159912, "logps/chosen": -38.32424545288086, "logps/rejected": -124.09661102294922, "loss": 0.4257, "rewards/accuracies": 0.96875, "rewards/chosen": 1.837700605392456, "rewards/margins": 5.723989009857178, "rewards/rejected": -3.8862884044647217, "step": 272 }, { "epoch": 0.8233741753063148, "grad_norm": 3.705084800720215, "learning_rate": 9.178247734138971e-07, "logits/chosen": -2.6653759479522705, "logits/rejected": -2.626272439956665, "logps/chosen": -35.29466247558594, "logps/rejected": -109.72173309326172, "loss": 0.3915, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0961673259735107, "rewards/margins": 5.450488567352295, "rewards/rejected": -3.354320764541626, "step": 273 }, { "epoch": 0.8263901979264845, "grad_norm": 4.50526762008667, "learning_rate": 9.175226586102719e-07, "logits/chosen": -2.596966505050659, "logits/rejected": -2.609055519104004, "logps/chosen": -35.11848449707031, "logps/rejected": -118.73785400390625, "loss": 0.4066, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9911614656448364, "rewards/margins": 5.7709550857543945, "rewards/rejected": -3.7797930240631104, "step": 274 }, { "epoch": 0.8294062205466541, "grad_norm": 3.521263599395752, "learning_rate": 9.172205438066465e-07, "logits/chosen": -2.6410226821899414, "logits/rejected": -2.5905768871307373, "logps/chosen": -30.13991355895996, "logps/rejected": -121.66047668457031, "loss": 0.3276, "rewards/accuracies": 1.0, "rewards/chosen": 2.604072093963623, "rewards/margins": 6.572639465332031, "rewards/rejected": -3.968567371368408, "step": 275 }, { "epoch": 0.8324222431668238, "grad_norm": 3.959840774536133, "learning_rate": 9.16918429003021e-07, "logits/chosen": -2.677837371826172, "logits/rejected": -2.597052574157715, "logps/chosen": -38.03804397583008, "logps/rejected": -124.20255279541016, "loss": 0.3975, "rewards/accuracies": 0.96875, "rewards/chosen": 2.087094783782959, "rewards/margins": 6.067094326019287, "rewards/rejected": -3.979999542236328, "step": 276 }, { "epoch": 0.8354382657869934, "grad_norm": 3.447019577026367, "learning_rate": 9.166163141993958e-07, "logits/chosen": -2.697606086730957, "logits/rejected": -2.604112386703491, "logps/chosen": -41.47817611694336, "logps/rejected": -134.53366088867188, "loss": 0.3683, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4427622556686401, "rewards/margins": 6.379879474639893, "rewards/rejected": -4.937117576599121, "step": 277 }, { "epoch": 0.8384542884071631, "grad_norm": 2.939883232116699, "learning_rate": 9.163141993957704e-07, "logits/chosen": -2.6832919120788574, "logits/rejected": -2.650692939758301, "logps/chosen": -30.365848541259766, "logps/rejected": -141.26449584960938, "loss": 0.2473, "rewards/accuracies": 1.0, "rewards/chosen": 2.252995014190674, "rewards/margins": 7.176512718200684, "rewards/rejected": -4.923517227172852, "step": 278 }, { "epoch": 0.8414703110273327, "grad_norm": 3.6301376819610596, "learning_rate": 9.16012084592145e-07, "logits/chosen": -2.681119203567505, "logits/rejected": -2.648449182510376, "logps/chosen": -40.356937408447266, "logps/rejected": -121.53636169433594, "loss": 0.368, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8747016191482544, "rewards/margins": 5.563436031341553, "rewards/rejected": -3.6887340545654297, "step": 279 }, { "epoch": 0.8444863336475024, "grad_norm": 3.0058701038360596, "learning_rate": 9.157099697885195e-07, "logits/chosen": -2.575779914855957, "logits/rejected": -2.6108546257019043, "logps/chosen": -36.48054504394531, "logps/rejected": -127.40420532226562, "loss": 0.2986, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1376521587371826, "rewards/margins": 6.44553804397583, "rewards/rejected": -4.307885646820068, "step": 280 }, { "epoch": 0.847502356267672, "grad_norm": 2.9909307956695557, "learning_rate": 9.154078549848943e-07, "logits/chosen": -2.675196409225464, "logits/rejected": -2.6624865531921387, "logps/chosen": -35.32107162475586, "logps/rejected": -135.5714111328125, "loss": 0.3991, "rewards/accuracies": 1.0, "rewards/chosen": 1.9455938339233398, "rewards/margins": 6.609195709228516, "rewards/rejected": -4.663600921630859, "step": 281 }, { "epoch": 0.8505183788878417, "grad_norm": 3.6060233116149902, "learning_rate": 9.151057401812689e-07, "logits/chosen": -2.6986727714538574, "logits/rejected": -2.667952299118042, "logps/chosen": -36.984954833984375, "logps/rejected": -138.51161193847656, "loss": 0.2878, "rewards/accuracies": 1.0, "rewards/chosen": 2.080252170562744, "rewards/margins": 6.998438358306885, "rewards/rejected": -4.918186187744141, "step": 282 }, { "epoch": 0.8535344015080113, "grad_norm": 22.916868209838867, "learning_rate": 9.148036253776434e-07, "logits/chosen": -2.6836025714874268, "logits/rejected": -2.6283771991729736, "logps/chosen": -29.098621368408203, "logps/rejected": -123.14472961425781, "loss": 0.3108, "rewards/accuracies": 0.9375, "rewards/chosen": 2.5751163959503174, "rewards/margins": 6.255995273590088, "rewards/rejected": -3.6808788776397705, "step": 283 }, { "epoch": 0.8565504241281809, "grad_norm": 3.1876158714294434, "learning_rate": 9.14501510574018e-07, "logits/chosen": -2.657439947128296, "logits/rejected": -2.730288028717041, "logps/chosen": -43.20960998535156, "logps/rejected": -139.41896057128906, "loss": 0.363, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5236432552337646, "rewards/margins": 6.370984077453613, "rewards/rejected": -4.8473405838012695, "step": 284 }, { "epoch": 0.8595664467483506, "grad_norm": 2.7515501976013184, "learning_rate": 9.141993957703928e-07, "logits/chosen": -2.616990804672241, "logits/rejected": -2.6757140159606934, "logps/chosen": -33.297847747802734, "logps/rejected": -126.00401306152344, "loss": 0.3478, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1188859939575195, "rewards/margins": 6.4696173667907715, "rewards/rejected": -4.35073184967041, "step": 285 }, { "epoch": 0.8625824693685202, "grad_norm": 3.479750156402588, "learning_rate": 9.138972809667673e-07, "logits/chosen": -2.637895107269287, "logits/rejected": -2.664796829223633, "logps/chosen": -36.584022521972656, "logps/rejected": -142.40130615234375, "loss": 0.3148, "rewards/accuracies": 1.0, "rewards/chosen": 2.0974433422088623, "rewards/margins": 7.214588642120361, "rewards/rejected": -5.117145538330078, "step": 286 }, { "epoch": 0.8655984919886899, "grad_norm": 3.21258807182312, "learning_rate": 9.135951661631419e-07, "logits/chosen": -2.618966579437256, "logits/rejected": -2.631732940673828, "logps/chosen": -39.13117599487305, "logps/rejected": -128.33834838867188, "loss": 0.3958, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9488768577575684, "rewards/margins": 6.101250171661377, "rewards/rejected": -4.15237283706665, "step": 287 }, { "epoch": 0.8686145146088595, "grad_norm": 3.3783841133117676, "learning_rate": 9.132930513595166e-07, "logits/chosen": -2.6945266723632812, "logits/rejected": -2.6417899131774902, "logps/chosen": -25.75548553466797, "logps/rejected": -110.44913482666016, "loss": 0.3575, "rewards/accuracies": 1.0, "rewards/chosen": 2.763758659362793, "rewards/margins": 6.098465442657471, "rewards/rejected": -3.3347063064575195, "step": 288 }, { "epoch": 0.8716305372290292, "grad_norm": 3.375331401824951, "learning_rate": 9.129909365558912e-07, "logits/chosen": -2.651271343231201, "logits/rejected": -2.5984842777252197, "logps/chosen": -37.76849365234375, "logps/rejected": -139.80075073242188, "loss": 0.3019, "rewards/accuracies": 1.0, "rewards/chosen": 2.0252575874328613, "rewards/margins": 6.936153888702393, "rewards/rejected": -4.910896301269531, "step": 289 }, { "epoch": 0.8746465598491989, "grad_norm": 2.8726441860198975, "learning_rate": 9.126888217522658e-07, "logits/chosen": -2.716867685317993, "logits/rejected": -2.6289868354797363, "logps/chosen": -29.63542366027832, "logps/rejected": -140.89837646484375, "loss": 0.285, "rewards/accuracies": 1.0, "rewards/chosen": 2.61297869682312, "rewards/margins": 7.608521938323975, "rewards/rejected": -4.995543479919434, "step": 290 }, { "epoch": 0.8776625824693686, "grad_norm": 4.063046455383301, "learning_rate": 9.123867069486404e-07, "logits/chosen": -2.604398727416992, "logits/rejected": -2.6420645713806152, "logps/chosen": -30.387271881103516, "logps/rejected": -110.95226287841797, "loss": 0.352, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5873866081237793, "rewards/margins": 5.855345726013184, "rewards/rejected": -3.2679593563079834, "step": 291 }, { "epoch": 0.8806786050895382, "grad_norm": 2.7301855087280273, "learning_rate": 9.120845921450151e-07, "logits/chosen": -2.6098506450653076, "logits/rejected": -2.6303951740264893, "logps/chosen": -37.48124313354492, "logps/rejected": -117.81102752685547, "loss": 0.4339, "rewards/accuracies": 0.9375, "rewards/chosen": 1.916374683380127, "rewards/margins": 5.6117143630981445, "rewards/rejected": -3.695340156555176, "step": 292 }, { "epoch": 0.8836946277097079, "grad_norm": 4.272204399108887, "learning_rate": 9.117824773413897e-07, "logits/chosen": -2.688533306121826, "logits/rejected": -2.6433167457580566, "logps/chosen": -40.70674133300781, "logps/rejected": -122.44318389892578, "loss": 0.4696, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7093135118484497, "rewards/margins": 5.6076459884643555, "rewards/rejected": -3.898331880569458, "step": 293 }, { "epoch": 0.8867106503298775, "grad_norm": 3.445237398147583, "learning_rate": 9.114803625377643e-07, "logits/chosen": -2.624828577041626, "logits/rejected": -2.6308979988098145, "logps/chosen": -42.162994384765625, "logps/rejected": -127.66302490234375, "loss": 0.4455, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4975866079330444, "rewards/margins": 5.726366996765137, "rewards/rejected": -4.228780746459961, "step": 294 }, { "epoch": 0.8897266729500471, "grad_norm": 3.316920280456543, "learning_rate": 9.111782477341389e-07, "logits/chosen": -2.629857063293457, "logits/rejected": -2.569312810897827, "logps/chosen": -32.57796096801758, "logps/rejected": -121.46913146972656, "loss": 0.335, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2551841735839844, "rewards/margins": 6.136897563934326, "rewards/rejected": -3.881713390350342, "step": 295 }, { "epoch": 0.8927426955702168, "grad_norm": 3.5713491439819336, "learning_rate": 9.108761329305136e-07, "logits/chosen": -2.6931750774383545, "logits/rejected": -2.676894426345825, "logps/chosen": -33.02001953125, "logps/rejected": -117.12796783447266, "loss": 0.4137, "rewards/accuracies": 1.0, "rewards/chosen": 1.8537616729736328, "rewards/margins": 5.650142192840576, "rewards/rejected": -3.796380043029785, "step": 296 }, { "epoch": 0.8957587181903864, "grad_norm": 2.553356409072876, "learning_rate": 9.105740181268882e-07, "logits/chosen": -2.8137106895446777, "logits/rejected": -2.7388205528259277, "logps/chosen": -40.31370544433594, "logps/rejected": -142.2401885986328, "loss": 0.2908, "rewards/accuracies": 0.9375, "rewards/chosen": 1.798686146736145, "rewards/margins": 7.203954696655273, "rewards/rejected": -5.405268669128418, "step": 297 }, { "epoch": 0.8987747408105561, "grad_norm": 3.1174962520599365, "learning_rate": 9.102719033232627e-07, "logits/chosen": -2.60695743560791, "logits/rejected": -2.6069719791412354, "logps/chosen": -36.80424118041992, "logps/rejected": -121.92937469482422, "loss": 0.3755, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9996424913406372, "rewards/margins": 5.9136199951171875, "rewards/rejected": -3.913977861404419, "step": 298 }, { "epoch": 0.9017907634307257, "grad_norm": 9.671049118041992, "learning_rate": 9.099697885196374e-07, "logits/chosen": -2.658315658569336, "logits/rejected": -2.686953067779541, "logps/chosen": -28.661142349243164, "logps/rejected": -125.70897674560547, "loss": 0.2796, "rewards/accuracies": 1.0, "rewards/chosen": 2.548455238342285, "rewards/margins": 6.839571952819824, "rewards/rejected": -4.291116237640381, "step": 299 }, { "epoch": 0.9048067860508954, "grad_norm": 5.230979919433594, "learning_rate": 9.096676737160121e-07, "logits/chosen": -2.7078845500946045, "logits/rejected": -2.6604464054107666, "logps/chosen": -44.336605072021484, "logps/rejected": -120.98612213134766, "loss": 0.4587, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3887957334518433, "rewards/margins": 5.319972991943359, "rewards/rejected": -3.9311771392822266, "step": 300 }, { "epoch": 0.907822808671065, "grad_norm": 4.397043704986572, "learning_rate": 9.093655589123867e-07, "logits/chosen": -2.6408474445343018, "logits/rejected": -2.6652121543884277, "logps/chosen": -46.520729064941406, "logps/rejected": -121.71378326416016, "loss": 0.4417, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2753403186798096, "rewards/margins": 5.454016208648682, "rewards/rejected": -4.178676128387451, "step": 301 }, { "epoch": 0.9108388312912347, "grad_norm": 4.038957595825195, "learning_rate": 9.090634441087612e-07, "logits/chosen": -2.675447940826416, "logits/rejected": -2.6210055351257324, "logps/chosen": -36.51831817626953, "logps/rejected": -116.62033081054688, "loss": 0.4271, "rewards/accuracies": 1.0, "rewards/chosen": 1.806365728378296, "rewards/margins": 5.455887794494629, "rewards/rejected": -3.649522542953491, "step": 302 }, { "epoch": 0.9138548539114043, "grad_norm": 3.3095099925994873, "learning_rate": 9.087613293051359e-07, "logits/chosen": -2.661809206008911, "logits/rejected": -2.708728313446045, "logps/chosen": -39.8987922668457, "logps/rejected": -123.28917694091797, "loss": 0.4033, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9363411664962769, "rewards/margins": 5.999453544616699, "rewards/rejected": -4.063113212585449, "step": 303 }, { "epoch": 0.916870876531574, "grad_norm": 4.12385892868042, "learning_rate": 9.084592145015106e-07, "logits/chosen": -2.671792984008789, "logits/rejected": -2.6402041912078857, "logps/chosen": -30.645267486572266, "logps/rejected": -118.51405334472656, "loss": 0.3637, "rewards/accuracies": 0.96875, "rewards/chosen": 2.7086408138275146, "rewards/margins": 6.20920991897583, "rewards/rejected": -3.5005688667297363, "step": 304 }, { "epoch": 0.9198868991517436, "grad_norm": 4.634004592895508, "learning_rate": 9.081570996978851e-07, "logits/chosen": -2.779454469680786, "logits/rejected": -2.704418420791626, "logps/chosen": -33.159332275390625, "logps/rejected": -128.8733367919922, "loss": 0.3776, "rewards/accuracies": 0.96875, "rewards/chosen": 2.237736940383911, "rewards/margins": 6.473038196563721, "rewards/rejected": -4.235301494598389, "step": 305 }, { "epoch": 0.9229029217719132, "grad_norm": 3.2028167247772217, "learning_rate": 9.078549848942598e-07, "logits/chosen": -2.6509203910827637, "logits/rejected": -2.5884995460510254, "logps/chosen": -27.40159034729004, "logps/rejected": -121.86763000488281, "loss": 0.289, "rewards/accuracies": 0.96875, "rewards/chosen": 2.6718337535858154, "rewards/margins": 6.661600112915039, "rewards/rejected": -3.9897658824920654, "step": 306 }, { "epoch": 0.9259189443920829, "grad_norm": 2.733133554458618, "learning_rate": 9.075528700906345e-07, "logits/chosen": -2.737301826477051, "logits/rejected": -2.699414014816284, "logps/chosen": -31.81925392150879, "logps/rejected": -120.07268524169922, "loss": 0.3928, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5204763412475586, "rewards/margins": 6.130350589752197, "rewards/rejected": -3.6098742485046387, "step": 307 }, { "epoch": 0.9289349670122526, "grad_norm": 2.8270092010498047, "learning_rate": 9.07250755287009e-07, "logits/chosen": -2.6344149112701416, "logits/rejected": -2.58858323097229, "logps/chosen": -29.19489288330078, "logps/rejected": -139.9636688232422, "loss": 0.2156, "rewards/accuracies": 1.0, "rewards/chosen": 2.7022814750671387, "rewards/margins": 7.872775554656982, "rewards/rejected": -5.1704936027526855, "step": 308 }, { "epoch": 0.9319509896324223, "grad_norm": 2.2249886989593506, "learning_rate": 9.069486404833836e-07, "logits/chosen": -2.6533455848693848, "logits/rejected": -2.6067159175872803, "logps/chosen": -38.42180633544922, "logps/rejected": -134.7628936767578, "loss": 0.3522, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9751896858215332, "rewards/margins": 6.407021522521973, "rewards/rejected": -4.4318318367004395, "step": 309 }, { "epoch": 0.9349670122525919, "grad_norm": 4.025765895843506, "learning_rate": 9.066465256797583e-07, "logits/chosen": -2.6848814487457275, "logits/rejected": -2.6137213706970215, "logps/chosen": -37.50755310058594, "logps/rejected": -128.39468383789062, "loss": 0.3778, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7307801246643066, "rewards/margins": 6.420088291168213, "rewards/rejected": -4.689308166503906, "step": 310 }, { "epoch": 0.9379830348727616, "grad_norm": 4.090373516082764, "learning_rate": 9.063444108761329e-07, "logits/chosen": -2.654553174972534, "logits/rejected": -2.656381607055664, "logps/chosen": -39.98219680786133, "logps/rejected": -126.56806945800781, "loss": 0.3174, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8572678565979004, "rewards/margins": 6.198173522949219, "rewards/rejected": -4.34090518951416, "step": 311 }, { "epoch": 0.9409990574929312, "grad_norm": 4.182084083557129, "learning_rate": 9.060422960725075e-07, "logits/chosen": -2.711094617843628, "logits/rejected": -2.6001195907592773, "logps/chosen": -39.10904312133789, "logps/rejected": -129.7421417236328, "loss": 0.37, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9466900825500488, "rewards/margins": 6.329608917236328, "rewards/rejected": -4.382918357849121, "step": 312 }, { "epoch": 0.9440150801131009, "grad_norm": 2.9724931716918945, "learning_rate": 9.057401812688822e-07, "logits/chosen": -2.6617767810821533, "logits/rejected": -2.641684055328369, "logps/chosen": -37.1741943359375, "logps/rejected": -134.0995635986328, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 2.2515671253204346, "rewards/margins": 6.644708156585693, "rewards/rejected": -4.39314079284668, "step": 313 }, { "epoch": 0.9470311027332705, "grad_norm": 3.9354665279388428, "learning_rate": 9.054380664652567e-07, "logits/chosen": -2.7781693935394287, "logits/rejected": -2.700404644012451, "logps/chosen": -33.95586013793945, "logps/rejected": -129.00582885742188, "loss": 0.2634, "rewards/accuracies": 1.0, "rewards/chosen": 2.485180377960205, "rewards/margins": 7.314218997955322, "rewards/rejected": -4.829038619995117, "step": 314 }, { "epoch": 0.9500471253534402, "grad_norm": 3.414243698120117, "learning_rate": 9.051359516616314e-07, "logits/chosen": -2.6472055912017822, "logits/rejected": -2.6298372745513916, "logps/chosen": -36.609474182128906, "logps/rejected": -109.3704833984375, "loss": 0.4554, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9141181707382202, "rewards/margins": 5.2020263671875, "rewards/rejected": -3.287907838821411, "step": 315 }, { "epoch": 0.9530631479736098, "grad_norm": 4.946686744689941, "learning_rate": 9.04833836858006e-07, "logits/chosen": -2.55047345161438, "logits/rejected": -2.60373592376709, "logps/chosen": -32.97657012939453, "logps/rejected": -116.87493896484375, "loss": 0.4024, "rewards/accuracies": 0.9375, "rewards/chosen": 2.2694520950317383, "rewards/margins": 6.2031145095825195, "rewards/rejected": -3.9336628913879395, "step": 316 }, { "epoch": 0.9560791705937794, "grad_norm": 2.858616828918457, "learning_rate": 9.045317220543806e-07, "logits/chosen": -2.5781755447387695, "logits/rejected": -2.6503095626831055, "logps/chosen": -30.43094253540039, "logps/rejected": -126.1664047241211, "loss": 0.2349, "rewards/accuracies": 0.9375, "rewards/chosen": 2.5669000148773193, "rewards/margins": 7.151845932006836, "rewards/rejected": -4.5849456787109375, "step": 317 }, { "epoch": 0.9590951932139491, "grad_norm": 3.1453702449798584, "learning_rate": 9.042296072507552e-07, "logits/chosen": -2.694441080093384, "logits/rejected": -2.6578526496887207, "logps/chosen": -37.31842803955078, "logps/rejected": -127.68551635742188, "loss": 0.3981, "rewards/accuracies": 1.0, "rewards/chosen": 2.149909257888794, "rewards/margins": 6.46392297744751, "rewards/rejected": -4.314013481140137, "step": 318 }, { "epoch": 0.9621112158341187, "grad_norm": 3.3961434364318848, "learning_rate": 9.039274924471299e-07, "logits/chosen": -2.6228086948394775, "logits/rejected": -2.6740310192108154, "logps/chosen": -36.203529357910156, "logps/rejected": -117.03117370605469, "loss": 0.3567, "rewards/accuracies": 0.90625, "rewards/chosen": 1.978323221206665, "rewards/margins": 5.730858325958252, "rewards/rejected": -3.752535104751587, "step": 319 }, { "epoch": 0.9651272384542884, "grad_norm": 2.6297667026519775, "learning_rate": 9.036253776435044e-07, "logits/chosen": -2.7088398933410645, "logits/rejected": -2.6574442386627197, "logps/chosen": -31.576515197753906, "logps/rejected": -132.89549255371094, "loss": 0.2593, "rewards/accuracies": 1.0, "rewards/chosen": 2.4314112663269043, "rewards/margins": 7.507594108581543, "rewards/rejected": -5.0761823654174805, "step": 320 }, { "epoch": 0.968143261074458, "grad_norm": 3.7235071659088135, "learning_rate": 9.033232628398791e-07, "logits/chosen": -2.7523961067199707, "logits/rejected": -2.6193761825561523, "logps/chosen": -34.509456634521484, "logps/rejected": -115.32862091064453, "loss": 0.3794, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3555667400360107, "rewards/margins": 5.958643436431885, "rewards/rejected": -3.603076457977295, "step": 321 }, { "epoch": 0.9711592836946277, "grad_norm": 3.7077252864837646, "learning_rate": 9.030211480362538e-07, "logits/chosen": -2.612607955932617, "logits/rejected": -2.611891031265259, "logps/chosen": -35.60697555541992, "logps/rejected": -113.8883285522461, "loss": 0.463, "rewards/accuracies": 0.90625, "rewards/chosen": 2.0633604526519775, "rewards/margins": 5.402459144592285, "rewards/rejected": -3.3390984535217285, "step": 322 }, { "epoch": 0.9741753063147973, "grad_norm": 3.608281135559082, "learning_rate": 9.027190332326283e-07, "logits/chosen": -2.6573758125305176, "logits/rejected": -2.641587257385254, "logps/chosen": -47.24151611328125, "logps/rejected": -140.1054229736328, "loss": 0.3598, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3771491050720215, "rewards/margins": 6.424884796142578, "rewards/rejected": -5.047735691070557, "step": 323 }, { "epoch": 0.977191328934967, "grad_norm": 3.9445085525512695, "learning_rate": 9.02416918429003e-07, "logits/chosen": -2.5967116355895996, "logits/rejected": -2.675239324569702, "logps/chosen": -40.005584716796875, "logps/rejected": -136.5059356689453, "loss": 0.3862, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6877098083496094, "rewards/margins": 6.753598690032959, "rewards/rejected": -5.06588888168335, "step": 324 }, { "epoch": 0.9802073515551367, "grad_norm": 3.685222864151001, "learning_rate": 9.021148036253776e-07, "logits/chosen": -2.681568145751953, "logits/rejected": -2.6110987663269043, "logps/chosen": -35.59101867675781, "logps/rejected": -146.5959014892578, "loss": 0.3308, "rewards/accuracies": 1.0, "rewards/chosen": 1.8269375562667847, "rewards/margins": 7.501883506774902, "rewards/rejected": -5.674945831298828, "step": 325 }, { "epoch": 0.9832233741753064, "grad_norm": 2.6954877376556396, "learning_rate": 9.018126888217523e-07, "logits/chosen": -2.6622443199157715, "logits/rejected": -2.7015185356140137, "logps/chosen": -29.419069290161133, "logps/rejected": -109.21180725097656, "loss": 0.3797, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5111613273620605, "rewards/margins": 5.833054542541504, "rewards/rejected": -3.3218936920166016, "step": 326 }, { "epoch": 0.986239396795476, "grad_norm": 2.9147591590881348, "learning_rate": 9.015105740181268e-07, "logits/chosen": -2.6618845462799072, "logits/rejected": -2.6378591060638428, "logps/chosen": -30.837343215942383, "logps/rejected": -130.02780151367188, "loss": 0.3177, "rewards/accuracies": 1.0, "rewards/chosen": 2.1650617122650146, "rewards/margins": 6.838648319244385, "rewards/rejected": -4.673586845397949, "step": 327 }, { "epoch": 0.9892554194156457, "grad_norm": 3.46626615524292, "learning_rate": 9.012084592145015e-07, "logits/chosen": -2.639181613922119, "logits/rejected": -2.551830291748047, "logps/chosen": -33.391761779785156, "logps/rejected": -122.5763168334961, "loss": 0.3176, "rewards/accuracies": 1.0, "rewards/chosen": 2.1783547401428223, "rewards/margins": 6.110096454620361, "rewards/rejected": -3.9317421913146973, "step": 328 }, { "epoch": 0.9922714420358153, "grad_norm": 5.43665885925293, "learning_rate": 9.009063444108761e-07, "logits/chosen": -2.71295428276062, "logits/rejected": -2.6884877681732178, "logps/chosen": -50.05661392211914, "logps/rejected": -116.74736022949219, "loss": 0.496, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2056304216384888, "rewards/margins": 5.240615367889404, "rewards/rejected": -4.034985065460205, "step": 329 }, { "epoch": 0.9952874646559849, "grad_norm": 3.940122127532959, "learning_rate": 9.006042296072507e-07, "logits/chosen": -2.669586658477783, "logits/rejected": -2.686917543411255, "logps/chosen": -33.69731140136719, "logps/rejected": -114.8826904296875, "loss": 0.3711, "rewards/accuracies": 1.0, "rewards/chosen": 2.069420337677002, "rewards/margins": 5.898730278015137, "rewards/rejected": -3.829310178756714, "step": 330 }, { "epoch": 0.9983034872761546, "grad_norm": 3.930995464324951, "learning_rate": 9.003021148036254e-07, "logits/chosen": -2.6393558979034424, "logits/rejected": -2.6545491218566895, "logps/chosen": -47.183349609375, "logps/rejected": -131.93997192382812, "loss": 0.4503, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2302427291870117, "rewards/margins": 6.03405237197876, "rewards/rejected": -4.803810119628906, "step": 331 }, { "epoch": 1.0030160226201696, "grad_norm": 4.6676459312438965, "learning_rate": 9e-07, "logits/chosen": -2.6078975200653076, "logits/rejected": -2.5991365909576416, "logps/chosen": -41.71644592285156, "logps/rejected": -124.72638702392578, "loss": 0.4559, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 1.3980447053909302, "rewards/margins": 5.895102024078369, "rewards/rejected": -4.49705696105957, "step": 332 }, { "epoch": 1.0060320452403393, "grad_norm": 4.162962436676025, "learning_rate": 8.996978851963745e-07, "logits/chosen": -2.7681121826171875, "logits/rejected": -2.710023880004883, "logps/chosen": -39.01509475708008, "logps/rejected": -129.75816345214844, "loss": 0.4325, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7510559558868408, "rewards/margins": 6.262985706329346, "rewards/rejected": -4.511929035186768, "step": 333 }, { "epoch": 1.009048067860509, "grad_norm": 3.6636569499969482, "learning_rate": 8.993957703927492e-07, "logits/chosen": -2.607451915740967, "logits/rejected": -2.6621828079223633, "logps/chosen": -44.946475982666016, "logps/rejected": -129.98760986328125, "loss": 0.4264, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2466105222702026, "rewards/margins": 5.73425817489624, "rewards/rejected": -4.48764705657959, "step": 334 }, { "epoch": 1.0120640904806786, "grad_norm": 4.36713981628418, "learning_rate": 8.990936555891239e-07, "logits/chosen": -2.6556460857391357, "logits/rejected": -2.7028956413269043, "logps/chosen": -35.501747131347656, "logps/rejected": -135.37289428710938, "loss": 0.3377, "rewards/accuracies": 1.0, "rewards/chosen": 2.002695083618164, "rewards/margins": 6.968148708343506, "rewards/rejected": -4.965453147888184, "step": 335 }, { "epoch": 1.0120640904806786, "eval_logits/chosen": -2.5852880477905273, "eval_logits/rejected": -2.576824903488159, "eval_logps/chosen": -38.19837188720703, "eval_logps/rejected": -130.6110382080078, "eval_loss": 0.3266986310482025, "eval_rewards/accuracies": 0.9716494679450989, "eval_rewards/chosen": 2.120631217956543, "eval_rewards/margins": 6.554572582244873, "eval_rewards/rejected": -4.4339399337768555, "eval_runtime": 698.061, "eval_samples_per_second": 0.554, "eval_steps_per_second": 0.278, "step": 335 }, { "epoch": 1.0150801131008482, "grad_norm": 3.658106565475464, "learning_rate": 8.987915407854984e-07, "logits/chosen": -2.561880588531494, "logits/rejected": -2.582427978515625, "logps/chosen": -39.28596115112305, "logps/rejected": -132.12257385253906, "loss": 0.3624, "rewards/accuracies": 0.9375, "rewards/chosen": 1.800997018814087, "rewards/margins": 6.300075531005859, "rewards/rejected": -4.499078750610352, "step": 336 }, { "epoch": 1.0180961357210179, "grad_norm": 4.752818584442139, "learning_rate": 8.984894259818731e-07, "logits/chosen": -2.639437437057495, "logits/rejected": -2.5687363147735596, "logps/chosen": -29.728607177734375, "logps/rejected": -115.54638671875, "loss": 0.363, "rewards/accuracies": 1.0, "rewards/chosen": 2.6206555366516113, "rewards/margins": 6.02673864364624, "rewards/rejected": -3.406083345413208, "step": 337 }, { "epoch": 1.0211121583411875, "grad_norm": 3.873147964477539, "learning_rate": 8.981873111782477e-07, "logits/chosen": -2.7018280029296875, "logits/rejected": -2.6223933696746826, "logps/chosen": -44.460723876953125, "logps/rejected": -133.4439697265625, "loss": 0.4336, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5951755046844482, "rewards/margins": 6.197387218475342, "rewards/rejected": -4.6022114753723145, "step": 338 }, { "epoch": 1.0241281809613572, "grad_norm": 3.8269548416137695, "learning_rate": 8.978851963746223e-07, "logits/chosen": -2.716438055038452, "logits/rejected": -2.639073371887207, "logps/chosen": -34.72791290283203, "logps/rejected": -124.68952941894531, "loss": 0.3587, "rewards/accuracies": 1.0, "rewards/chosen": 1.8828498125076294, "rewards/margins": 6.314336776733398, "rewards/rejected": -4.431487083435059, "step": 339 }, { "epoch": 1.0271442035815268, "grad_norm": 3.36647629737854, "learning_rate": 8.975830815709969e-07, "logits/chosen": -2.649397611618042, "logits/rejected": -2.669314384460449, "logps/chosen": -31.955198287963867, "logps/rejected": -133.98941040039062, "loss": 0.2418, "rewards/accuracies": 1.0, "rewards/chosen": 2.4217822551727295, "rewards/margins": 7.40693473815918, "rewards/rejected": -4.985152721405029, "step": 340 }, { "epoch": 1.0301602262016964, "grad_norm": 4.599888324737549, "learning_rate": 8.972809667673716e-07, "logits/chosen": -2.6645689010620117, "logits/rejected": -2.6323342323303223, "logps/chosen": -34.88303756713867, "logps/rejected": -120.78034210205078, "loss": 0.4298, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3679826259613037, "rewards/margins": 6.060487270355225, "rewards/rejected": -3.692505121231079, "step": 341 }, { "epoch": 1.033176248821866, "grad_norm": 3.5391390323638916, "learning_rate": 8.969788519637462e-07, "logits/chosen": -2.739889144897461, "logits/rejected": -2.7083072662353516, "logps/chosen": -30.789888381958008, "logps/rejected": -130.27821350097656, "loss": 0.3335, "rewards/accuracies": 1.0, "rewards/chosen": 2.3230228424072266, "rewards/margins": 6.945010662078857, "rewards/rejected": -4.621987819671631, "step": 342 }, { "epoch": 1.0361922714420357, "grad_norm": 3.123992681503296, "learning_rate": 8.966767371601208e-07, "logits/chosen": -2.6930789947509766, "logits/rejected": -2.655794620513916, "logps/chosen": -39.828941345214844, "logps/rejected": -140.41580200195312, "loss": 0.3269, "rewards/accuracies": 0.9375, "rewards/chosen": 2.1118130683898926, "rewards/margins": 7.283477783203125, "rewards/rejected": -5.171665191650391, "step": 343 }, { "epoch": 1.0392082940622054, "grad_norm": 5.350164413452148, "learning_rate": 8.963746223564954e-07, "logits/chosen": -2.689378261566162, "logits/rejected": -2.6074092388153076, "logps/chosen": -39.390533447265625, "logps/rejected": -140.0839080810547, "loss": 0.3861, "rewards/accuracies": 1.0, "rewards/chosen": 1.781160593032837, "rewards/margins": 6.901391983032227, "rewards/rejected": -5.120232105255127, "step": 344 }, { "epoch": 1.042224316682375, "grad_norm": 3.901968240737915, "learning_rate": 8.9607250755287e-07, "logits/chosen": -2.5455172061920166, "logits/rejected": -2.5379178524017334, "logps/chosen": -31.678329467773438, "logps/rejected": -115.00810241699219, "loss": 0.3682, "rewards/accuracies": 1.0, "rewards/chosen": 2.5189497470855713, "rewards/margins": 6.213257789611816, "rewards/rejected": -3.694308280944824, "step": 345 }, { "epoch": 1.0452403393025447, "grad_norm": 3.1411855220794678, "learning_rate": 8.957703927492447e-07, "logits/chosen": -2.6069016456604004, "logits/rejected": -2.6271281242370605, "logps/chosen": -34.34900665283203, "logps/rejected": -134.09681701660156, "loss": 0.288, "rewards/accuracies": 1.0, "rewards/chosen": 2.1353697776794434, "rewards/margins": 6.973865032196045, "rewards/rejected": -4.83849573135376, "step": 346 }, { "epoch": 1.0482563619227143, "grad_norm": 4.462538719177246, "learning_rate": 8.954682779456193e-07, "logits/chosen": -2.6778037548065186, "logits/rejected": -2.6951372623443604, "logps/chosen": -32.154502868652344, "logps/rejected": -105.26219177246094, "loss": 0.4354, "rewards/accuracies": 1.0, "rewards/chosen": 2.3423285484313965, "rewards/margins": 5.024504661560059, "rewards/rejected": -2.682175636291504, "step": 347 }, { "epoch": 1.051272384542884, "grad_norm": 3.146876335144043, "learning_rate": 8.951661631419938e-07, "logits/chosen": -2.6585702896118164, "logits/rejected": -2.6162021160125732, "logps/chosen": -27.13525390625, "logps/rejected": -112.40546417236328, "loss": 0.3452, "rewards/accuracies": 0.96875, "rewards/chosen": 2.747126579284668, "rewards/margins": 6.15369176864624, "rewards/rejected": -3.406564712524414, "step": 348 }, { "epoch": 1.0542884071630536, "grad_norm": 4.363774299621582, "learning_rate": 8.948640483383686e-07, "logits/chosen": -2.6543266773223877, "logits/rejected": -2.6788370609283447, "logps/chosen": -21.04546546936035, "logps/rejected": -123.24430847167969, "loss": 0.2682, "rewards/accuracies": 1.0, "rewards/chosen": 3.1104469299316406, "rewards/margins": 7.3157148361206055, "rewards/rejected": -4.205267429351807, "step": 349 }, { "epoch": 1.0573044297832235, "grad_norm": 3.111248731613159, "learning_rate": 8.945619335347432e-07, "logits/chosen": -2.5920231342315674, "logits/rejected": -2.5557734966278076, "logps/chosen": -33.111873626708984, "logps/rejected": -131.44873046875, "loss": 0.3491, "rewards/accuracies": 0.96875, "rewards/chosen": 2.083944320678711, "rewards/margins": 6.352249622344971, "rewards/rejected": -4.268305778503418, "step": 350 }, { "epoch": 1.0603204524033931, "grad_norm": 3.2302279472351074, "learning_rate": 8.942598187311177e-07, "logits/chosen": -2.687819480895996, "logits/rejected": -2.581559658050537, "logps/chosen": -30.96585464477539, "logps/rejected": -125.91869354248047, "loss": 0.3646, "rewards/accuracies": 1.0, "rewards/chosen": 2.1631898880004883, "rewards/margins": 6.643764972686768, "rewards/rejected": -4.4805755615234375, "step": 351 }, { "epoch": 1.0633364750235628, "grad_norm": 4.309492588043213, "learning_rate": 8.939577039274924e-07, "logits/chosen": -2.779158592224121, "logits/rejected": -2.7228641510009766, "logps/chosen": -36.77633285522461, "logps/rejected": -120.86244201660156, "loss": 0.3958, "rewards/accuracies": 1.0, "rewards/chosen": 1.8021392822265625, "rewards/margins": 5.705634117126465, "rewards/rejected": -3.9034950733184814, "step": 352 }, { "epoch": 1.0663524976437324, "grad_norm": 5.097663402557373, "learning_rate": 8.936555891238671e-07, "logits/chosen": -2.6130876541137695, "logits/rejected": -2.6115260124206543, "logps/chosen": -41.70187759399414, "logps/rejected": -125.19181823730469, "loss": 0.4178, "rewards/accuracies": 1.0, "rewards/chosen": 1.5314619541168213, "rewards/margins": 5.8846116065979, "rewards/rejected": -4.353149890899658, "step": 353 }, { "epoch": 1.069368520263902, "grad_norm": 3.1641483306884766, "learning_rate": 8.933534743202417e-07, "logits/chosen": -2.71781063079834, "logits/rejected": -2.63875412940979, "logps/chosen": -33.333885192871094, "logps/rejected": -136.1980743408203, "loss": 0.2991, "rewards/accuracies": 0.9375, "rewards/chosen": 2.0311694145202637, "rewards/margins": 6.9780097007751465, "rewards/rejected": -4.946840286254883, "step": 354 }, { "epoch": 1.0723845428840717, "grad_norm": 4.52489709854126, "learning_rate": 8.930513595166162e-07, "logits/chosen": -2.6501433849334717, "logits/rejected": -2.627817153930664, "logps/chosen": -41.488712310791016, "logps/rejected": -128.68980407714844, "loss": 0.3699, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6007280349731445, "rewards/margins": 6.21989631652832, "rewards/rejected": -4.619168758392334, "step": 355 }, { "epoch": 1.0754005655042413, "grad_norm": 3.3184924125671387, "learning_rate": 8.927492447129909e-07, "logits/chosen": -2.709867477416992, "logits/rejected": -2.672165870666504, "logps/chosen": -35.67942810058594, "logps/rejected": -131.5745849609375, "loss": 0.3027, "rewards/accuracies": 0.9375, "rewards/chosen": 2.240568161010742, "rewards/margins": 6.862308979034424, "rewards/rejected": -4.62174129486084, "step": 356 }, { "epoch": 1.078416588124411, "grad_norm": 5.548001766204834, "learning_rate": 8.924471299093656e-07, "logits/chosen": -2.6415672302246094, "logits/rejected": -2.6234066486358643, "logps/chosen": -48.04874801635742, "logps/rejected": -131.65264892578125, "loss": 0.4652, "rewards/accuracies": 1.0, "rewards/chosen": 0.9716227054595947, "rewards/margins": 5.67972469329834, "rewards/rejected": -4.708103179931641, "step": 357 }, { "epoch": 1.0814326107445806, "grad_norm": 3.300654649734497, "learning_rate": 8.921450151057401e-07, "logits/chosen": -2.6766610145568848, "logits/rejected": -2.657275676727295, "logps/chosen": -35.0860481262207, "logps/rejected": -129.0859832763672, "loss": 0.3544, "rewards/accuracies": 0.9375, "rewards/chosen": 2.262845277786255, "rewards/margins": 6.632558345794678, "rewards/rejected": -4.369713306427002, "step": 358 }, { "epoch": 1.0844486333647503, "grad_norm": 3.861959218978882, "learning_rate": 8.918429003021147e-07, "logits/chosen": -2.615912914276123, "logits/rejected": -2.6507644653320312, "logps/chosen": -40.33019256591797, "logps/rejected": -127.79317474365234, "loss": 0.4364, "rewards/accuracies": 1.0, "rewards/chosen": 1.5692936182022095, "rewards/margins": 6.088464736938477, "rewards/rejected": -4.519171237945557, "step": 359 }, { "epoch": 1.08746465598492, "grad_norm": 4.362010478973389, "learning_rate": 8.915407854984895e-07, "logits/chosen": -2.6139235496520996, "logits/rejected": -2.640681505203247, "logps/chosen": -29.509244918823242, "logps/rejected": -117.6003189086914, "loss": 0.372, "rewards/accuracies": 0.9375, "rewards/chosen": 2.5226590633392334, "rewards/margins": 6.249569892883301, "rewards/rejected": -3.72691011428833, "step": 360 }, { "epoch": 1.0904806786050896, "grad_norm": 3.1087186336517334, "learning_rate": 8.91238670694864e-07, "logits/chosen": -2.745413064956665, "logits/rejected": -2.679997682571411, "logps/chosen": -34.76270294189453, "logps/rejected": -122.10379028320312, "loss": 0.4018, "rewards/accuracies": 0.9375, "rewards/chosen": 2.07231068611145, "rewards/margins": 6.213494300842285, "rewards/rejected": -4.141183376312256, "step": 361 }, { "epoch": 1.0934967012252592, "grad_norm": 2.3715031147003174, "learning_rate": 8.909365558912386e-07, "logits/chosen": -2.633187770843506, "logits/rejected": -2.598400115966797, "logps/chosen": -29.4879093170166, "logps/rejected": -125.24742889404297, "loss": 0.2892, "rewards/accuracies": 0.9375, "rewards/chosen": 2.5249319076538086, "rewards/margins": 6.898254871368408, "rewards/rejected": -4.3733229637146, "step": 362 }, { "epoch": 1.0965127238454289, "grad_norm": 3.285435914993286, "learning_rate": 8.906344410876132e-07, "logits/chosen": -2.5341429710388184, "logits/rejected": -2.613515853881836, "logps/chosen": -35.02667999267578, "logps/rejected": -119.26377868652344, "loss": 0.3602, "rewards/accuracies": 0.9375, "rewards/chosen": 1.848467469215393, "rewards/margins": 5.8711323738098145, "rewards/rejected": -4.022665500640869, "step": 363 }, { "epoch": 1.0995287464655985, "grad_norm": 4.674692153930664, "learning_rate": 8.903323262839879e-07, "logits/chosen": -2.539485216140747, "logits/rejected": -2.600135564804077, "logps/chosen": -40.30905532836914, "logps/rejected": -122.57794189453125, "loss": 0.4074, "rewards/accuracies": 0.90625, "rewards/chosen": 1.988041877746582, "rewards/margins": 6.199455738067627, "rewards/rejected": -4.211414337158203, "step": 364 }, { "epoch": 1.1025447690857682, "grad_norm": 3.365130662918091, "learning_rate": 8.900302114803625e-07, "logits/chosen": -2.5941519737243652, "logits/rejected": -2.5616648197174072, "logps/chosen": -37.676979064941406, "logps/rejected": -122.13693237304688, "loss": 0.3825, "rewards/accuracies": 1.0, "rewards/chosen": 1.915662407875061, "rewards/margins": 5.598390102386475, "rewards/rejected": -3.682727336883545, "step": 365 }, { "epoch": 1.1055607917059378, "grad_norm": 13.501348495483398, "learning_rate": 8.897280966767371e-07, "logits/chosen": -2.5771071910858154, "logits/rejected": -2.5835232734680176, "logps/chosen": -26.68564224243164, "logps/rejected": -137.95042419433594, "loss": 0.2031, "rewards/accuracies": 1.0, "rewards/chosen": 3.010491132736206, "rewards/margins": 8.211703300476074, "rewards/rejected": -5.201211929321289, "step": 366 }, { "epoch": 1.1085768143261074, "grad_norm": 3.5153636932373047, "learning_rate": 8.894259818731118e-07, "logits/chosen": -2.639951705932617, "logits/rejected": -2.5795040130615234, "logps/chosen": -32.42876052856445, "logps/rejected": -107.5257797241211, "loss": 0.4147, "rewards/accuracies": 0.9375, "rewards/chosen": 2.330228328704834, "rewards/margins": 5.462320327758789, "rewards/rejected": -3.132091999053955, "step": 367 }, { "epoch": 1.111592836946277, "grad_norm": 2.917733907699585, "learning_rate": 8.891238670694864e-07, "logits/chosen": -2.651254892349243, "logits/rejected": -2.637146234512329, "logps/chosen": -34.98809814453125, "logps/rejected": -117.09476470947266, "loss": 0.3723, "rewards/accuracies": 0.96875, "rewards/chosen": 2.010667562484741, "rewards/margins": 5.960440635681152, "rewards/rejected": -3.949772834777832, "step": 368 }, { "epoch": 1.1146088595664467, "grad_norm": 3.3716111183166504, "learning_rate": 8.88821752265861e-07, "logits/chosen": -2.722200870513916, "logits/rejected": -2.7017908096313477, "logps/chosen": -31.805641174316406, "logps/rejected": -133.77796936035156, "loss": 0.3322, "rewards/accuracies": 0.96875, "rewards/chosen": 2.7451624870300293, "rewards/margins": 7.253153324127197, "rewards/rejected": -4.507991313934326, "step": 369 }, { "epoch": 1.1176248821866164, "grad_norm": 4.482001781463623, "learning_rate": 8.885196374622355e-07, "logits/chosen": -2.737663507461548, "logits/rejected": -2.614938497543335, "logps/chosen": -25.736801147460938, "logps/rejected": -121.76387786865234, "loss": 0.2981, "rewards/accuracies": 0.96875, "rewards/chosen": 2.974294662475586, "rewards/margins": 6.853912353515625, "rewards/rejected": -3.879617691040039, "step": 370 }, { "epoch": 1.120640904806786, "grad_norm": 4.220541477203369, "learning_rate": 8.882175226586103e-07, "logits/chosen": -2.675368070602417, "logits/rejected": -2.59633207321167, "logps/chosen": -39.487892150878906, "logps/rejected": -143.22930908203125, "loss": 0.3096, "rewards/accuracies": 1.0, "rewards/chosen": 1.9329073429107666, "rewards/margins": 7.282592296600342, "rewards/rejected": -5.349684715270996, "step": 371 }, { "epoch": 1.1236569274269557, "grad_norm": 5.048531532287598, "learning_rate": 8.879154078549849e-07, "logits/chosen": -2.6240272521972656, "logits/rejected": -2.5953011512756348, "logps/chosen": -41.9521484375, "logps/rejected": -132.89450073242188, "loss": 0.3594, "rewards/accuracies": 1.0, "rewards/chosen": 1.9350029230117798, "rewards/margins": 6.516333103179932, "rewards/rejected": -4.581330299377441, "step": 372 }, { "epoch": 1.1266729500471253, "grad_norm": 2.1551239490509033, "learning_rate": 8.876132930513594e-07, "logits/chosen": -2.606280565261841, "logits/rejected": -2.5802319049835205, "logps/chosen": -28.611309051513672, "logps/rejected": -113.45401763916016, "loss": 0.3741, "rewards/accuracies": 0.9375, "rewards/chosen": 2.474576473236084, "rewards/margins": 6.158968448638916, "rewards/rejected": -3.684391975402832, "step": 373 }, { "epoch": 1.129688972667295, "grad_norm": 3.853666305541992, "learning_rate": 8.87311178247734e-07, "logits/chosen": -2.626345157623291, "logits/rejected": -2.6384716033935547, "logps/chosen": -36.264892578125, "logps/rejected": -123.7625961303711, "loss": 0.3659, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2573320865631104, "rewards/margins": 6.09507417678833, "rewards/rejected": -3.8377418518066406, "step": 374 }, { "epoch": 1.1327049952874646, "grad_norm": 3.9629576206207275, "learning_rate": 8.870090634441088e-07, "logits/chosen": -2.6528942584991455, "logits/rejected": -2.5696961879730225, "logps/chosen": -25.481897354125977, "logps/rejected": -104.53897857666016, "loss": 0.3816, "rewards/accuracies": 1.0, "rewards/chosen": 2.7533016204833984, "rewards/margins": 5.583933353424072, "rewards/rejected": -2.8306310176849365, "step": 375 }, { "epoch": 1.1357210179076342, "grad_norm": 2.8891425132751465, "learning_rate": 8.867069486404834e-07, "logits/chosen": -2.6041159629821777, "logits/rejected": -2.6171343326568604, "logps/chosen": -32.881099700927734, "logps/rejected": -125.77433776855469, "loss": 0.3267, "rewards/accuracies": 1.0, "rewards/chosen": 2.361501455307007, "rewards/margins": 6.847794532775879, "rewards/rejected": -4.486292362213135, "step": 376 }, { "epoch": 1.138737040527804, "grad_norm": 3.573014736175537, "learning_rate": 8.864048338368579e-07, "logits/chosen": -2.647970199584961, "logits/rejected": -2.5565731525421143, "logps/chosen": -33.605735778808594, "logps/rejected": -121.07559204101562, "loss": 0.351, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3671789169311523, "rewards/margins": 6.293142318725586, "rewards/rejected": -3.9259626865386963, "step": 377 }, { "epoch": 1.1417530631479735, "grad_norm": 6.157566547393799, "learning_rate": 8.861027190332326e-07, "logits/chosen": -2.5967774391174316, "logits/rejected": -2.586148977279663, "logps/chosen": -31.95098114013672, "logps/rejected": -116.12279510498047, "loss": 0.3927, "rewards/accuracies": 0.90625, "rewards/chosen": 2.3144407272338867, "rewards/margins": 5.80573844909668, "rewards/rejected": -3.491298198699951, "step": 378 }, { "epoch": 1.1447690857681432, "grad_norm": 4.332909107208252, "learning_rate": 8.858006042296073e-07, "logits/chosen": -2.7363295555114746, "logits/rejected": -2.6665849685668945, "logps/chosen": -37.10266876220703, "logps/rejected": -124.07866668701172, "loss": 0.3829, "rewards/accuracies": 1.0, "rewards/chosen": 2.175607204437256, "rewards/margins": 6.472297668457031, "rewards/rejected": -4.296689987182617, "step": 379 }, { "epoch": 1.1477851083883128, "grad_norm": 3.9091451168060303, "learning_rate": 8.854984894259818e-07, "logits/chosen": -2.6940743923187256, "logits/rejected": -2.6541998386383057, "logps/chosen": -30.541980743408203, "logps/rejected": -120.70744323730469, "loss": 0.3308, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5346360206604004, "rewards/margins": 6.617867469787598, "rewards/rejected": -4.0832319259643555, "step": 380 }, { "epoch": 1.1508011310084825, "grad_norm": 3.212101697921753, "learning_rate": 8.851963746223564e-07, "logits/chosen": -2.6816883087158203, "logits/rejected": -2.6313610076904297, "logps/chosen": -40.43682861328125, "logps/rejected": -126.89346313476562, "loss": 0.5096, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6531165838241577, "rewards/margins": 5.818180561065674, "rewards/rejected": -4.165064334869385, "step": 381 }, { "epoch": 1.1538171536286521, "grad_norm": 2.8550074100494385, "learning_rate": 8.848942598187312e-07, "logits/chosen": -2.640343189239502, "logits/rejected": -2.5991177558898926, "logps/chosen": -35.68329620361328, "logps/rejected": -132.8628387451172, "loss": 0.3561, "rewards/accuracies": 1.0, "rewards/chosen": 2.080545425415039, "rewards/margins": 6.820539474487305, "rewards/rejected": -4.739994049072266, "step": 382 }, { "epoch": 1.156833176248822, "grad_norm": 3.715437173843384, "learning_rate": 8.845921450151057e-07, "logits/chosen": -2.6613247394561768, "logits/rejected": -2.6210992336273193, "logps/chosen": -36.08149719238281, "logps/rejected": -124.29576110839844, "loss": 0.2926, "rewards/accuracies": 1.0, "rewards/chosen": 2.4281418323516846, "rewards/margins": 6.547021865844727, "rewards/rejected": -4.118879795074463, "step": 383 }, { "epoch": 1.1598491988689914, "grad_norm": 4.147194862365723, "learning_rate": 8.842900302114803e-07, "logits/chosen": -2.6782522201538086, "logits/rejected": -2.6073920726776123, "logps/chosen": -34.79983139038086, "logps/rejected": -121.28984832763672, "loss": 0.3725, "rewards/accuracies": 1.0, "rewards/chosen": 2.1100997924804688, "rewards/margins": 6.306461334228516, "rewards/rejected": -4.196361064910889, "step": 384 }, { "epoch": 1.1628652214891613, "grad_norm": 3.744464635848999, "learning_rate": 8.839879154078549e-07, "logits/chosen": -2.510359764099121, "logits/rejected": -2.530203104019165, "logps/chosen": -28.883705139160156, "logps/rejected": -111.38734436035156, "loss": 0.3761, "rewards/accuracies": 0.96875, "rewards/chosen": 2.6678361892700195, "rewards/margins": 6.032463073730469, "rewards/rejected": -3.36462664604187, "step": 385 }, { "epoch": 1.165881244109331, "grad_norm": 3.4364590644836426, "learning_rate": 8.836858006042296e-07, "logits/chosen": -2.6500768661499023, "logits/rejected": -2.582326889038086, "logps/chosen": -46.903472900390625, "logps/rejected": -138.54547119140625, "loss": 0.4153, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3955121040344238, "rewards/margins": 6.3324995040893555, "rewards/rejected": -4.936986923217773, "step": 386 }, { "epoch": 1.1688972667295006, "grad_norm": 2.760063409805298, "learning_rate": 8.833836858006042e-07, "logits/chosen": -2.5794472694396973, "logits/rejected": -2.558413505554199, "logps/chosen": -30.174182891845703, "logps/rejected": -129.99395751953125, "loss": 0.3108, "rewards/accuracies": 1.0, "rewards/chosen": 2.4133243560791016, "rewards/margins": 7.035831451416016, "rewards/rejected": -4.622507095336914, "step": 387 }, { "epoch": 1.1719132893496702, "grad_norm": 4.610401153564453, "learning_rate": 8.830815709969788e-07, "logits/chosen": -2.648977041244507, "logits/rejected": -2.6439523696899414, "logps/chosen": -27.9853515625, "logps/rejected": -129.1112060546875, "loss": 0.278, "rewards/accuracies": 1.0, "rewards/chosen": 2.759519577026367, "rewards/margins": 7.329242706298828, "rewards/rejected": -4.569723129272461, "step": 388 }, { "epoch": 1.1749293119698399, "grad_norm": 3.0968523025512695, "learning_rate": 8.827794561933534e-07, "logits/chosen": -2.6173176765441895, "logits/rejected": -2.5931921005249023, "logps/chosen": -25.76148223876953, "logps/rejected": -113.54474639892578, "loss": 0.4162, "rewards/accuracies": 0.875, "rewards/chosen": 2.4657137393951416, "rewards/margins": 6.013082504272461, "rewards/rejected": -3.5473690032958984, "step": 389 }, { "epoch": 1.1779453345900095, "grad_norm": 3.520857810974121, "learning_rate": 8.824773413897281e-07, "logits/chosen": -2.6932263374328613, "logits/rejected": -2.636134624481201, "logps/chosen": -40.99352264404297, "logps/rejected": -127.9123764038086, "loss": 0.4979, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4769361019134521, "rewards/margins": 5.900261878967285, "rewards/rejected": -4.423325538635254, "step": 390 }, { "epoch": 1.1809613572101791, "grad_norm": 3.433048725128174, "learning_rate": 8.821752265861027e-07, "logits/chosen": -2.652545690536499, "logits/rejected": -2.617588758468628, "logps/chosen": -42.00980758666992, "logps/rejected": -125.04736328125, "loss": 0.4382, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4961085319519043, "rewards/margins": 5.788578033447266, "rewards/rejected": -4.292469501495361, "step": 391 }, { "epoch": 1.1839773798303488, "grad_norm": 3.8033971786499023, "learning_rate": 8.818731117824772e-07, "logits/chosen": -2.5923399925231934, "logits/rejected": -2.5324153900146484, "logps/chosen": -30.36977195739746, "logps/rejected": -125.96150970458984, "loss": 0.2981, "rewards/accuracies": 1.0, "rewards/chosen": 2.5206663608551025, "rewards/margins": 6.9863080978393555, "rewards/rejected": -4.465641498565674, "step": 392 }, { "epoch": 1.1869934024505184, "grad_norm": 3.6326656341552734, "learning_rate": 8.815709969788519e-07, "logits/chosen": -2.572417736053467, "logits/rejected": -2.5828428268432617, "logps/chosen": -35.131141662597656, "logps/rejected": -119.7784652709961, "loss": 0.4162, "rewards/accuracies": 1.0, "rewards/chosen": 1.957563877105713, "rewards/margins": 5.731949806213379, "rewards/rejected": -3.774386405944824, "step": 393 }, { "epoch": 1.190009425070688, "grad_norm": 2.9293341636657715, "learning_rate": 8.812688821752266e-07, "logits/chosen": -2.669222354888916, "logits/rejected": -2.6172595024108887, "logps/chosen": -19.910266876220703, "logps/rejected": -124.52550506591797, "loss": 0.2705, "rewards/accuracies": 1.0, "rewards/chosen": 3.184619188308716, "rewards/margins": 7.323365211486816, "rewards/rejected": -4.1387457847595215, "step": 394 }, { "epoch": 1.1930254476908577, "grad_norm": 3.679981231689453, "learning_rate": 8.809667673716011e-07, "logits/chosen": -2.631603479385376, "logits/rejected": -2.602553129196167, "logps/chosen": -26.821683883666992, "logps/rejected": -129.9911346435547, "loss": 0.2582, "rewards/accuracies": 1.0, "rewards/chosen": 2.9484503269195557, "rewards/margins": 7.592673301696777, "rewards/rejected": -4.644222259521484, "step": 395 }, { "epoch": 1.1960414703110274, "grad_norm": 4.4811601638793945, "learning_rate": 8.806646525679758e-07, "logits/chosen": -2.6908724308013916, "logits/rejected": -2.588803291320801, "logps/chosen": -33.005645751953125, "logps/rejected": -135.1189727783203, "loss": 0.3515, "rewards/accuracies": 1.0, "rewards/chosen": 2.308114528656006, "rewards/margins": 7.244058609008789, "rewards/rejected": -4.935944080352783, "step": 396 }, { "epoch": 1.199057492931197, "grad_norm": 4.649314880371094, "learning_rate": 8.803625377643505e-07, "logits/chosen": -2.6562435626983643, "logits/rejected": -2.629227638244629, "logps/chosen": -29.984960556030273, "logps/rejected": -124.1937255859375, "loss": 0.3681, "rewards/accuracies": 1.0, "rewards/chosen": 2.682079553604126, "rewards/margins": 6.853893756866455, "rewards/rejected": -4.17181396484375, "step": 397 }, { "epoch": 1.2020735155513667, "grad_norm": 4.690675735473633, "learning_rate": 8.80060422960725e-07, "logits/chosen": -2.6866722106933594, "logits/rejected": -2.6034085750579834, "logps/chosen": -38.20228576660156, "logps/rejected": -124.33232879638672, "loss": 0.3858, "rewards/accuracies": 1.0, "rewards/chosen": 2.126711130142212, "rewards/margins": 6.315862655639648, "rewards/rejected": -4.189151287078857, "step": 398 }, { "epoch": 1.2050895381715363, "grad_norm": 3.814340591430664, "learning_rate": 8.797583081570996e-07, "logits/chosen": -2.6425907611846924, "logits/rejected": -2.6310837268829346, "logps/chosen": -27.77117156982422, "logps/rejected": -142.99606323242188, "loss": 0.2367, "rewards/accuracies": 0.9375, "rewards/chosen": 2.7502405643463135, "rewards/margins": 8.08284854888916, "rewards/rejected": -5.332607746124268, "step": 399 }, { "epoch": 1.208105560791706, "grad_norm": 4.336704254150391, "learning_rate": 8.794561933534743e-07, "logits/chosen": -2.6091439723968506, "logits/rejected": -2.5566728115081787, "logps/chosen": -33.35424041748047, "logps/rejected": -127.52921295166016, "loss": 0.37, "rewards/accuracies": 1.0, "rewards/chosen": 2.0855932235717773, "rewards/margins": 6.662317276000977, "rewards/rejected": -4.576724052429199, "step": 400 }, { "epoch": 1.2111215834118756, "grad_norm": 5.2793402671813965, "learning_rate": 8.79154078549849e-07, "logits/chosen": -2.672152042388916, "logits/rejected": -2.595928430557251, "logps/chosen": -43.8314094543457, "logps/rejected": -146.0111083984375, "loss": 0.4686, "rewards/accuracies": 0.875, "rewards/chosen": 1.4175505638122559, "rewards/margins": 6.830992221832275, "rewards/rejected": -5.413442134857178, "step": 401 }, { "epoch": 1.2141376060320452, "grad_norm": 3.352627992630005, "learning_rate": 8.788519637462235e-07, "logits/chosen": -2.6696279048919678, "logits/rejected": -2.624316453933716, "logps/chosen": -26.13186264038086, "logps/rejected": -115.17029571533203, "loss": 0.3712, "rewards/accuracies": 0.96875, "rewards/chosen": 2.6524548530578613, "rewards/margins": 6.4986371994018555, "rewards/rejected": -3.8461825847625732, "step": 402 }, { "epoch": 1.2141376060320452, "eval_logits/chosen": -2.5709149837493896, "eval_logits/rejected": -2.5528578758239746, "eval_logps/chosen": -37.48468780517578, "eval_logps/rejected": -133.1304931640625, "eval_loss": 0.318447470664978, "eval_rewards/accuracies": 0.9716494679450989, "eval_rewards/chosen": 2.191999912261963, "eval_rewards/margins": 6.877885818481445, "eval_rewards/rejected": -4.685885906219482, "eval_runtime": 696.3062, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.279, "step": 402 }, { "epoch": 1.2171536286522149, "grad_norm": 3.5946104526519775, "learning_rate": 8.785498489425981e-07, "logits/chosen": -2.651158094406128, "logits/rejected": -2.6594855785369873, "logps/chosen": -36.817955017089844, "logps/rejected": -126.29791259765625, "loss": 0.4016, "rewards/accuracies": 0.9375, "rewards/chosen": 2.136624574661255, "rewards/margins": 6.436273574829102, "rewards/rejected": -4.299648761749268, "step": 403 }, { "epoch": 1.2201696512723845, "grad_norm": 2.8605072498321533, "learning_rate": 8.782477341389728e-07, "logits/chosen": -2.6487696170806885, "logits/rejected": -2.663496732711792, "logps/chosen": -31.291601181030273, "logps/rejected": -136.46707153320312, "loss": 0.2945, "rewards/accuracies": 0.9375, "rewards/chosen": 2.283026695251465, "rewards/margins": 7.265111446380615, "rewards/rejected": -4.98208475112915, "step": 404 }, { "epoch": 1.2231856738925542, "grad_norm": 3.7316434383392334, "learning_rate": 8.779456193353474e-07, "logits/chosen": -2.687020778656006, "logits/rejected": -2.5991485118865967, "logps/chosen": -41.47602081298828, "logps/rejected": -143.32275390625, "loss": 0.3745, "rewards/accuracies": 1.0, "rewards/chosen": 1.837193489074707, "rewards/margins": 7.28604793548584, "rewards/rejected": -5.448855400085449, "step": 405 }, { "epoch": 1.2262016965127238, "grad_norm": 5.143167495727539, "learning_rate": 8.77643504531722e-07, "logits/chosen": -2.5894174575805664, "logits/rejected": -2.611586093902588, "logps/chosen": -30.199447631835938, "logps/rejected": -128.52920532226562, "loss": 0.2903, "rewards/accuracies": 0.96875, "rewards/chosen": 2.8545608520507812, "rewards/margins": 7.2606964111328125, "rewards/rejected": -4.4061360359191895, "step": 406 }, { "epoch": 1.2292177191328935, "grad_norm": 4.44464111328125, "learning_rate": 8.773413897280967e-07, "logits/chosen": -2.624497890472412, "logits/rejected": -2.6217334270477295, "logps/chosen": -29.098690032958984, "logps/rejected": -129.20895385742188, "loss": 0.3051, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5127782821655273, "rewards/margins": 7.156905174255371, "rewards/rejected": -4.6441264152526855, "step": 407 }, { "epoch": 1.232233741753063, "grad_norm": 4.54266881942749, "learning_rate": 8.770392749244712e-07, "logits/chosen": -2.666365623474121, "logits/rejected": -2.615244150161743, "logps/chosen": -31.326251983642578, "logps/rejected": -130.0301055908203, "loss": 0.3287, "rewards/accuracies": 1.0, "rewards/chosen": 2.303025007247925, "rewards/margins": 6.930612564086914, "rewards/rejected": -4.627586841583252, "step": 408 }, { "epoch": 1.2352497643732328, "grad_norm": 4.512946605682373, "learning_rate": 8.767371601208459e-07, "logits/chosen": -2.6773552894592285, "logits/rejected": -2.6356678009033203, "logps/chosen": -34.59609603881836, "logps/rejected": -122.56100463867188, "loss": 0.4467, "rewards/accuracies": 0.9375, "rewards/chosen": 2.2711949348449707, "rewards/margins": 6.117222785949707, "rewards/rejected": -3.8460285663604736, "step": 409 }, { "epoch": 1.2382657869934024, "grad_norm": 3.3937318325042725, "learning_rate": 8.764350453172205e-07, "logits/chosen": -2.7034354209899902, "logits/rejected": -2.685305595397949, "logps/chosen": -39.89870071411133, "logps/rejected": -127.5386962890625, "loss": 0.4, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7804090976715088, "rewards/margins": 6.285686492919922, "rewards/rejected": -4.50527811050415, "step": 410 }, { "epoch": 1.241281809613572, "grad_norm": 3.053697347640991, "learning_rate": 8.761329305135951e-07, "logits/chosen": -2.6221790313720703, "logits/rejected": -2.595590591430664, "logps/chosen": -34.11355209350586, "logps/rejected": -131.36343383789062, "loss": 0.3577, "rewards/accuracies": 1.0, "rewards/chosen": 2.002330780029297, "rewards/margins": 6.8397297859191895, "rewards/rejected": -4.837399005889893, "step": 411 }, { "epoch": 1.2442978322337417, "grad_norm": 3.332669973373413, "learning_rate": 8.758308157099698e-07, "logits/chosen": -2.6731531620025635, "logits/rejected": -2.635443687438965, "logps/chosen": -35.861427307128906, "logps/rejected": -119.41816711425781, "loss": 0.3909, "rewards/accuracies": 1.0, "rewards/chosen": 1.9489860534667969, "rewards/margins": 6.152676105499268, "rewards/rejected": -4.203690528869629, "step": 412 }, { "epoch": 1.2473138548539113, "grad_norm": 3.301947593688965, "learning_rate": 8.755287009063444e-07, "logits/chosen": -2.5486268997192383, "logits/rejected": -2.5656380653381348, "logps/chosen": -30.23056983947754, "logps/rejected": -123.27662658691406, "loss": 0.2715, "rewards/accuracies": 1.0, "rewards/chosen": 2.4584786891937256, "rewards/margins": 7.1008124351501465, "rewards/rejected": -4.642333984375, "step": 413 }, { "epoch": 1.250329877474081, "grad_norm": 6.738383769989014, "learning_rate": 8.75226586102719e-07, "logits/chosen": -2.666435718536377, "logits/rejected": -2.582324981689453, "logps/chosen": -41.32102966308594, "logps/rejected": -137.59506225585938, "loss": 0.3967, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4574270248413086, "rewards/margins": 6.865550994873047, "rewards/rejected": -5.408123970031738, "step": 414 }, { "epoch": 1.2533459000942506, "grad_norm": 4.090964317321777, "learning_rate": 8.749244712990936e-07, "logits/chosen": -2.6944360733032227, "logits/rejected": -2.684675693511963, "logps/chosen": -31.00346565246582, "logps/rejected": -106.96146392822266, "loss": 0.4139, "rewards/accuracies": 1.0, "rewards/chosen": 2.3958587646484375, "rewards/margins": 5.820653438568115, "rewards/rejected": -3.4247944355010986, "step": 415 }, { "epoch": 1.2563619227144205, "grad_norm": 7.094062328338623, "learning_rate": 8.746223564954683e-07, "logits/chosen": -2.677428722381592, "logits/rejected": -2.7173149585723877, "logps/chosen": -28.689510345458984, "logps/rejected": -99.30183410644531, "loss": 0.4703, "rewards/accuracies": 1.0, "rewards/chosen": 2.364509105682373, "rewards/margins": 5.080435752868652, "rewards/rejected": -2.7159268856048584, "step": 416 }, { "epoch": 1.25937794533459, "grad_norm": 6.395017147064209, "learning_rate": 8.743202416918428e-07, "logits/chosen": -2.663949966430664, "logits/rejected": -2.6521098613739014, "logps/chosen": -40.11581802368164, "logps/rejected": -138.5645751953125, "loss": 0.3197, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8280348777770996, "rewards/margins": 7.234889984130859, "rewards/rejected": -5.406854629516602, "step": 417 }, { "epoch": 1.2623939679547598, "grad_norm": 5.840503692626953, "learning_rate": 8.740181268882175e-07, "logits/chosen": -2.725738525390625, "logits/rejected": -2.6508729457855225, "logps/chosen": -36.62067794799805, "logps/rejected": -127.6308364868164, "loss": 0.3651, "rewards/accuracies": 1.0, "rewards/chosen": 2.009166955947876, "rewards/margins": 6.501208305358887, "rewards/rejected": -4.49204158782959, "step": 418 }, { "epoch": 1.2654099905749292, "grad_norm": 4.895381927490234, "learning_rate": 8.737160120845921e-07, "logits/chosen": -2.7609715461730957, "logits/rejected": -2.6896426677703857, "logps/chosen": -42.49498748779297, "logps/rejected": -118.15975952148438, "loss": 0.4869, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7884998321533203, "rewards/margins": 5.270386695861816, "rewards/rejected": -3.481886863708496, "step": 419 }, { "epoch": 1.268426013195099, "grad_norm": 4.802533149719238, "learning_rate": 8.734138972809667e-07, "logits/chosen": -2.7379350662231445, "logits/rejected": -2.7010395526885986, "logps/chosen": -34.542808532714844, "logps/rejected": -134.08187866210938, "loss": 0.3875, "rewards/accuracies": 1.0, "rewards/chosen": 2.1056771278381348, "rewards/margins": 7.031365394592285, "rewards/rejected": -4.925688743591309, "step": 420 }, { "epoch": 1.2714420358152685, "grad_norm": 3.5438716411590576, "learning_rate": 8.731117824773413e-07, "logits/chosen": -2.659017562866211, "logits/rejected": -2.6784284114837646, "logps/chosen": -29.533952713012695, "logps/rejected": -141.922607421875, "loss": 0.2231, "rewards/accuracies": 1.0, "rewards/chosen": 2.8427352905273438, "rewards/margins": 8.384220123291016, "rewards/rejected": -5.541484832763672, "step": 421 }, { "epoch": 1.2744580584354384, "grad_norm": 3.2461063861846924, "learning_rate": 8.72809667673716e-07, "logits/chosen": -2.589169979095459, "logits/rejected": -2.6520581245422363, "logps/chosen": -32.463401794433594, "logps/rejected": -134.15345764160156, "loss": 0.2926, "rewards/accuracies": 1.0, "rewards/chosen": 2.5563366413116455, "rewards/margins": 7.639386177062988, "rewards/rejected": -5.083049774169922, "step": 422 }, { "epoch": 1.2774740810556078, "grad_norm": 3.6186447143554688, "learning_rate": 8.725075528700905e-07, "logits/chosen": -2.6334877014160156, "logits/rejected": -2.6382880210876465, "logps/chosen": -43.52091598510742, "logps/rejected": -114.36335754394531, "loss": 0.4526, "rewards/accuracies": 0.84375, "rewards/chosen": 1.6509673595428467, "rewards/margins": 5.323124885559082, "rewards/rejected": -3.6721572875976562, "step": 423 }, { "epoch": 1.2804901036757776, "grad_norm": 8.39200496673584, "learning_rate": 8.722054380664652e-07, "logits/chosen": -2.6072001457214355, "logits/rejected": -2.560779333114624, "logps/chosen": -21.59418487548828, "logps/rejected": -104.22137451171875, "loss": 0.3638, "rewards/accuracies": 1.0, "rewards/chosen": 3.2933874130249023, "rewards/margins": 6.148227691650391, "rewards/rejected": -2.8548405170440674, "step": 424 }, { "epoch": 1.2835061262959473, "grad_norm": 4.411486625671387, "learning_rate": 8.719033232628399e-07, "logits/chosen": -2.64202880859375, "logits/rejected": -2.6123604774475098, "logps/chosen": -31.458375930786133, "logps/rejected": -132.86480712890625, "loss": 0.2836, "rewards/accuracies": 1.0, "rewards/chosen": 2.2682859897613525, "rewards/margins": 7.291298866271973, "rewards/rejected": -5.023013114929199, "step": 425 }, { "epoch": 1.286522148916117, "grad_norm": 4.914824962615967, "learning_rate": 8.716012084592144e-07, "logits/chosen": -2.699495792388916, "logits/rejected": -2.622744560241699, "logps/chosen": -35.16413497924805, "logps/rejected": -145.1307830810547, "loss": 0.281, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0781729221343994, "rewards/margins": 7.714895248413086, "rewards/rejected": -5.636722087860107, "step": 426 }, { "epoch": 1.2895381715362866, "grad_norm": 4.325778484344482, "learning_rate": 8.712990936555891e-07, "logits/chosen": -2.654114246368408, "logits/rejected": -2.6091134548187256, "logps/chosen": -41.88032150268555, "logps/rejected": -128.94497680664062, "loss": 0.4036, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7297769784927368, "rewards/margins": 6.27387809753418, "rewards/rejected": -4.544100761413574, "step": 427 }, { "epoch": 1.2925541941564562, "grad_norm": 4.019529819488525, "learning_rate": 8.709969788519637e-07, "logits/chosen": -2.7478652000427246, "logits/rejected": -2.6607964038848877, "logps/chosen": -35.120704650878906, "logps/rejected": -138.49203491210938, "loss": 0.2995, "rewards/accuracies": 1.0, "rewards/chosen": 2.363480567932129, "rewards/margins": 7.599124908447266, "rewards/rejected": -5.2356438636779785, "step": 428 }, { "epoch": 1.2955702167766259, "grad_norm": 4.797628879547119, "learning_rate": 8.706948640483384e-07, "logits/chosen": -2.6023125648498535, "logits/rejected": -2.5555953979492188, "logps/chosen": -31.773351669311523, "logps/rejected": -130.01678466796875, "loss": 0.2838, "rewards/accuracies": 1.0, "rewards/chosen": 2.4679160118103027, "rewards/margins": 7.445218086242676, "rewards/rejected": -4.977302551269531, "step": 429 }, { "epoch": 1.2985862393967955, "grad_norm": 4.490795135498047, "learning_rate": 8.703927492447129e-07, "logits/chosen": -2.6033594608306885, "logits/rejected": -2.6418826580047607, "logps/chosen": -28.714481353759766, "logps/rejected": -129.5844268798828, "loss": 0.2273, "rewards/accuracies": 1.0, "rewards/chosen": 2.6648707389831543, "rewards/margins": 7.367170810699463, "rewards/rejected": -4.702300071716309, "step": 430 }, { "epoch": 1.3016022620169652, "grad_norm": 3.773939609527588, "learning_rate": 8.700906344410876e-07, "logits/chosen": -2.6886165142059326, "logits/rejected": -2.632314920425415, "logps/chosen": -37.16127014160156, "logps/rejected": -136.47940063476562, "loss": 0.3174, "rewards/accuracies": 0.9375, "rewards/chosen": 2.101212739944458, "rewards/margins": 6.836058616638184, "rewards/rejected": -4.7348456382751465, "step": 431 }, { "epoch": 1.3046182846371348, "grad_norm": 3.8830366134643555, "learning_rate": 8.697885196374623e-07, "logits/chosen": -2.6565182209014893, "logits/rejected": -2.558316230773926, "logps/chosen": -37.177978515625, "logps/rejected": -135.97592163085938, "loss": 0.3225, "rewards/accuracies": 0.9375, "rewards/chosen": 2.1961305141448975, "rewards/margins": 7.10683012008667, "rewards/rejected": -4.910698890686035, "step": 432 }, { "epoch": 1.3076343072573045, "grad_norm": 3.65069842338562, "learning_rate": 8.694864048338368e-07, "logits/chosen": -2.7137649059295654, "logits/rejected": -2.676199436187744, "logps/chosen": -26.815013885498047, "logps/rejected": -111.25971984863281, "loss": 0.4122, "rewards/accuracies": 0.90625, "rewards/chosen": 2.2655794620513916, "rewards/margins": 6.005194187164307, "rewards/rejected": -3.739614725112915, "step": 433 }, { "epoch": 1.310650329877474, "grad_norm": 9.204977989196777, "learning_rate": 8.691842900302114e-07, "logits/chosen": -2.618049144744873, "logits/rejected": -2.582622766494751, "logps/chosen": -40.611202239990234, "logps/rejected": -150.483642578125, "loss": 0.3326, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7644100189208984, "rewards/margins": 7.744347095489502, "rewards/rejected": -5.9799370765686035, "step": 434 }, { "epoch": 1.3136663524976437, "grad_norm": 4.565004348754883, "learning_rate": 8.688821752265861e-07, "logits/chosen": -2.622467279434204, "logits/rejected": -2.5320816040039062, "logps/chosen": -34.32563018798828, "logps/rejected": -116.62334442138672, "loss": 0.3999, "rewards/accuracies": 0.9375, "rewards/chosen": 2.111335277557373, "rewards/margins": 6.066380500793457, "rewards/rejected": -3.955045461654663, "step": 435 }, { "epoch": 1.3166823751178134, "grad_norm": 4.295002460479736, "learning_rate": 8.685800604229607e-07, "logits/chosen": -2.6654536724090576, "logits/rejected": -2.5361862182617188, "logps/chosen": -43.13106155395508, "logps/rejected": -153.6588134765625, "loss": 0.3354, "rewards/accuracies": 1.0, "rewards/chosen": 1.479372262954712, "rewards/margins": 7.437130451202393, "rewards/rejected": -5.95775842666626, "step": 436 }, { "epoch": 1.319698397737983, "grad_norm": 3.0645411014556885, "learning_rate": 8.682779456193353e-07, "logits/chosen": -2.590231418609619, "logits/rejected": -2.5123543739318848, "logps/chosen": -31.090534210205078, "logps/rejected": -138.1846923828125, "loss": 0.2859, "rewards/accuracies": 1.0, "rewards/chosen": 2.600130796432495, "rewards/margins": 7.802825927734375, "rewards/rejected": -5.202694892883301, "step": 437 }, { "epoch": 1.3227144203581527, "grad_norm": 3.173476457595825, "learning_rate": 8.679758308157099e-07, "logits/chosen": -2.6747076511383057, "logits/rejected": -2.6828205585479736, "logps/chosen": -29.420665740966797, "logps/rejected": -124.65110778808594, "loss": 0.3047, "rewards/accuracies": 0.96875, "rewards/chosen": 2.6616127490997314, "rewards/margins": 7.0696234703063965, "rewards/rejected": -4.408011436462402, "step": 438 }, { "epoch": 1.3257304429783223, "grad_norm": 2.7070767879486084, "learning_rate": 8.676737160120845e-07, "logits/chosen": -2.6542177200317383, "logits/rejected": -2.539849281311035, "logps/chosen": -31.132553100585938, "logps/rejected": -129.4221649169922, "loss": 0.2549, "rewards/accuracies": 1.0, "rewards/chosen": 2.1400201320648193, "rewards/margins": 6.6458845138549805, "rewards/rejected": -4.505864143371582, "step": 439 }, { "epoch": 1.328746465598492, "grad_norm": 3.817420244216919, "learning_rate": 8.673716012084592e-07, "logits/chosen": -2.6296396255493164, "logits/rejected": -2.662964105606079, "logps/chosen": -29.86449432373047, "logps/rejected": -131.6903076171875, "loss": 0.3472, "rewards/accuracies": 1.0, "rewards/chosen": 2.4101524353027344, "rewards/margins": 7.3087286949157715, "rewards/rejected": -4.8985772132873535, "step": 440 }, { "epoch": 1.3317624882186616, "grad_norm": 3.8132166862487793, "learning_rate": 8.670694864048338e-07, "logits/chosen": -2.6695303916931152, "logits/rejected": -2.5932044982910156, "logps/chosen": -30.39085578918457, "logps/rejected": -140.88101196289062, "loss": 0.3806, "rewards/accuracies": 1.0, "rewards/chosen": 2.3240675926208496, "rewards/margins": 7.435549259185791, "rewards/rejected": -5.1114821434021, "step": 441 }, { "epoch": 1.3347785108388313, "grad_norm": 5.374870300292969, "learning_rate": 8.667673716012083e-07, "logits/chosen": -2.610045909881592, "logits/rejected": -2.692042589187622, "logps/chosen": -35.46611785888672, "logps/rejected": -135.86456298828125, "loss": 0.3583, "rewards/accuracies": 1.0, "rewards/chosen": 2.0147547721862793, "rewards/margins": 6.890933036804199, "rewards/rejected": -4.87617826461792, "step": 442 }, { "epoch": 1.337794533459001, "grad_norm": 4.125868797302246, "learning_rate": 8.664652567975831e-07, "logits/chosen": -2.6362929344177246, "logits/rejected": -2.601681709289551, "logps/chosen": -38.634063720703125, "logps/rejected": -122.50833129882812, "loss": 0.4115, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8140877485275269, "rewards/margins": 5.901726245880127, "rewards/rejected": -4.087637901306152, "step": 443 }, { "epoch": 1.3408105560791705, "grad_norm": 4.995226860046387, "learning_rate": 8.661631419939577e-07, "logits/chosen": -2.617607593536377, "logits/rejected": -2.6858739852905273, "logps/chosen": -42.423912048339844, "logps/rejected": -124.53815460205078, "loss": 0.4266, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7821537256240845, "rewards/margins": 6.332677364349365, "rewards/rejected": -4.55052375793457, "step": 444 }, { "epoch": 1.3438265786993402, "grad_norm": 3.1845128536224365, "learning_rate": 8.658610271903322e-07, "logits/chosen": -2.5964179039001465, "logits/rejected": -2.5737838745117188, "logps/chosen": -36.85116195678711, "logps/rejected": -144.99105834960938, "loss": 0.3213, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8886585235595703, "rewards/margins": 7.418227195739746, "rewards/rejected": -5.529568672180176, "step": 445 }, { "epoch": 1.3468426013195098, "grad_norm": 4.065856456756592, "learning_rate": 8.655589123867069e-07, "logits/chosen": -2.6754703521728516, "logits/rejected": -2.6417527198791504, "logps/chosen": -37.86794662475586, "logps/rejected": -138.3861083984375, "loss": 0.3994, "rewards/accuracies": 0.90625, "rewards/chosen": 1.8392362594604492, "rewards/margins": 7.032003402709961, "rewards/rejected": -5.19276762008667, "step": 446 }, { "epoch": 1.3498586239396795, "grad_norm": 3.2530853748321533, "learning_rate": 8.652567975830816e-07, "logits/chosen": -2.643253803253174, "logits/rejected": -2.5754194259643555, "logps/chosen": -25.048635482788086, "logps/rejected": -134.44578552246094, "loss": 0.2145, "rewards/accuracies": 1.0, "rewards/chosen": 2.8187708854675293, "rewards/margins": 8.141624450683594, "rewards/rejected": -5.322853088378906, "step": 447 }, { "epoch": 1.3528746465598491, "grad_norm": 4.449993133544922, "learning_rate": 8.649546827794561e-07, "logits/chosen": -2.7014005184173584, "logits/rejected": -2.6599621772766113, "logps/chosen": -33.806922912597656, "logps/rejected": -144.5841064453125, "loss": 0.327, "rewards/accuracies": 0.96875, "rewards/chosen": 2.395293712615967, "rewards/margins": 8.06898021697998, "rewards/rejected": -5.673686981201172, "step": 448 }, { "epoch": 1.3558906691800188, "grad_norm": 4.740169525146484, "learning_rate": 8.646525679758307e-07, "logits/chosen": -2.6897835731506348, "logits/rejected": -2.575303077697754, "logps/chosen": -34.72779846191406, "logps/rejected": -122.57270050048828, "loss": 0.3486, "rewards/accuracies": 1.0, "rewards/chosen": 2.0646491050720215, "rewards/margins": 6.132500171661377, "rewards/rejected": -4.067850589752197, "step": 449 }, { "epoch": 1.3589066918001884, "grad_norm": 4.532487392425537, "learning_rate": 8.643504531722055e-07, "logits/chosen": -2.6721267700195312, "logits/rejected": -2.6545965671539307, "logps/chosen": -40.42285919189453, "logps/rejected": -126.94928741455078, "loss": 0.4312, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8656980991363525, "rewards/margins": 6.233343601226807, "rewards/rejected": -4.367645263671875, "step": 450 }, { "epoch": 1.3619227144203583, "grad_norm": 3.420994758605957, "learning_rate": 8.6404833836858e-07, "logits/chosen": -2.679687976837158, "logits/rejected": -2.666733980178833, "logps/chosen": -36.589195251464844, "logps/rejected": -128.6448211669922, "loss": 0.3958, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9974344968795776, "rewards/margins": 6.749525547027588, "rewards/rejected": -4.752091407775879, "step": 451 }, { "epoch": 1.3649387370405277, "grad_norm": 3.3164684772491455, "learning_rate": 8.637462235649546e-07, "logits/chosen": -2.7110681533813477, "logits/rejected": -2.6346588134765625, "logps/chosen": -37.74477005004883, "logps/rejected": -138.41796875, "loss": 0.3172, "rewards/accuracies": 1.0, "rewards/chosen": 1.9138875007629395, "rewards/margins": 7.345833778381348, "rewards/rejected": -5.431946277618408, "step": 452 }, { "epoch": 1.3679547596606976, "grad_norm": 3.942905902862549, "learning_rate": 8.634441087613292e-07, "logits/chosen": -2.5970821380615234, "logits/rejected": -2.573853015899658, "logps/chosen": -32.1584587097168, "logps/rejected": -118.62415313720703, "loss": 0.4251, "rewards/accuracies": 0.96875, "rewards/chosen": 2.4712114334106445, "rewards/margins": 6.490658760070801, "rewards/rejected": -4.0194478034973145, "step": 453 }, { "epoch": 1.370970782280867, "grad_norm": 4.833600044250488, "learning_rate": 8.63141993957704e-07, "logits/chosen": -2.6363720893859863, "logits/rejected": -2.639150381088257, "logps/chosen": -24.883621215820312, "logps/rejected": -125.1966552734375, "loss": 0.2339, "rewards/accuracies": 1.0, "rewards/chosen": 2.8675992488861084, "rewards/margins": 7.262654781341553, "rewards/rejected": -4.395055294036865, "step": 454 }, { "epoch": 1.3739868049010369, "grad_norm": 2.084524154663086, "learning_rate": 8.628398791540785e-07, "logits/chosen": -2.662155866622925, "logits/rejected": -2.650319814682007, "logps/chosen": -32.39781951904297, "logps/rejected": -122.53170013427734, "loss": 0.4268, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9808425903320312, "rewards/margins": 6.274322509765625, "rewards/rejected": -4.293479919433594, "step": 455 }, { "epoch": 1.3770028275212063, "grad_norm": 3.2272825241088867, "learning_rate": 8.625377643504531e-07, "logits/chosen": -2.6405344009399414, "logits/rejected": -2.6702001094818115, "logps/chosen": -44.06291198730469, "logps/rejected": -120.54366302490234, "loss": 0.4473, "rewards/accuracies": 0.96875, "rewards/chosen": 1.718567967414856, "rewards/margins": 5.9401655197143555, "rewards/rejected": -4.221597671508789, "step": 456 }, { "epoch": 1.3800188501413762, "grad_norm": 2.7811591625213623, "learning_rate": 8.622356495468277e-07, "logits/chosen": -2.6087229251861572, "logits/rejected": -2.5973422527313232, "logps/chosen": -32.56956100463867, "logps/rejected": -140.24278259277344, "loss": 0.2491, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0337326526641846, "rewards/margins": 7.733336925506592, "rewards/rejected": -5.699604511260986, "step": 457 }, { "epoch": 1.3830348727615456, "grad_norm": 3.176597833633423, "learning_rate": 8.619335347432024e-07, "logits/chosen": -2.653935432434082, "logits/rejected": -2.6715312004089355, "logps/chosen": -25.4039363861084, "logps/rejected": -143.6988983154297, "loss": 0.189, "rewards/accuracies": 1.0, "rewards/chosen": 2.893488645553589, "rewards/margins": 8.468833923339844, "rewards/rejected": -5.575345039367676, "step": 458 }, { "epoch": 1.3860508953817154, "grad_norm": 4.456879138946533, "learning_rate": 8.61631419939577e-07, "logits/chosen": -2.5959579944610596, "logits/rejected": -2.5654547214508057, "logps/chosen": -30.302583694458008, "logps/rejected": -114.6581802368164, "loss": 0.3432, "rewards/accuracies": 1.0, "rewards/chosen": 2.653959274291992, "rewards/margins": 6.5122880935668945, "rewards/rejected": -3.8583285808563232, "step": 459 }, { "epoch": 1.389066918001885, "grad_norm": 3.9996964931488037, "learning_rate": 8.613293051359516e-07, "logits/chosen": -2.7217421531677246, "logits/rejected": -2.644327402114868, "logps/chosen": -27.944547653198242, "logps/rejected": -125.9365234375, "loss": 0.3497, "rewards/accuracies": 0.96875, "rewards/chosen": 2.563626289367676, "rewards/margins": 6.969139575958252, "rewards/rejected": -4.405513286590576, "step": 460 }, { "epoch": 1.3920829406220547, "grad_norm": 3.616483449935913, "learning_rate": 8.610271903323263e-07, "logits/chosen": -2.6641688346862793, "logits/rejected": -2.563821315765381, "logps/chosen": -33.33052444458008, "logps/rejected": -138.61582946777344, "loss": 0.3678, "rewards/accuracies": 0.9375, "rewards/chosen": 2.3749759197235107, "rewards/margins": 7.300210952758789, "rewards/rejected": -4.925234317779541, "step": 461 }, { "epoch": 1.3950989632422244, "grad_norm": 4.534832954406738, "learning_rate": 8.607250755287009e-07, "logits/chosen": -2.726165533065796, "logits/rejected": -2.6573293209075928, "logps/chosen": -35.54429244995117, "logps/rejected": -132.8628387451172, "loss": 0.3172, "rewards/accuracies": 0.9375, "rewards/chosen": 2.411130905151367, "rewards/margins": 7.359520435333252, "rewards/rejected": -4.948390007019043, "step": 462 }, { "epoch": 1.398114985862394, "grad_norm": 4.693966388702393, "learning_rate": 8.604229607250755e-07, "logits/chosen": -2.741590976715088, "logits/rejected": -2.6861772537231445, "logps/chosen": -31.211631774902344, "logps/rejected": -131.43307495117188, "loss": 0.3223, "rewards/accuracies": 0.9375, "rewards/chosen": 2.5256645679473877, "rewards/margins": 7.432143688201904, "rewards/rejected": -4.9064788818359375, "step": 463 }, { "epoch": 1.4011310084825637, "grad_norm": 4.28096342086792, "learning_rate": 8.6012084592145e-07, "logits/chosen": -2.6433112621307373, "logits/rejected": -2.611628770828247, "logps/chosen": -32.75000762939453, "logps/rejected": -140.10391235351562, "loss": 0.3551, "rewards/accuracies": 1.0, "rewards/chosen": 1.9582346677780151, "rewards/margins": 7.599189758300781, "rewards/rejected": -5.640955448150635, "step": 464 }, { "epoch": 1.4041470311027333, "grad_norm": 2.8515517711639404, "learning_rate": 8.598187311178248e-07, "logits/chosen": -2.665653705596924, "logits/rejected": -2.5744190216064453, "logps/chosen": -38.38271713256836, "logps/rejected": -147.9900665283203, "loss": 0.3067, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6347956657409668, "rewards/margins": 7.839282035827637, "rewards/rejected": -6.20448637008667, "step": 465 }, { "epoch": 1.407163053722903, "grad_norm": 5.6484456062316895, "learning_rate": 8.595166163141994e-07, "logits/chosen": -2.6867709159851074, "logits/rejected": -2.667689085006714, "logps/chosen": -41.547386169433594, "logps/rejected": -145.30393981933594, "loss": 0.3099, "rewards/accuracies": 1.0, "rewards/chosen": 1.6442298889160156, "rewards/margins": 7.71774435043335, "rewards/rejected": -6.073513507843018, "step": 466 }, { "epoch": 1.4101790763430726, "grad_norm": 3.851868152618408, "learning_rate": 8.592145015105739e-07, "logits/chosen": -2.582399368286133, "logits/rejected": -2.539900541305542, "logps/chosen": -39.505741119384766, "logps/rejected": -133.43032836914062, "loss": 0.4382, "rewards/accuracies": 0.9375, "rewards/chosen": 1.68473219871521, "rewards/margins": 6.488312721252441, "rewards/rejected": -4.8035807609558105, "step": 467 }, { "epoch": 1.4131950989632422, "grad_norm": 3.2693886756896973, "learning_rate": 8.589123867069486e-07, "logits/chosen": -2.5813777446746826, "logits/rejected": -2.615713596343994, "logps/chosen": -26.771228790283203, "logps/rejected": -130.42637634277344, "loss": 0.3044, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5573930740356445, "rewards/margins": 7.332037925720215, "rewards/rejected": -4.774644374847412, "step": 468 }, { "epoch": 1.416211121583412, "grad_norm": 3.9685239791870117, "learning_rate": 8.586102719033233e-07, "logits/chosen": -2.6160337924957275, "logits/rejected": -2.5747601985931396, "logps/chosen": -30.925527572631836, "logps/rejected": -126.29130554199219, "loss": 0.3218, "rewards/accuracies": 1.0, "rewards/chosen": 2.5677671432495117, "rewards/margins": 7.2226457595825195, "rewards/rejected": -4.654878616333008, "step": 469 }, { "epoch": 1.416211121583412, "eval_logits/chosen": -2.577974796295166, "eval_logits/rejected": -2.5534019470214844, "eval_logps/chosen": -37.7324104309082, "eval_logps/rejected": -136.11903381347656, "eval_loss": 0.3183530569076538, "eval_rewards/accuracies": 0.969072163105011, "eval_rewards/chosen": 2.167227268218994, "eval_rewards/margins": 7.151968002319336, "eval_rewards/rejected": -4.9847412109375, "eval_runtime": 699.2652, "eval_samples_per_second": 0.553, "eval_steps_per_second": 0.277, "step": 469 }, { "epoch": 1.4192271442035815, "grad_norm": 4.770530700683594, "learning_rate": 8.583081570996978e-07, "logits/chosen": -2.74465012550354, "logits/rejected": -2.6679508686065674, "logps/chosen": -41.84001541137695, "logps/rejected": -133.18417358398438, "loss": 0.375, "rewards/accuracies": 1.0, "rewards/chosen": 1.589478850364685, "rewards/margins": 6.650075912475586, "rewards/rejected": -5.060596942901611, "step": 470 }, { "epoch": 1.4222431668237512, "grad_norm": 4.020356178283691, "learning_rate": 8.580060422960724e-07, "logits/chosen": -2.6980600357055664, "logits/rejected": -2.6203887462615967, "logps/chosen": -33.697086334228516, "logps/rejected": -137.11557006835938, "loss": 0.3217, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2608518600463867, "rewards/margins": 7.602620601654053, "rewards/rejected": -5.341768264770508, "step": 471 }, { "epoch": 1.4252591894439208, "grad_norm": 3.8187813758850098, "learning_rate": 8.577039274924472e-07, "logits/chosen": -2.6480376720428467, "logits/rejected": -2.5634255409240723, "logps/chosen": -32.00556945800781, "logps/rejected": -144.3088836669922, "loss": 0.257, "rewards/accuracies": 0.96875, "rewards/chosen": 2.481783151626587, "rewards/margins": 8.230159759521484, "rewards/rejected": -5.748376846313477, "step": 472 }, { "epoch": 1.4282752120640905, "grad_norm": 4.637365341186523, "learning_rate": 8.574018126888217e-07, "logits/chosen": -2.713634729385376, "logits/rejected": -2.668745517730713, "logps/chosen": -37.354896545410156, "logps/rejected": -126.10554504394531, "loss": 0.3335, "rewards/accuracies": 1.0, "rewards/chosen": 2.0403711795806885, "rewards/margins": 6.510885715484619, "rewards/rejected": -4.470515251159668, "step": 473 }, { "epoch": 1.4312912346842601, "grad_norm": 2.1987464427948, "learning_rate": 8.570996978851963e-07, "logits/chosen": -2.6295652389526367, "logits/rejected": -2.6013429164886475, "logps/chosen": -37.866939544677734, "logps/rejected": -128.18304443359375, "loss": 0.3875, "rewards/accuracies": 0.875, "rewards/chosen": 1.823856234550476, "rewards/margins": 6.7458109855651855, "rewards/rejected": -4.921955108642578, "step": 474 }, { "epoch": 1.4343072573044298, "grad_norm": 3.320286750793457, "learning_rate": 8.567975830815709e-07, "logits/chosen": -2.5992536544799805, "logits/rejected": -2.624826431274414, "logps/chosen": -30.301769256591797, "logps/rejected": -137.14076232910156, "loss": 0.3275, "rewards/accuracies": 1.0, "rewards/chosen": 2.370882034301758, "rewards/margins": 7.625644683837891, "rewards/rejected": -5.254762649536133, "step": 475 }, { "epoch": 1.4373232799245994, "grad_norm": 3.957502603530884, "learning_rate": 8.564954682779457e-07, "logits/chosen": -2.581636667251587, "logits/rejected": -2.4948761463165283, "logps/chosen": -37.63362503051758, "logps/rejected": -137.8570098876953, "loss": 0.3107, "rewards/accuracies": 0.9375, "rewards/chosen": 1.998907208442688, "rewards/margins": 6.912583827972412, "rewards/rejected": -4.913676738739014, "step": 476 }, { "epoch": 1.440339302544769, "grad_norm": 3.7829997539520264, "learning_rate": 8.561933534743202e-07, "logits/chosen": -2.6427085399627686, "logits/rejected": -2.6316540241241455, "logps/chosen": -35.03449630737305, "logps/rejected": -136.26568603515625, "loss": 0.3078, "rewards/accuracies": 0.9375, "rewards/chosen": 2.018630266189575, "rewards/margins": 7.497731685638428, "rewards/rejected": -5.479101657867432, "step": 477 }, { "epoch": 1.4433553251649387, "grad_norm": 3.667928695678711, "learning_rate": 8.558912386706948e-07, "logits/chosen": -2.72267484664917, "logits/rejected": -2.675199508666992, "logps/chosen": -41.5786247253418, "logps/rejected": -138.6229248046875, "loss": 0.3856, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5573773384094238, "rewards/margins": 6.803504943847656, "rewards/rejected": -5.246127128601074, "step": 478 }, { "epoch": 1.4463713477851083, "grad_norm": 4.555490016937256, "learning_rate": 8.555891238670695e-07, "logits/chosen": -2.6150858402252197, "logits/rejected": -2.632338047027588, "logps/chosen": -30.396360397338867, "logps/rejected": -132.4397735595703, "loss": 0.3216, "rewards/accuracies": 0.9375, "rewards/chosen": 2.5436723232269287, "rewards/margins": 7.560776233673096, "rewards/rejected": -5.017104148864746, "step": 479 }, { "epoch": 1.449387370405278, "grad_norm": 4.406988620758057, "learning_rate": 8.552870090634441e-07, "logits/chosen": -2.669908046722412, "logits/rejected": -2.6274466514587402, "logps/chosen": -41.857688903808594, "logps/rejected": -140.01441955566406, "loss": 0.3899, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7214412689208984, "rewards/margins": 6.946969509124756, "rewards/rejected": -5.225528240203857, "step": 480 }, { "epoch": 1.4524033930254476, "grad_norm": 4.331000328063965, "learning_rate": 8.549848942598187e-07, "logits/chosen": -2.596278429031372, "logits/rejected": -2.6058356761932373, "logps/chosen": -40.95698928833008, "logps/rejected": -112.09190368652344, "loss": 0.4736, "rewards/accuracies": 0.90625, "rewards/chosen": 2.054473638534546, "rewards/margins": 5.490542888641357, "rewards/rejected": -3.4360687732696533, "step": 481 }, { "epoch": 1.4554194156456173, "grad_norm": 3.547244071960449, "learning_rate": 8.546827794561933e-07, "logits/chosen": -2.6437578201293945, "logits/rejected": -2.603335380554199, "logps/chosen": -32.023590087890625, "logps/rejected": -131.37799072265625, "loss": 0.3097, "rewards/accuracies": 1.0, "rewards/chosen": 2.4413461685180664, "rewards/margins": 7.2245049476623535, "rewards/rejected": -4.783158779144287, "step": 482 }, { "epoch": 1.458435438265787, "grad_norm": 4.4120306968688965, "learning_rate": 8.543806646525679e-07, "logits/chosen": -2.6817140579223633, "logits/rejected": -2.607923746109009, "logps/chosen": -28.9787540435791, "logps/rejected": -123.27876281738281, "loss": 0.3106, "rewards/accuracies": 0.96875, "rewards/chosen": 2.8639259338378906, "rewards/margins": 6.941637992858887, "rewards/rejected": -4.077711582183838, "step": 483 }, { "epoch": 1.4614514608859566, "grad_norm": 4.510419845581055, "learning_rate": 8.540785498489426e-07, "logits/chosen": -2.56282114982605, "logits/rejected": -2.6078381538391113, "logps/chosen": -30.253463745117188, "logps/rejected": -132.6449737548828, "loss": 0.3126, "rewards/accuracies": 0.96875, "rewards/chosen": 2.53442120552063, "rewards/margins": 7.464455604553223, "rewards/rejected": -4.9300336837768555, "step": 484 }, { "epoch": 1.4644674835061262, "grad_norm": 3.932114601135254, "learning_rate": 8.537764350453172e-07, "logits/chosen": -2.7105929851531982, "logits/rejected": -2.6512699127197266, "logps/chosen": -29.85540008544922, "logps/rejected": -138.0081787109375, "loss": 0.3352, "rewards/accuracies": 0.96875, "rewards/chosen": 2.4339373111724854, "rewards/margins": 7.725551128387451, "rewards/rejected": -5.291613578796387, "step": 485 }, { "epoch": 1.467483506126296, "grad_norm": 3.593729019165039, "learning_rate": 8.534743202416918e-07, "logits/chosen": -2.6984283924102783, "logits/rejected": -2.6071078777313232, "logps/chosen": -27.659984588623047, "logps/rejected": -138.14358520507812, "loss": 0.1572, "rewards/accuracies": 1.0, "rewards/chosen": 2.9090144634246826, "rewards/margins": 8.49332332611084, "rewards/rejected": -5.584308624267578, "step": 486 }, { "epoch": 1.4704995287464655, "grad_norm": 3.866530656814575, "learning_rate": 8.531722054380665e-07, "logits/chosen": -2.691314220428467, "logits/rejected": -2.579977512359619, "logps/chosen": -33.486961364746094, "logps/rejected": -138.3367919921875, "loss": 0.3517, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0033857822418213, "rewards/margins": 7.1080217361450195, "rewards/rejected": -5.104635715484619, "step": 487 }, { "epoch": 1.4735155513666354, "grad_norm": 3.553825616836548, "learning_rate": 8.528700906344411e-07, "logits/chosen": -2.622788667678833, "logits/rejected": -2.513676404953003, "logps/chosen": -39.95635223388672, "logps/rejected": -123.64659881591797, "loss": 0.3665, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7718931436538696, "rewards/margins": 6.3960700035095215, "rewards/rejected": -4.624176979064941, "step": 488 }, { "epoch": 1.4765315739868048, "grad_norm": 4.3883771896362305, "learning_rate": 8.525679758308156e-07, "logits/chosen": -2.6132376194000244, "logits/rejected": -2.578824520111084, "logps/chosen": -34.46259307861328, "logps/rejected": -125.62811279296875, "loss": 0.3918, "rewards/accuracies": 1.0, "rewards/chosen": 2.003316879272461, "rewards/margins": 6.474973678588867, "rewards/rejected": -4.471656322479248, "step": 489 }, { "epoch": 1.4795475966069747, "grad_norm": 4.140624046325684, "learning_rate": 8.522658610271903e-07, "logits/chosen": -2.587564706802368, "logits/rejected": -2.5313377380371094, "logps/chosen": -38.207969665527344, "logps/rejected": -128.95420837402344, "loss": 0.3998, "rewards/accuracies": 0.96875, "rewards/chosen": 2.164005756378174, "rewards/margins": 6.232521057128906, "rewards/rejected": -4.068515777587891, "step": 490 }, { "epoch": 1.482563619227144, "grad_norm": 3.879559278488159, "learning_rate": 8.51963746223565e-07, "logits/chosen": -2.6629397869110107, "logits/rejected": -2.7249252796173096, "logps/chosen": -25.908733367919922, "logps/rejected": -120.66592407226562, "loss": 0.3113, "rewards/accuracies": 0.96875, "rewards/chosen": 3.1170129776000977, "rewards/margins": 7.1572723388671875, "rewards/rejected": -4.040258884429932, "step": 491 }, { "epoch": 1.485579641847314, "grad_norm": 4.788487911224365, "learning_rate": 8.516616314199395e-07, "logits/chosen": -2.6663804054260254, "logits/rejected": -2.6294806003570557, "logps/chosen": -42.56226348876953, "logps/rejected": -148.27792358398438, "loss": 0.3763, "rewards/accuracies": 1.0, "rewards/chosen": 1.3071151971817017, "rewards/margins": 7.314943313598633, "rewards/rejected": -6.0078277587890625, "step": 492 }, { "epoch": 1.4885956644674834, "grad_norm": 4.777008533477783, "learning_rate": 8.513595166163141e-07, "logits/chosen": -2.6627449989318848, "logits/rejected": -2.5486674308776855, "logps/chosen": -27.738201141357422, "logps/rejected": -119.16476440429688, "loss": 0.3287, "rewards/accuracies": 0.96875, "rewards/chosen": 3.018897294998169, "rewards/margins": 6.93336296081543, "rewards/rejected": -3.91446590423584, "step": 493 }, { "epoch": 1.4916116870876532, "grad_norm": 4.479770183563232, "learning_rate": 8.510574018126888e-07, "logits/chosen": -2.6645350456237793, "logits/rejected": -2.5661685466766357, "logps/chosen": -29.6223201751709, "logps/rejected": -125.64826202392578, "loss": 0.2837, "rewards/accuracies": 1.0, "rewards/chosen": 2.712568521499634, "rewards/margins": 7.326128005981445, "rewards/rejected": -4.613560199737549, "step": 494 }, { "epoch": 1.4946277097078229, "grad_norm": 6.029559135437012, "learning_rate": 8.507552870090634e-07, "logits/chosen": -2.6753737926483154, "logits/rejected": -2.6212828159332275, "logps/chosen": -35.738189697265625, "logps/rejected": -143.18028259277344, "loss": 0.2641, "rewards/accuracies": 1.0, "rewards/chosen": 2.1205227375030518, "rewards/margins": 7.797273635864258, "rewards/rejected": -5.676750659942627, "step": 495 }, { "epoch": 1.4976437323279925, "grad_norm": 4.203415393829346, "learning_rate": 8.50453172205438e-07, "logits/chosen": -2.687910318374634, "logits/rejected": -2.6055288314819336, "logps/chosen": -28.352205276489258, "logps/rejected": -133.96786499023438, "loss": 0.2663, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5925912857055664, "rewards/margins": 7.9063191413879395, "rewards/rejected": -5.313727855682373, "step": 496 }, { "epoch": 1.500659754948162, "grad_norm": 3.986396074295044, "learning_rate": 8.501510574018127e-07, "logits/chosen": -2.6728124618530273, "logits/rejected": -2.6326088905334473, "logps/chosen": -39.45402526855469, "logps/rejected": -140.0555419921875, "loss": 0.3594, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8225077390670776, "rewards/margins": 7.300983428955078, "rewards/rejected": -5.478475570678711, "step": 497 }, { "epoch": 1.5036757775683318, "grad_norm": 5.004134654998779, "learning_rate": 8.498489425981872e-07, "logits/chosen": -2.6368064880371094, "logits/rejected": -2.659027099609375, "logps/chosen": -30.972856521606445, "logps/rejected": -128.8231658935547, "loss": 0.3718, "rewards/accuracies": 0.90625, "rewards/chosen": 2.4527413845062256, "rewards/margins": 7.261287689208984, "rewards/rejected": -4.808547019958496, "step": 498 }, { "epoch": 1.5066918001885015, "grad_norm": 3.8970131874084473, "learning_rate": 8.495468277945619e-07, "logits/chosen": -2.6257975101470947, "logits/rejected": -2.5392627716064453, "logps/chosen": -34.915225982666016, "logps/rejected": -137.7503204345703, "loss": 0.3075, "rewards/accuracies": 1.0, "rewards/chosen": 1.7984137535095215, "rewards/margins": 7.137725830078125, "rewards/rejected": -5.3393120765686035, "step": 499 }, { "epoch": 1.509707822808671, "grad_norm": 3.717424154281616, "learning_rate": 8.492447129909365e-07, "logits/chosen": -2.6235666275024414, "logits/rejected": -2.591362476348877, "logps/chosen": -30.99747657775879, "logps/rejected": -148.17376708984375, "loss": 0.2346, "rewards/accuracies": 1.0, "rewards/chosen": 2.6171631813049316, "rewards/margins": 8.480367660522461, "rewards/rejected": -5.863204002380371, "step": 500 }, { "epoch": 1.5127238454288408, "grad_norm": 3.2989351749420166, "learning_rate": 8.489425981873112e-07, "logits/chosen": -2.600269317626953, "logits/rejected": -2.6113946437835693, "logps/chosen": -37.4763069152832, "logps/rejected": -119.70750427246094, "loss": 0.3931, "rewards/accuracies": 0.875, "rewards/chosen": 2.1176178455352783, "rewards/margins": 6.216096878051758, "rewards/rejected": -4.0984787940979, "step": 501 }, { "epoch": 1.5157398680490104, "grad_norm": 4.4883246421813965, "learning_rate": 8.486404833836858e-07, "logits/chosen": -2.6440584659576416, "logits/rejected": -2.590463638305664, "logps/chosen": -34.086158752441406, "logps/rejected": -139.35626220703125, "loss": 0.3604, "rewards/accuracies": 1.0, "rewards/chosen": 2.0311543941497803, "rewards/margins": 7.082239151000977, "rewards/rejected": -5.051084995269775, "step": 502 }, { "epoch": 1.51875589066918, "grad_norm": 4.0811309814453125, "learning_rate": 8.483383685800604e-07, "logits/chosen": -2.7226593494415283, "logits/rejected": -2.632680892944336, "logps/chosen": -50.66078186035156, "logps/rejected": -135.59080505371094, "loss": 0.4256, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2849476337432861, "rewards/margins": 6.270210266113281, "rewards/rejected": -4.985262870788574, "step": 503 }, { "epoch": 1.5217719132893497, "grad_norm": 4.977391719818115, "learning_rate": 8.480362537764351e-07, "logits/chosen": -2.564316987991333, "logits/rejected": -2.5277721881866455, "logps/chosen": -35.398250579833984, "logps/rejected": -122.46233367919922, "loss": 0.4308, "rewards/accuracies": 1.0, "rewards/chosen": 2.0686380863189697, "rewards/margins": 6.431429386138916, "rewards/rejected": -4.362791538238525, "step": 504 }, { "epoch": 1.5247879359095193, "grad_norm": 6.11861515045166, "learning_rate": 8.477341389728096e-07, "logits/chosen": -2.660884380340576, "logits/rejected": -2.6307742595672607, "logps/chosen": -22.88918113708496, "logps/rejected": -109.90579986572266, "loss": 0.3685, "rewards/accuracies": 1.0, "rewards/chosen": 3.213956832885742, "rewards/margins": 6.452282428741455, "rewards/rejected": -3.23832631111145, "step": 505 }, { "epoch": 1.527803958529689, "grad_norm": 5.466984748840332, "learning_rate": 8.474320241691843e-07, "logits/chosen": -2.6584362983703613, "logits/rejected": -2.5527963638305664, "logps/chosen": -33.821617126464844, "logps/rejected": -165.91439819335938, "loss": 0.2358, "rewards/accuracies": 1.0, "rewards/chosen": 2.371797561645508, "rewards/margins": 9.325069427490234, "rewards/rejected": -6.953271865844727, "step": 506 }, { "epoch": 1.5308199811498586, "grad_norm": 4.6193037033081055, "learning_rate": 8.471299093655589e-07, "logits/chosen": -2.6459875106811523, "logits/rejected": -2.656684637069702, "logps/chosen": -36.78864669799805, "logps/rejected": -125.12212371826172, "loss": 0.3784, "rewards/accuracies": 0.96875, "rewards/chosen": 2.114105224609375, "rewards/margins": 6.452789306640625, "rewards/rejected": -4.338684558868408, "step": 507 }, { "epoch": 1.5338360037700283, "grad_norm": 3.7281253337860107, "learning_rate": 8.468277945619335e-07, "logits/chosen": -2.559785842895508, "logits/rejected": -2.5896260738372803, "logps/chosen": -34.88210678100586, "logps/rejected": -119.80144500732422, "loss": 0.4033, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0432639122009277, "rewards/margins": 5.950602054595947, "rewards/rejected": -3.9073386192321777, "step": 508 }, { "epoch": 1.536852026390198, "grad_norm": 5.236784934997559, "learning_rate": 8.465256797583081e-07, "logits/chosen": -2.6530258655548096, "logits/rejected": -2.5988657474517822, "logps/chosen": -34.59952926635742, "logps/rejected": -130.26156616210938, "loss": 0.2728, "rewards/accuracies": 1.0, "rewards/chosen": 2.5269031524658203, "rewards/margins": 7.254208087921143, "rewards/rejected": -4.7273054122924805, "step": 509 }, { "epoch": 1.5398680490103676, "grad_norm": 3.9886724948883057, "learning_rate": 8.462235649546828e-07, "logits/chosen": -2.557863235473633, "logits/rejected": -2.5277388095855713, "logps/chosen": -36.58681869506836, "logps/rejected": -136.57920837402344, "loss": 0.3433, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3319714069366455, "rewards/margins": 7.064168930053711, "rewards/rejected": -4.7321977615356445, "step": 510 }, { "epoch": 1.5428840716305372, "grad_norm": 3.9133193492889404, "learning_rate": 8.459214501510573e-07, "logits/chosen": -2.589418888092041, "logits/rejected": -2.578507423400879, "logps/chosen": -34.48790740966797, "logps/rejected": -125.38518524169922, "loss": 0.315, "rewards/accuracies": 0.96875, "rewards/chosen": 2.354123115539551, "rewards/margins": 6.792098522186279, "rewards/rejected": -4.437975883483887, "step": 511 }, { "epoch": 1.5459000942507068, "grad_norm": 4.771402359008789, "learning_rate": 8.45619335347432e-07, "logits/chosen": -2.595494508743286, "logits/rejected": -2.5382237434387207, "logps/chosen": -36.089027404785156, "logps/rejected": -126.4628677368164, "loss": 0.3905, "rewards/accuracies": 1.0, "rewards/chosen": 1.8646658658981323, "rewards/margins": 6.367611408233643, "rewards/rejected": -4.502944469451904, "step": 512 }, { "epoch": 1.5489161168708765, "grad_norm": 3.4155404567718506, "learning_rate": 8.453172205438066e-07, "logits/chosen": -2.6147241592407227, "logits/rejected": -2.5978832244873047, "logps/chosen": -37.899436950683594, "logps/rejected": -148.1055145263672, "loss": 0.3261, "rewards/accuracies": 1.0, "rewards/chosen": 1.8855376243591309, "rewards/margins": 7.977244853973389, "rewards/rejected": -6.091707229614258, "step": 513 }, { "epoch": 1.5519321394910461, "grad_norm": 3.6587400436401367, "learning_rate": 8.450151057401812e-07, "logits/chosen": -2.6801109313964844, "logits/rejected": -2.618400812149048, "logps/chosen": -31.396217346191406, "logps/rejected": -125.21490478515625, "loss": 0.2903, "rewards/accuracies": 1.0, "rewards/chosen": 2.409646987915039, "rewards/margins": 6.881137847900391, "rewards/rejected": -4.471490383148193, "step": 514 }, { "epoch": 1.5549481621112158, "grad_norm": 6.81882381439209, "learning_rate": 8.447129909365559e-07, "logits/chosen": -2.621190309524536, "logits/rejected": -2.629643201828003, "logps/chosen": -29.669527053833008, "logps/rejected": -122.41162109375, "loss": 0.3401, "rewards/accuracies": 0.9375, "rewards/chosen": 2.6656908988952637, "rewards/margins": 6.700668811798096, "rewards/rejected": -4.034976959228516, "step": 515 }, { "epoch": 1.5579641847313854, "grad_norm": 3.7082297801971436, "learning_rate": 8.444108761329305e-07, "logits/chosen": -2.6369433403015137, "logits/rejected": -2.605591297149658, "logps/chosen": -25.877941131591797, "logps/rejected": -112.6895523071289, "loss": 0.4172, "rewards/accuracies": 1.0, "rewards/chosen": 2.7085728645324707, "rewards/margins": 6.448980331420898, "rewards/rejected": -3.740407943725586, "step": 516 }, { "epoch": 1.5609802073515553, "grad_norm": 3.333300828933716, "learning_rate": 8.44108761329305e-07, "logits/chosen": -2.700868606567383, "logits/rejected": -2.613032341003418, "logps/chosen": -21.334598541259766, "logps/rejected": -142.79429626464844, "loss": 0.205, "rewards/accuracies": 1.0, "rewards/chosen": 3.267824411392212, "rewards/margins": 8.58714771270752, "rewards/rejected": -5.319323539733887, "step": 517 }, { "epoch": 1.5639962299717247, "grad_norm": 4.523541450500488, "learning_rate": 8.438066465256797e-07, "logits/chosen": -2.643676996231079, "logits/rejected": -2.561884641647339, "logps/chosen": -32.91654968261719, "logps/rejected": -127.74563598632812, "loss": 0.3289, "rewards/accuracies": 1.0, "rewards/chosen": 2.5128889083862305, "rewards/margins": 6.968263626098633, "rewards/rejected": -4.455374240875244, "step": 518 }, { "epoch": 1.5670122525918946, "grad_norm": 5.169729232788086, "learning_rate": 8.435045317220544e-07, "logits/chosen": -2.612506866455078, "logits/rejected": -2.5608878135681152, "logps/chosen": -42.65461349487305, "logps/rejected": -138.72842407226562, "loss": 0.3331, "rewards/accuracies": 1.0, "rewards/chosen": 1.7502347230911255, "rewards/margins": 7.00106143951416, "rewards/rejected": -5.250826835632324, "step": 519 }, { "epoch": 1.570028275212064, "grad_norm": 3.569730758666992, "learning_rate": 8.432024169184289e-07, "logits/chosen": -2.61361026763916, "logits/rejected": -2.585939407348633, "logps/chosen": -39.466949462890625, "logps/rejected": -128.0105743408203, "loss": 0.4048, "rewards/accuracies": 1.0, "rewards/chosen": 1.7333101034164429, "rewards/margins": 6.470813751220703, "rewards/rejected": -4.737503528594971, "step": 520 }, { "epoch": 1.5730442978322339, "grad_norm": 4.418059825897217, "learning_rate": 8.429003021148036e-07, "logits/chosen": -2.6535418033599854, "logits/rejected": -2.6028785705566406, "logps/chosen": -41.51853942871094, "logps/rejected": -127.93994140625, "loss": 0.4213, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6102535724639893, "rewards/margins": 6.116472244262695, "rewards/rejected": -4.506218433380127, "step": 521 }, { "epoch": 1.5760603204524033, "grad_norm": 5.740145683288574, "learning_rate": 8.425981873111782e-07, "logits/chosen": -2.6875882148742676, "logits/rejected": -2.610137462615967, "logps/chosen": -48.25819778442383, "logps/rejected": -143.77378845214844, "loss": 0.4479, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1221365928649902, "rewards/margins": 6.670126438140869, "rewards/rejected": -5.547990322113037, "step": 522 }, { "epoch": 1.5790763430725732, "grad_norm": 5.467655181884766, "learning_rate": 8.422960725075528e-07, "logits/chosen": -2.6214308738708496, "logits/rejected": -2.5779666900634766, "logps/chosen": -33.27457809448242, "logps/rejected": -139.5966339111328, "loss": 0.303, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9296143054962158, "rewards/margins": 7.364806652069092, "rewards/rejected": -5.435192584991455, "step": 523 }, { "epoch": 1.5820923656927426, "grad_norm": 3.630598306655884, "learning_rate": 8.419939577039274e-07, "logits/chosen": -2.6019556522369385, "logits/rejected": -2.52701473236084, "logps/chosen": -29.285690307617188, "logps/rejected": -146.68890380859375, "loss": 0.2086, "rewards/accuracies": 1.0, "rewards/chosen": 2.728444814682007, "rewards/margins": 8.357881546020508, "rewards/rejected": -5.629437446594238, "step": 524 }, { "epoch": 1.5851083883129125, "grad_norm": 4.457280158996582, "learning_rate": 8.416918429003021e-07, "logits/chosen": -2.6324288845062256, "logits/rejected": -2.573692798614502, "logps/chosen": -42.734771728515625, "logps/rejected": -140.50184631347656, "loss": 0.4125, "rewards/accuracies": 1.0, "rewards/chosen": 1.536255121231079, "rewards/margins": 6.758134841918945, "rewards/rejected": -5.221879959106445, "step": 525 }, { "epoch": 1.5881244109330819, "grad_norm": 4.84096622467041, "learning_rate": 8.413897280966768e-07, "logits/chosen": -2.604504346847534, "logits/rejected": -2.569847822189331, "logps/chosen": -44.87253189086914, "logps/rejected": -142.3929901123047, "loss": 0.338, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4500600099563599, "rewards/margins": 6.870487213134766, "rewards/rejected": -5.420427322387695, "step": 526 }, { "epoch": 1.5911404335532517, "grad_norm": 4.859714508056641, "learning_rate": 8.410876132930513e-07, "logits/chosen": -2.5943384170532227, "logits/rejected": -2.591503620147705, "logps/chosen": -24.16752052307129, "logps/rejected": -133.19309997558594, "loss": 0.2793, "rewards/accuracies": 1.0, "rewards/chosen": 3.055499792098999, "rewards/margins": 8.07369327545166, "rewards/rejected": -5.018194198608398, "step": 527 }, { "epoch": 1.5941564561734212, "grad_norm": 6.050544261932373, "learning_rate": 8.407854984894259e-07, "logits/chosen": -2.665463447570801, "logits/rejected": -2.5639922618865967, "logps/chosen": -31.477975845336914, "logps/rejected": -117.98220825195312, "loss": 0.3872, "rewards/accuracies": 1.0, "rewards/chosen": 2.152714729309082, "rewards/margins": 5.902066707611084, "rewards/rejected": -3.749351739883423, "step": 528 }, { "epoch": 1.597172478793591, "grad_norm": 5.683608055114746, "learning_rate": 8.404833836858006e-07, "logits/chosen": -2.5968735218048096, "logits/rejected": -2.5742812156677246, "logps/chosen": -28.863239288330078, "logps/rejected": -121.50874328613281, "loss": 0.3845, "rewards/accuracies": 1.0, "rewards/chosen": 2.755127429962158, "rewards/margins": 6.61074686050415, "rewards/rejected": -3.855619192123413, "step": 529 }, { "epoch": 1.6001885014137605, "grad_norm": 4.423684597015381, "learning_rate": 8.401812688821752e-07, "logits/chosen": -2.6114883422851562, "logits/rejected": -2.534381628036499, "logps/chosen": -24.72628402709961, "logps/rejected": -135.81768798828125, "loss": 0.2368, "rewards/accuracies": 0.96875, "rewards/chosen": 2.9312820434570312, "rewards/margins": 7.851558685302734, "rewards/rejected": -4.920277118682861, "step": 530 }, { "epoch": 1.6032045240339303, "grad_norm": 4.440515041351318, "learning_rate": 8.398791540785498e-07, "logits/chosen": -2.578256130218506, "logits/rejected": -2.528148651123047, "logps/chosen": -42.68688201904297, "logps/rejected": -124.60569763183594, "loss": 0.4549, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6088826656341553, "rewards/margins": 6.14517879486084, "rewards/rejected": -4.5362958908081055, "step": 531 }, { "epoch": 1.6062205466540997, "grad_norm": 4.4151153564453125, "learning_rate": 8.395770392749244e-07, "logits/chosen": -2.742849349975586, "logits/rejected": -2.6757922172546387, "logps/chosen": -28.00982093811035, "logps/rejected": -113.53783416748047, "loss": 0.3534, "rewards/accuracies": 1.0, "rewards/chosen": 3.00281023979187, "rewards/margins": 6.602553367614746, "rewards/rejected": -3.5997426509857178, "step": 532 }, { "epoch": 1.6092365692742696, "grad_norm": 3.511155128479004, "learning_rate": 8.392749244712991e-07, "logits/chosen": -2.591623306274414, "logits/rejected": -2.5776307582855225, "logps/chosen": -30.62994956970215, "logps/rejected": -133.9524688720703, "loss": 0.3078, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5106022357940674, "rewards/margins": 7.299934387207031, "rewards/rejected": -4.789332389831543, "step": 533 }, { "epoch": 1.6122525918944393, "grad_norm": 4.4473114013671875, "learning_rate": 8.389728096676737e-07, "logits/chosen": -2.67671537399292, "logits/rejected": -2.6176187992095947, "logps/chosen": -36.95795822143555, "logps/rejected": -147.70196533203125, "loss": 0.2364, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1337671279907227, "rewards/margins": 7.955036163330078, "rewards/rejected": -5.821268558502197, "step": 534 }, { "epoch": 1.615268614514609, "grad_norm": 20.541812896728516, "learning_rate": 8.386706948640483e-07, "logits/chosen": -2.656146287918091, "logits/rejected": -2.544205665588379, "logps/chosen": -33.24751281738281, "logps/rejected": -117.02616882324219, "loss": 0.44, "rewards/accuracies": 0.90625, "rewards/chosen": 2.3388330936431885, "rewards/margins": 6.397542953491211, "rewards/rejected": -4.058710098266602, "step": 535 }, { "epoch": 1.6182846371347785, "grad_norm": 3.6844642162323, "learning_rate": 8.383685800604229e-07, "logits/chosen": -2.711768627166748, "logits/rejected": -2.579827308654785, "logps/chosen": -41.91143035888672, "logps/rejected": -147.11923217773438, "loss": 0.3025, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9753963947296143, "rewards/margins": 7.9125566482543945, "rewards/rejected": -5.937160015106201, "step": 536 }, { "epoch": 1.6182846371347785, "eval_logits/chosen": -2.5535528659820557, "eval_logits/rejected": -2.523106575012207, "eval_logps/chosen": -36.33160400390625, "eval_logps/rejected": -135.76556396484375, "eval_loss": 0.3097617030143738, "eval_rewards/accuracies": 0.969072163105011, "eval_rewards/chosen": 2.3073079586029053, "eval_rewards/margins": 7.256701469421387, "eval_rewards/rejected": -4.949393272399902, "eval_runtime": 699.8632, "eval_samples_per_second": 0.553, "eval_steps_per_second": 0.277, "step": 536 }, { "epoch": 1.6213006597549482, "grad_norm": 6.067670822143555, "learning_rate": 8.380664652567976e-07, "logits/chosen": -2.6677491664886475, "logits/rejected": -2.5903003215789795, "logps/chosen": -32.74269485473633, "logps/rejected": -132.67916870117188, "loss": 0.3443, "rewards/accuracies": 0.96875, "rewards/chosen": 2.171536922454834, "rewards/margins": 7.290382385253906, "rewards/rejected": -5.1188459396362305, "step": 537 }, { "epoch": 1.6243166823751178, "grad_norm": 3.7714343070983887, "learning_rate": 8.377643504531722e-07, "logits/chosen": -2.62612247467041, "logits/rejected": -2.57289457321167, "logps/chosen": -39.684288024902344, "logps/rejected": -120.28483581542969, "loss": 0.3925, "rewards/accuracies": 0.96875, "rewards/chosen": 1.82876455783844, "rewards/margins": 6.142579078674316, "rewards/rejected": -4.313814163208008, "step": 538 }, { "epoch": 1.6273327049952875, "grad_norm": 5.2334184646606445, "learning_rate": 8.374622356495467e-07, "logits/chosen": -2.609227180480957, "logits/rejected": -2.5424225330352783, "logps/chosen": -28.8324031829834, "logps/rejected": -143.57765197753906, "loss": 0.2322, "rewards/accuracies": 0.96875, "rewards/chosen": 2.656557559967041, "rewards/margins": 8.602021217346191, "rewards/rejected": -5.945463180541992, "step": 539 }, { "epoch": 1.6303487276154571, "grad_norm": 2.7517778873443604, "learning_rate": 8.371601208459214e-07, "logits/chosen": -2.696795701980591, "logits/rejected": -2.600306272506714, "logps/chosen": -21.968120574951172, "logps/rejected": -141.4007110595703, "loss": 0.2359, "rewards/accuracies": 1.0, "rewards/chosen": 2.962581157684326, "rewards/margins": 8.830026626586914, "rewards/rejected": -5.867445468902588, "step": 540 }, { "epoch": 1.6333647502356268, "grad_norm": 3.1116716861724854, "learning_rate": 8.368580060422961e-07, "logits/chosen": -2.6051955223083496, "logits/rejected": -2.645094871520996, "logps/chosen": -36.108734130859375, "logps/rejected": -140.6311798095703, "loss": 0.3481, "rewards/accuracies": 0.9375, "rewards/chosen": 2.1571311950683594, "rewards/margins": 7.819840908050537, "rewards/rejected": -5.662709712982178, "step": 541 }, { "epoch": 1.6363807728557964, "grad_norm": 3.0062220096588135, "learning_rate": 8.365558912386706e-07, "logits/chosen": -2.6381332874298096, "logits/rejected": -2.571871280670166, "logps/chosen": -32.0899543762207, "logps/rejected": -137.23655700683594, "loss": 0.2574, "rewards/accuracies": 0.96875, "rewards/chosen": 2.490370035171509, "rewards/margins": 7.809309005737305, "rewards/rejected": -5.318938255310059, "step": 542 }, { "epoch": 1.639396795475966, "grad_norm": 4.563173294067383, "learning_rate": 8.362537764350452e-07, "logits/chosen": -2.6206140518188477, "logits/rejected": -2.5547235012054443, "logps/chosen": -33.15302276611328, "logps/rejected": -123.74806213378906, "loss": 0.3648, "rewards/accuracies": 1.0, "rewards/chosen": 2.2850019931793213, "rewards/margins": 6.461381912231445, "rewards/rejected": -4.176380157470703, "step": 543 }, { "epoch": 1.6424128180961357, "grad_norm": 4.660052299499512, "learning_rate": 8.3595166163142e-07, "logits/chosen": -2.6052792072296143, "logits/rejected": -2.5427398681640625, "logps/chosen": -44.056304931640625, "logps/rejected": -146.9647216796875, "loss": 0.3529, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4187532663345337, "rewards/margins": 7.423699855804443, "rewards/rejected": -6.004946708679199, "step": 544 }, { "epoch": 1.6454288407163054, "grad_norm": 3.356750965118408, "learning_rate": 8.356495468277945e-07, "logits/chosen": -2.5900866985321045, "logits/rejected": -2.6173105239868164, "logps/chosen": -32.4943733215332, "logps/rejected": -142.39163208007812, "loss": 0.2943, "rewards/accuracies": 1.0, "rewards/chosen": 2.19169020652771, "rewards/margins": 7.789404392242432, "rewards/rejected": -5.597713470458984, "step": 545 }, { "epoch": 1.648444863336475, "grad_norm": 4.550638675689697, "learning_rate": 8.353474320241691e-07, "logits/chosen": -2.6656715869903564, "logits/rejected": -2.5656185150146484, "logps/chosen": -42.747901916503906, "logps/rejected": -133.47410583496094, "loss": 0.3808, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7349262237548828, "rewards/margins": 6.525852680206299, "rewards/rejected": -4.790926933288574, "step": 546 }, { "epoch": 1.6514608859566446, "grad_norm": 4.419494152069092, "learning_rate": 8.350453172205437e-07, "logits/chosen": -2.5301573276519775, "logits/rejected": -2.609675884246826, "logps/chosen": -50.89192581176758, "logps/rejected": -138.18821716308594, "loss": 0.4292, "rewards/accuracies": 0.9375, "rewards/chosen": 1.004582166671753, "rewards/margins": 6.519543170928955, "rewards/rejected": -5.514961242675781, "step": 547 }, { "epoch": 1.6544769085768143, "grad_norm": 3.6278393268585205, "learning_rate": 8.347432024169184e-07, "logits/chosen": -2.716196060180664, "logits/rejected": -2.6172680854797363, "logps/chosen": -31.61941146850586, "logps/rejected": -140.64389038085938, "loss": 0.3143, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3811347484588623, "rewards/margins": 7.709987163543701, "rewards/rejected": -5.32885217666626, "step": 548 }, { "epoch": 1.657492931196984, "grad_norm": 6.1412858963012695, "learning_rate": 8.34441087613293e-07, "logits/chosen": -2.646629810333252, "logits/rejected": -2.585444450378418, "logps/chosen": -28.996753692626953, "logps/rejected": -122.74346923828125, "loss": 0.3295, "rewards/accuracies": 0.96875, "rewards/chosen": 2.773691177368164, "rewards/margins": 6.93098258972168, "rewards/rejected": -4.157290935516357, "step": 549 }, { "epoch": 1.6605089538171536, "grad_norm": 5.7903289794921875, "learning_rate": 8.341389728096676e-07, "logits/chosen": -2.5924274921417236, "logits/rejected": -2.559976100921631, "logps/chosen": -45.2492790222168, "logps/rejected": -146.69320678710938, "loss": 0.4156, "rewards/accuracies": 0.96875, "rewards/chosen": 1.290276050567627, "rewards/margins": 7.058662414550781, "rewards/rejected": -5.768385887145996, "step": 550 }, { "epoch": 1.6635249764373232, "grad_norm": 3.967573404312134, "learning_rate": 8.338368580060424e-07, "logits/chosen": -2.6376240253448486, "logits/rejected": -2.5946156978607178, "logps/chosen": -32.98979568481445, "logps/rejected": -123.74652862548828, "loss": 0.346, "rewards/accuracies": 0.96875, "rewards/chosen": 2.4781441688537598, "rewards/margins": 6.632494926452637, "rewards/rejected": -4.154351234436035, "step": 551 }, { "epoch": 1.666540999057493, "grad_norm": 4.621333122253418, "learning_rate": 8.335347432024169e-07, "logits/chosen": -2.70804500579834, "logits/rejected": -2.648344039916992, "logps/chosen": -32.427040100097656, "logps/rejected": -126.6888198852539, "loss": 0.3397, "rewards/accuracies": 0.96875, "rewards/chosen": 2.6408419609069824, "rewards/margins": 7.08052921295166, "rewards/rejected": -4.4396867752075195, "step": 552 }, { "epoch": 1.6695570216776625, "grad_norm": 6.096658229827881, "learning_rate": 8.332326283987915e-07, "logits/chosen": -2.6121060848236084, "logits/rejected": -2.6195011138916016, "logps/chosen": -49.445865631103516, "logps/rejected": -152.78494262695312, "loss": 0.367, "rewards/accuracies": 1.0, "rewards/chosen": 1.5588022470474243, "rewards/margins": 7.69796085357666, "rewards/rejected": -6.139159202575684, "step": 553 }, { "epoch": 1.6725730442978324, "grad_norm": 5.029810905456543, "learning_rate": 8.32930513595166e-07, "logits/chosen": -2.542064905166626, "logits/rejected": -2.5087082386016846, "logps/chosen": -38.58831024169922, "logps/rejected": -122.86125183105469, "loss": 0.422, "rewards/accuracies": 0.9375, "rewards/chosen": 2.077791213989258, "rewards/margins": 6.476146697998047, "rewards/rejected": -4.398355484008789, "step": 554 }, { "epoch": 1.6755890669180018, "grad_norm": 4.38785982131958, "learning_rate": 8.326283987915408e-07, "logits/chosen": -2.608524799346924, "logits/rejected": -2.5693869590759277, "logps/chosen": -32.34734344482422, "logps/rejected": -135.53866577148438, "loss": 0.3723, "rewards/accuracies": 1.0, "rewards/chosen": 2.4561328887939453, "rewards/margins": 7.641439437866211, "rewards/rejected": -5.185305595397949, "step": 555 }, { "epoch": 1.6786050895381717, "grad_norm": 4.694748401641846, "learning_rate": 8.323262839879154e-07, "logits/chosen": -2.686728000640869, "logits/rejected": -2.6368844509124756, "logps/chosen": -28.565006256103516, "logps/rejected": -127.03153991699219, "loss": 0.3363, "rewards/accuracies": 1.0, "rewards/chosen": 2.572374105453491, "rewards/margins": 7.054994583129883, "rewards/rejected": -4.4826202392578125, "step": 556 }, { "epoch": 1.681621112158341, "grad_norm": 4.136456489562988, "learning_rate": 8.3202416918429e-07, "logits/chosen": -2.5906503200531006, "logits/rejected": -2.5289626121520996, "logps/chosen": -37.400779724121094, "logps/rejected": -130.22901916503906, "loss": 0.3899, "rewards/accuracies": 1.0, "rewards/chosen": 2.0296502113342285, "rewards/margins": 6.528014183044434, "rewards/rejected": -4.498363494873047, "step": 557 }, { "epoch": 1.684637134778511, "grad_norm": 5.001535415649414, "learning_rate": 8.317220543806645e-07, "logits/chosen": -2.571179151535034, "logits/rejected": -2.519683361053467, "logps/chosen": -25.791824340820312, "logps/rejected": -123.82610321044922, "loss": 0.2229, "rewards/accuracies": 1.0, "rewards/chosen": 2.8551509380340576, "rewards/margins": 7.124576091766357, "rewards/rejected": -4.269425868988037, "step": 558 }, { "epoch": 1.6876531573986804, "grad_norm": 4.179035186767578, "learning_rate": 8.314199395770393e-07, "logits/chosen": -2.628474235534668, "logits/rejected": -2.6141767501831055, "logps/chosen": -39.35884094238281, "logps/rejected": -125.76901245117188, "loss": 0.4328, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9679548740386963, "rewards/margins": 6.135637283325195, "rewards/rejected": -4.167682647705078, "step": 559 }, { "epoch": 1.6906691800188502, "grad_norm": 5.19273042678833, "learning_rate": 8.311178247734139e-07, "logits/chosen": -2.663661479949951, "logits/rejected": -2.5600204467773438, "logps/chosen": -22.82951545715332, "logps/rejected": -123.14030456542969, "loss": 0.2899, "rewards/accuracies": 1.0, "rewards/chosen": 3.043506622314453, "rewards/margins": 7.154801845550537, "rewards/rejected": -4.111295223236084, "step": 560 }, { "epoch": 1.6936852026390197, "grad_norm": 6.492900848388672, "learning_rate": 8.308157099697884e-07, "logits/chosen": -2.555906057357788, "logits/rejected": -2.5119171142578125, "logps/chosen": -23.228853225708008, "logps/rejected": -102.17993927001953, "loss": 0.4073, "rewards/accuracies": 0.9375, "rewards/chosen": 3.029310941696167, "rewards/margins": 5.844906806945801, "rewards/rejected": -2.815596580505371, "step": 561 }, { "epoch": 1.6967012252591895, "grad_norm": 3.5976083278656006, "learning_rate": 8.305135951661632e-07, "logits/chosen": -2.7204062938690186, "logits/rejected": -2.649658679962158, "logps/chosen": -40.300559997558594, "logps/rejected": -127.04175567626953, "loss": 0.4239, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9073147773742676, "rewards/margins": 6.3888654708862305, "rewards/rejected": -4.481551170349121, "step": 562 }, { "epoch": 1.699717247879359, "grad_norm": 5.025044918060303, "learning_rate": 8.302114803625378e-07, "logits/chosen": -2.5714995861053467, "logits/rejected": -2.526120185852051, "logps/chosen": -30.87407684326172, "logps/rejected": -126.99202728271484, "loss": 0.3324, "rewards/accuracies": 1.0, "rewards/chosen": 2.690915584564209, "rewards/margins": 7.243697643280029, "rewards/rejected": -4.5527825355529785, "step": 563 }, { "epoch": 1.7027332704995288, "grad_norm": 4.903371334075928, "learning_rate": 8.299093655589123e-07, "logits/chosen": -2.6132993698120117, "logits/rejected": -2.547804117202759, "logps/chosen": -32.27640914916992, "logps/rejected": -144.92750549316406, "loss": 0.3259, "rewards/accuracies": 1.0, "rewards/chosen": 2.2599127292633057, "rewards/margins": 7.773232936859131, "rewards/rejected": -5.513319492340088, "step": 564 }, { "epoch": 1.7057492931196983, "grad_norm": 5.140822887420654, "learning_rate": 8.296072507552869e-07, "logits/chosen": -2.622640609741211, "logits/rejected": -2.6019489765167236, "logps/chosen": -34.33444595336914, "logps/rejected": -127.70674133300781, "loss": 0.296, "rewards/accuracies": 1.0, "rewards/chosen": 2.3898849487304688, "rewards/margins": 7.0008544921875, "rewards/rejected": -4.6109700202941895, "step": 565 }, { "epoch": 1.7087653157398681, "grad_norm": 2.350898027420044, "learning_rate": 8.293051359516617e-07, "logits/chosen": -2.599153518676758, "logits/rejected": -2.590355634689331, "logps/chosen": -28.298093795776367, "logps/rejected": -140.05715942382812, "loss": 0.2564, "rewards/accuracies": 1.0, "rewards/chosen": 2.6639626026153564, "rewards/margins": 8.361699104309082, "rewards/rejected": -5.697735786437988, "step": 566 }, { "epoch": 1.7117813383600375, "grad_norm": 4.663538455963135, "learning_rate": 8.290030211480362e-07, "logits/chosen": -2.6107797622680664, "logits/rejected": -2.5148532390594482, "logps/chosen": -38.112300872802734, "logps/rejected": -125.84368896484375, "loss": 0.4545, "rewards/accuracies": 1.0, "rewards/chosen": 1.875011920928955, "rewards/margins": 6.328166961669922, "rewards/rejected": -4.453155040740967, "step": 567 }, { "epoch": 1.7147973609802074, "grad_norm": 3.5457332134246826, "learning_rate": 8.287009063444108e-07, "logits/chosen": -2.610905170440674, "logits/rejected": -2.6127898693084717, "logps/chosen": -36.175575256347656, "logps/rejected": -130.34690856933594, "loss": 0.3714, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7270257472991943, "rewards/margins": 6.7602458000183105, "rewards/rejected": -5.033220291137695, "step": 568 }, { "epoch": 1.717813383600377, "grad_norm": 3.8563570976257324, "learning_rate": 8.283987915407855e-07, "logits/chosen": -2.6366093158721924, "logits/rejected": -2.6494359970092773, "logps/chosen": -33.03594207763672, "logps/rejected": -146.21087646484375, "loss": 0.3108, "rewards/accuracies": 1.0, "rewards/chosen": 2.5099520683288574, "rewards/margins": 8.293245315551758, "rewards/rejected": -5.783292770385742, "step": 569 }, { "epoch": 1.7208294062205467, "grad_norm": 4.564728260040283, "learning_rate": 8.280966767371601e-07, "logits/chosen": -2.6674323081970215, "logits/rejected": -2.5523667335510254, "logps/chosen": -28.358190536499023, "logps/rejected": -136.51026916503906, "loss": 0.2593, "rewards/accuracies": 1.0, "rewards/chosen": 2.8227550983428955, "rewards/margins": 8.027894973754883, "rewards/rejected": -5.205140113830566, "step": 570 }, { "epoch": 1.7238454288407163, "grad_norm": 7.7254533767700195, "learning_rate": 8.277945619335347e-07, "logits/chosen": -2.714214324951172, "logits/rejected": -2.6127357482910156, "logps/chosen": -32.92487335205078, "logps/rejected": -132.9020233154297, "loss": 0.3517, "rewards/accuracies": 1.0, "rewards/chosen": 2.552263021469116, "rewards/margins": 7.653058052062988, "rewards/rejected": -5.100794792175293, "step": 571 }, { "epoch": 1.726861451460886, "grad_norm": 7.26804256439209, "learning_rate": 8.274924471299093e-07, "logits/chosen": -2.647055149078369, "logits/rejected": -2.534034013748169, "logps/chosen": -34.336727142333984, "logps/rejected": -138.9445037841797, "loss": 0.3205, "rewards/accuracies": 1.0, "rewards/chosen": 2.156466007232666, "rewards/margins": 7.396108150482178, "rewards/rejected": -5.239643096923828, "step": 572 }, { "epoch": 1.7298774740810556, "grad_norm": 4.556045055389404, "learning_rate": 8.271903323262839e-07, "logits/chosen": -2.6506009101867676, "logits/rejected": -2.6542773246765137, "logps/chosen": -31.10702133178711, "logps/rejected": -129.91969299316406, "loss": 0.3438, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5905191898345947, "rewards/margins": 7.292067527770996, "rewards/rejected": -4.7015485763549805, "step": 573 }, { "epoch": 1.7328934967012253, "grad_norm": 5.588433742523193, "learning_rate": 8.268882175226586e-07, "logits/chosen": -2.606783151626587, "logits/rejected": -2.5534839630126953, "logps/chosen": -53.23272705078125, "logps/rejected": -149.87796020507812, "loss": 0.4166, "rewards/accuracies": 1.0, "rewards/chosen": 0.6697826385498047, "rewards/margins": 6.808244705200195, "rewards/rejected": -6.138462066650391, "step": 574 }, { "epoch": 1.735909519321395, "grad_norm": 4.863414764404297, "learning_rate": 8.265861027190332e-07, "logits/chosen": -2.6446142196655273, "logits/rejected": -2.60370135307312, "logps/chosen": -33.19938659667969, "logps/rejected": -142.73716735839844, "loss": 0.3526, "rewards/accuracies": 1.0, "rewards/chosen": 2.070821523666382, "rewards/margins": 7.676774024963379, "rewards/rejected": -5.605952739715576, "step": 575 }, { "epoch": 1.7389255419415646, "grad_norm": 5.502556324005127, "learning_rate": 8.262839879154077e-07, "logits/chosen": -2.622616767883301, "logits/rejected": -2.4796652793884277, "logps/chosen": -36.076595306396484, "logps/rejected": -122.40946197509766, "loss": 0.4305, "rewards/accuracies": 0.90625, "rewards/chosen": 2.0748019218444824, "rewards/margins": 6.234867572784424, "rewards/rejected": -4.160065650939941, "step": 576 }, { "epoch": 1.7419415645617342, "grad_norm": 3.800964117050171, "learning_rate": 8.259818731117824e-07, "logits/chosen": -2.6155126094818115, "logits/rejected": -2.5424680709838867, "logps/chosen": -34.71942138671875, "logps/rejected": -140.08966064453125, "loss": 0.3186, "rewards/accuracies": 1.0, "rewards/chosen": 2.1697373390197754, "rewards/margins": 7.4217529296875, "rewards/rejected": -5.252016067504883, "step": 577 }, { "epoch": 1.7449575871819039, "grad_norm": 3.212265729904175, "learning_rate": 8.256797583081571e-07, "logits/chosen": -2.67349910736084, "logits/rejected": -2.5429248809814453, "logps/chosen": -37.331363677978516, "logps/rejected": -152.93247985839844, "loss": 0.2758, "rewards/accuracies": 1.0, "rewards/chosen": 2.3199658393859863, "rewards/margins": 8.480852127075195, "rewards/rejected": -6.160885810852051, "step": 578 }, { "epoch": 1.7479736098020735, "grad_norm": 4.281749725341797, "learning_rate": 8.253776435045317e-07, "logits/chosen": -2.6180052757263184, "logits/rejected": -2.5331227779388428, "logps/chosen": -29.64714813232422, "logps/rejected": -143.41757202148438, "loss": 0.2358, "rewards/accuracies": 1.0, "rewards/chosen": 2.605649709701538, "rewards/margins": 8.551613807678223, "rewards/rejected": -5.9459638595581055, "step": 579 }, { "epoch": 1.7509896324222431, "grad_norm": 5.713842868804932, "learning_rate": 8.250755287009063e-07, "logits/chosen": -2.704608678817749, "logits/rejected": -2.599090099334717, "logps/chosen": -35.3472900390625, "logps/rejected": -146.58767700195312, "loss": 0.3281, "rewards/accuracies": 1.0, "rewards/chosen": 2.13649320602417, "rewards/margins": 8.144469261169434, "rewards/rejected": -6.0079755783081055, "step": 580 }, { "epoch": 1.7540056550424128, "grad_norm": 3.43765926361084, "learning_rate": 8.24773413897281e-07, "logits/chosen": -2.665863037109375, "logits/rejected": -2.6094512939453125, "logps/chosen": -40.87910842895508, "logps/rejected": -140.50523376464844, "loss": 0.3043, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6098607778549194, "rewards/margins": 7.417531967163086, "rewards/rejected": -5.807671070098877, "step": 581 }, { "epoch": 1.7570216776625824, "grad_norm": 5.164868354797363, "learning_rate": 8.244712990936556e-07, "logits/chosen": -2.664417266845703, "logits/rejected": -2.669048547744751, "logps/chosen": -31.773700714111328, "logps/rejected": -123.13845825195312, "loss": 0.3653, "rewards/accuracies": 1.0, "rewards/chosen": 2.1430435180664062, "rewards/margins": 6.755025386810303, "rewards/rejected": -4.611982345581055, "step": 582 }, { "epoch": 1.760037700282752, "grad_norm": 4.152623176574707, "learning_rate": 8.241691842900301e-07, "logits/chosen": -2.665778875350952, "logits/rejected": -2.616217613220215, "logps/chosen": -30.866527557373047, "logps/rejected": -137.5774688720703, "loss": 0.3626, "rewards/accuracies": 1.0, "rewards/chosen": 2.5618937015533447, "rewards/margins": 7.745116710662842, "rewards/rejected": -5.183222770690918, "step": 583 }, { "epoch": 1.7630537229029217, "grad_norm": 2.5468363761901855, "learning_rate": 8.238670694864048e-07, "logits/chosen": -2.5692696571350098, "logits/rejected": -2.5372190475463867, "logps/chosen": -34.67368698120117, "logps/rejected": -141.7457275390625, "loss": 0.2748, "rewards/accuracies": 1.0, "rewards/chosen": 2.1997082233428955, "rewards/margins": 7.830397605895996, "rewards/rejected": -5.63068962097168, "step": 584 }, { "epoch": 1.7660697455230914, "grad_norm": 9.218463897705078, "learning_rate": 8.235649546827795e-07, "logits/chosen": -2.5558035373687744, "logits/rejected": -2.5394320487976074, "logps/chosen": -37.583961486816406, "logps/rejected": -142.37066650390625, "loss": 0.3708, "rewards/accuracies": 0.96875, "rewards/chosen": 2.012362241744995, "rewards/margins": 7.456643581390381, "rewards/rejected": -5.444281578063965, "step": 585 }, { "epoch": 1.769085768143261, "grad_norm": 4.400758266448975, "learning_rate": 8.23262839879154e-07, "logits/chosen": -2.59794282913208, "logits/rejected": -2.5016531944274902, "logps/chosen": -33.20765686035156, "logps/rejected": -121.92716217041016, "loss": 0.4144, "rewards/accuracies": 0.96875, "rewards/chosen": 2.189411163330078, "rewards/margins": 6.369546890258789, "rewards/rejected": -4.180135250091553, "step": 586 }, { "epoch": 1.7721017907634309, "grad_norm": 5.482017993927002, "learning_rate": 8.229607250755287e-07, "logits/chosen": -2.65327787399292, "logits/rejected": -2.647676706314087, "logps/chosen": -32.50635528564453, "logps/rejected": -133.36660766601562, "loss": 0.2893, "rewards/accuracies": 1.0, "rewards/chosen": 2.1860415935516357, "rewards/margins": 7.380302906036377, "rewards/rejected": -5.194261074066162, "step": 587 }, { "epoch": 1.7751178133836003, "grad_norm": 5.513513565063477, "learning_rate": 8.226586102719033e-07, "logits/chosen": -2.6289327144622803, "logits/rejected": -2.606117010116577, "logps/chosen": -44.151859283447266, "logps/rejected": -131.6847686767578, "loss": 0.4254, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5085227489471436, "rewards/margins": 6.364648342132568, "rewards/rejected": -4.856125354766846, "step": 588 }, { "epoch": 1.7781338360037702, "grad_norm": 3.8462588787078857, "learning_rate": 8.223564954682779e-07, "logits/chosen": -2.671060800552368, "logits/rejected": -2.628692388534546, "logps/chosen": -41.44866943359375, "logps/rejected": -123.79486083984375, "loss": 0.4232, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7845059633255005, "rewards/margins": 6.257498264312744, "rewards/rejected": -4.472992420196533, "step": 589 }, { "epoch": 1.7811498586239396, "grad_norm": 4.618725299835205, "learning_rate": 8.220543806646525e-07, "logits/chosen": -2.6505887508392334, "logits/rejected": -2.5476086139678955, "logps/chosen": -26.286252975463867, "logps/rejected": -127.1490478515625, "loss": 0.3269, "rewards/accuracies": 0.96875, "rewards/chosen": 2.537065267562866, "rewards/margins": 7.1600189208984375, "rewards/rejected": -4.62295389175415, "step": 590 }, { "epoch": 1.7841658812441095, "grad_norm": 3.712348222732544, "learning_rate": 8.217522658610272e-07, "logits/chosen": -2.6714186668395996, "logits/rejected": -2.5965259075164795, "logps/chosen": -33.581748962402344, "logps/rejected": -147.94679260253906, "loss": 0.2946, "rewards/accuracies": 0.9375, "rewards/chosen": 2.1781506538391113, "rewards/margins": 8.237540245056152, "rewards/rejected": -6.059389114379883, "step": 591 }, { "epoch": 1.7871819038642789, "grad_norm": 3.438375234603882, "learning_rate": 8.214501510574017e-07, "logits/chosen": -2.7091901302337646, "logits/rejected": -2.6427860260009766, "logps/chosen": -36.9462776184082, "logps/rejected": -131.08303833007812, "loss": 0.3492, "rewards/accuracies": 0.90625, "rewards/chosen": 1.852560043334961, "rewards/margins": 6.826747894287109, "rewards/rejected": -4.97418737411499, "step": 592 }, { "epoch": 1.7901979264844488, "grad_norm": 4.854276180267334, "learning_rate": 8.211480362537764e-07, "logits/chosen": -2.605114698410034, "logits/rejected": -2.5183372497558594, "logps/chosen": -34.66660690307617, "logps/rejected": -135.13192749023438, "loss": 0.2971, "rewards/accuracies": 1.0, "rewards/chosen": 2.29522442817688, "rewards/margins": 7.515588760375977, "rewards/rejected": -5.220364570617676, "step": 593 }, { "epoch": 1.7932139491046182, "grad_norm": 7.7496771812438965, "learning_rate": 8.20845921450151e-07, "logits/chosen": -2.59916353225708, "logits/rejected": -2.55344820022583, "logps/chosen": -39.44164276123047, "logps/rejected": -150.39805603027344, "loss": 0.3171, "rewards/accuracies": 1.0, "rewards/chosen": 1.735034465789795, "rewards/margins": 7.9638800621032715, "rewards/rejected": -6.228845119476318, "step": 594 }, { "epoch": 1.796229971724788, "grad_norm": 3.754060983657837, "learning_rate": 8.205438066465256e-07, "logits/chosen": -2.611109495162964, "logits/rejected": -2.5618181228637695, "logps/chosen": -28.787120819091797, "logps/rejected": -122.764404296875, "loss": 0.2847, "rewards/accuracies": 0.96875, "rewards/chosen": 2.6189208030700684, "rewards/margins": 7.244719505310059, "rewards/rejected": -4.62579870223999, "step": 595 }, { "epoch": 1.7992459943449575, "grad_norm": 4.638908863067627, "learning_rate": 8.202416918429003e-07, "logits/chosen": -2.6364593505859375, "logits/rejected": -2.5829129219055176, "logps/chosen": -27.181644439697266, "logps/rejected": -138.98138427734375, "loss": 0.2637, "rewards/accuracies": 1.0, "rewards/chosen": 2.963451862335205, "rewards/margins": 8.012051582336426, "rewards/rejected": -5.048600196838379, "step": 596 }, { "epoch": 1.8022620169651273, "grad_norm": 4.2514543533325195, "learning_rate": 8.199395770392749e-07, "logits/chosen": -2.619518518447876, "logits/rejected": -2.5294992923736572, "logps/chosen": -27.81955337524414, "logps/rejected": -124.65446472167969, "loss": 0.3698, "rewards/accuracies": 0.90625, "rewards/chosen": 2.613269805908203, "rewards/margins": 6.9339823722839355, "rewards/rejected": -4.320712089538574, "step": 597 }, { "epoch": 1.8052780395852968, "grad_norm": 4.540503978729248, "learning_rate": 8.196374622356495e-07, "logits/chosen": -2.5947296619415283, "logits/rejected": -2.5964102745056152, "logps/chosen": -33.36017608642578, "logps/rejected": -134.21875, "loss": 0.3474, "rewards/accuracies": 1.0, "rewards/chosen": 2.179690361022949, "rewards/margins": 7.478890895843506, "rewards/rejected": -5.299200057983398, "step": 598 }, { "epoch": 1.8082940622054666, "grad_norm": 4.436826229095459, "learning_rate": 8.193353474320241e-07, "logits/chosen": -2.6798763275146484, "logits/rejected": -2.572556257247925, "logps/chosen": -30.124441146850586, "logps/rejected": -129.71717834472656, "loss": 0.2887, "rewards/accuracies": 1.0, "rewards/chosen": 2.6504173278808594, "rewards/margins": 7.457063674926758, "rewards/rejected": -4.806646347045898, "step": 599 }, { "epoch": 1.811310084825636, "grad_norm": 6.279206275939941, "learning_rate": 8.190332326283988e-07, "logits/chosen": -2.6419920921325684, "logits/rejected": -2.604128122329712, "logps/chosen": -21.65706443786621, "logps/rejected": -117.60472106933594, "loss": 0.3275, "rewards/accuracies": 0.9375, "rewards/chosen": 3.2454025745391846, "rewards/margins": 7.189120292663574, "rewards/rejected": -3.9437179565429688, "step": 600 }, { "epoch": 1.814326107445806, "grad_norm": 5.261145114898682, "learning_rate": 8.187311178247733e-07, "logits/chosen": -2.5690529346466064, "logits/rejected": -2.5276052951812744, "logps/chosen": -34.921443939208984, "logps/rejected": -146.3457794189453, "loss": 0.2655, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9639041423797607, "rewards/margins": 8.066547393798828, "rewards/rejected": -6.1026434898376465, "step": 601 }, { "epoch": 1.8173421300659753, "grad_norm": 2.9947221279144287, "learning_rate": 8.18429003021148e-07, "logits/chosen": -2.6134538650512695, "logits/rejected": -2.5572006702423096, "logps/chosen": -26.468666076660156, "logps/rejected": -135.3523406982422, "loss": 0.2478, "rewards/accuracies": 1.0, "rewards/chosen": 2.955274820327759, "rewards/margins": 7.773889541625977, "rewards/rejected": -4.818614959716797, "step": 602 }, { "epoch": 1.8203581526861452, "grad_norm": 4.3129706382751465, "learning_rate": 8.181268882175226e-07, "logits/chosen": -2.56425142288208, "logits/rejected": -2.547140121459961, "logps/chosen": -27.508298873901367, "logps/rejected": -145.9301300048828, "loss": 0.2906, "rewards/accuracies": 0.96875, "rewards/chosen": 2.4720685482025146, "rewards/margins": 8.368606567382812, "rewards/rejected": -5.896538734436035, "step": 603 }, { "epoch": 1.8203581526861452, "eval_logits/chosen": -2.53389573097229, "eval_logits/rejected": -2.5001659393310547, "eval_logps/chosen": -35.36158752441406, "eval_logps/rejected": -135.50599670410156, "eval_loss": 0.30586037039756775, "eval_rewards/accuracies": 0.969072163105011, "eval_rewards/chosen": 2.4043099880218506, "eval_rewards/margins": 7.327746391296387, "eval_rewards/rejected": -4.923436641693115, "eval_runtime": 695.4845, "eval_samples_per_second": 0.556, "eval_steps_per_second": 0.279, "step": 603 }, { "epoch": 1.8233741753063148, "grad_norm": 4.601969242095947, "learning_rate": 8.178247734138973e-07, "logits/chosen": -2.6558384895324707, "logits/rejected": -2.5601489543914795, "logps/chosen": -29.327619552612305, "logps/rejected": -147.28396606445312, "loss": 0.2855, "rewards/accuracies": 0.96875, "rewards/chosen": 2.9568159580230713, "rewards/margins": 8.646224021911621, "rewards/rejected": -5.689407825469971, "step": 604 }, { "epoch": 1.8263901979264845, "grad_norm": 4.016393661499023, "learning_rate": 8.175226586102719e-07, "logits/chosen": -2.638788938522339, "logits/rejected": -2.5261635780334473, "logps/chosen": -29.84278678894043, "logps/rejected": -139.1228485107422, "loss": 0.2802, "rewards/accuracies": 1.0, "rewards/chosen": 2.4224820137023926, "rewards/margins": 7.618208885192871, "rewards/rejected": -5.1957268714904785, "step": 605 }, { "epoch": 1.8294062205466541, "grad_norm": 4.080695629119873, "learning_rate": 8.172205438066465e-07, "logits/chosen": -2.51251220703125, "logits/rejected": -2.532264471054077, "logps/chosen": -28.099044799804688, "logps/rejected": -113.19165802001953, "loss": 0.4028, "rewards/accuracies": 1.0, "rewards/chosen": 2.6325981616973877, "rewards/margins": 6.3972296714782715, "rewards/rejected": -3.764631748199463, "step": 606 }, { "epoch": 1.8324222431668238, "grad_norm": 4.875034332275391, "learning_rate": 8.169184290030211e-07, "logits/chosen": -2.5594077110290527, "logits/rejected": -2.504826784133911, "logps/chosen": -19.869884490966797, "logps/rejected": -126.23995971679688, "loss": 0.2835, "rewards/accuracies": 0.96875, "rewards/chosen": 3.2848918437957764, "rewards/margins": 8.080093383789062, "rewards/rejected": -4.795201301574707, "step": 607 }, { "epoch": 1.8354382657869934, "grad_norm": 4.469341278076172, "learning_rate": 8.166163141993957e-07, "logits/chosen": -2.521383285522461, "logits/rejected": -2.500223159790039, "logps/chosen": -33.44373321533203, "logps/rejected": -134.10659790039062, "loss": 0.3396, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3754022121429443, "rewards/margins": 6.964468002319336, "rewards/rejected": -4.5890655517578125, "step": 608 }, { "epoch": 1.838454288407163, "grad_norm": 4.418963432312012, "learning_rate": 8.163141993957704e-07, "logits/chosen": -2.6435391902923584, "logits/rejected": -2.590526580810547, "logps/chosen": -42.26728057861328, "logps/rejected": -140.53622436523438, "loss": 0.4322, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5911858081817627, "rewards/margins": 7.236649513244629, "rewards/rejected": -5.645463943481445, "step": 609 }, { "epoch": 1.8414703110273327, "grad_norm": 5.4969682693481445, "learning_rate": 8.16012084592145e-07, "logits/chosen": -2.660684823989868, "logits/rejected": -2.5931849479675293, "logps/chosen": -25.83730125427246, "logps/rejected": -111.57052612304688, "loss": 0.3339, "rewards/accuracies": 0.9375, "rewards/chosen": 3.273106575012207, "rewards/margins": 6.939478874206543, "rewards/rejected": -3.6663718223571777, "step": 610 }, { "epoch": 1.8444863336475024, "grad_norm": 4.527313232421875, "learning_rate": 8.157099697885196e-07, "logits/chosen": -2.6296398639678955, "logits/rejected": -2.5874156951904297, "logps/chosen": -37.88767623901367, "logps/rejected": -129.60226440429688, "loss": 0.4549, "rewards/accuracies": 1.0, "rewards/chosen": 1.7042579650878906, "rewards/margins": 6.608887672424316, "rewards/rejected": -4.904629707336426, "step": 611 }, { "epoch": 1.847502356267672, "grad_norm": 5.811726093292236, "learning_rate": 8.154078549848942e-07, "logits/chosen": -2.6821837425231934, "logits/rejected": -2.65083909034729, "logps/chosen": -24.22327995300293, "logps/rejected": -142.37167358398438, "loss": 0.2105, "rewards/accuracies": 1.0, "rewards/chosen": 2.842219114303589, "rewards/margins": 8.232510566711426, "rewards/rejected": -5.390291213989258, "step": 612 }, { "epoch": 1.8505183788878417, "grad_norm": 4.1901421546936035, "learning_rate": 8.151057401812689e-07, "logits/chosen": -2.5973639488220215, "logits/rejected": -2.4880433082580566, "logps/chosen": -37.71889877319336, "logps/rejected": -134.12530517578125, "loss": 0.4118, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9920765161514282, "rewards/margins": 7.033961296081543, "rewards/rejected": -5.041884422302246, "step": 613 }, { "epoch": 1.8535344015080113, "grad_norm": 3.85427188873291, "learning_rate": 8.148036253776434e-07, "logits/chosen": -2.7057228088378906, "logits/rejected": -2.6337811946868896, "logps/chosen": -21.972286224365234, "logps/rejected": -130.9473876953125, "loss": 0.316, "rewards/accuracies": 1.0, "rewards/chosen": 3.156348705291748, "rewards/margins": 7.893153667449951, "rewards/rejected": -4.736804962158203, "step": 614 }, { "epoch": 1.856550424128181, "grad_norm": 7.070821285247803, "learning_rate": 8.145015105740181e-07, "logits/chosen": -2.6826508045196533, "logits/rejected": -2.5662012100219727, "logps/chosen": -26.61810874938965, "logps/rejected": -131.67047119140625, "loss": 0.2886, "rewards/accuracies": 0.96875, "rewards/chosen": 2.9931559562683105, "rewards/margins": 7.7204437255859375, "rewards/rejected": -4.727288246154785, "step": 615 }, { "epoch": 1.8595664467483506, "grad_norm": 5.7658586502075195, "learning_rate": 8.141993957703928e-07, "logits/chosen": -2.654582977294922, "logits/rejected": -2.5550382137298584, "logps/chosen": -37.69517517089844, "logps/rejected": -134.82766723632812, "loss": 0.3687, "rewards/accuracies": 1.0, "rewards/chosen": 2.233065605163574, "rewards/margins": 7.26954984664917, "rewards/rejected": -5.036484718322754, "step": 616 }, { "epoch": 1.8625824693685202, "grad_norm": 4.542367458343506, "learning_rate": 8.138972809667673e-07, "logits/chosen": -2.5874247550964355, "logits/rejected": -2.6005799770355225, "logps/chosen": -32.508975982666016, "logps/rejected": -128.65187072753906, "loss": 0.396, "rewards/accuracies": 0.9375, "rewards/chosen": 2.1761131286621094, "rewards/margins": 6.68520450592041, "rewards/rejected": -4.509091854095459, "step": 617 }, { "epoch": 1.8655984919886899, "grad_norm": 4.628416061401367, "learning_rate": 8.135951661631419e-07, "logits/chosen": -2.638803243637085, "logits/rejected": -2.53700590133667, "logps/chosen": -38.582149505615234, "logps/rejected": -132.08663940429688, "loss": 0.3944, "rewards/accuracies": 0.96875, "rewards/chosen": 1.81232750415802, "rewards/margins": 6.569287300109863, "rewards/rejected": -4.756959915161133, "step": 618 }, { "epoch": 1.8686145146088595, "grad_norm": 4.181163311004639, "learning_rate": 8.132930513595166e-07, "logits/chosen": -2.552821636199951, "logits/rejected": -2.537008285522461, "logps/chosen": -40.2784538269043, "logps/rejected": -145.46563720703125, "loss": 0.3333, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9038232564926147, "rewards/margins": 7.459002494812012, "rewards/rejected": -5.555178642272949, "step": 619 }, { "epoch": 1.8716305372290292, "grad_norm": 3.9472029209136963, "learning_rate": 8.129909365558912e-07, "logits/chosen": -2.582109212875366, "logits/rejected": -2.5498156547546387, "logps/chosen": -30.82326889038086, "logps/rejected": -114.99478912353516, "loss": 0.4031, "rewards/accuracies": 0.875, "rewards/chosen": 2.3995962142944336, "rewards/margins": 6.020542621612549, "rewards/rejected": -3.620946168899536, "step": 620 }, { "epoch": 1.8746465598491988, "grad_norm": 4.412528038024902, "learning_rate": 8.126888217522658e-07, "logits/chosen": -2.6067562103271484, "logits/rejected": -2.5879833698272705, "logps/chosen": -39.44639205932617, "logps/rejected": -118.4447021484375, "loss": 0.4356, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4851659536361694, "rewards/margins": 5.7389936447143555, "rewards/rejected": -4.253828048706055, "step": 621 }, { "epoch": 1.8776625824693687, "grad_norm": 6.0615129470825195, "learning_rate": 8.123867069486404e-07, "logits/chosen": -2.586939811706543, "logits/rejected": -2.519338846206665, "logps/chosen": -35.84947967529297, "logps/rejected": -129.6017608642578, "loss": 0.3636, "rewards/accuracies": 0.96875, "rewards/chosen": 2.07735538482666, "rewards/margins": 6.486835479736328, "rewards/rejected": -4.409480571746826, "step": 622 }, { "epoch": 1.880678605089538, "grad_norm": 3.7065248489379883, "learning_rate": 8.120845921450151e-07, "logits/chosen": -2.6750316619873047, "logits/rejected": -2.6428985595703125, "logps/chosen": -27.031288146972656, "logps/rejected": -136.67576599121094, "loss": 0.2633, "rewards/accuracies": 0.96875, "rewards/chosen": 3.1802475452423096, "rewards/margins": 8.145842552185059, "rewards/rejected": -4.96559476852417, "step": 623 }, { "epoch": 1.883694627709708, "grad_norm": 3.7724649906158447, "learning_rate": 8.117824773413897e-07, "logits/chosen": -2.6444334983825684, "logits/rejected": -2.6002345085144043, "logps/chosen": -30.844696044921875, "logps/rejected": -142.61419677734375, "loss": 0.2617, "rewards/accuracies": 1.0, "rewards/chosen": 2.4321341514587402, "rewards/margins": 8.041814804077148, "rewards/rejected": -5.609681606292725, "step": 624 }, { "epoch": 1.8867106503298774, "grad_norm": 5.755083084106445, "learning_rate": 8.114803625377643e-07, "logits/chosen": -2.6596438884735107, "logits/rejected": -2.600454568862915, "logps/chosen": -35.22003936767578, "logps/rejected": -128.67784118652344, "loss": 0.333, "rewards/accuracies": 0.96875, "rewards/chosen": 2.322047472000122, "rewards/margins": 6.959865093231201, "rewards/rejected": -4.6378173828125, "step": 625 }, { "epoch": 1.8897266729500473, "grad_norm": 3.125730037689209, "learning_rate": 8.11178247734139e-07, "logits/chosen": -2.6290321350097656, "logits/rejected": -2.515388250350952, "logps/chosen": -30.34801483154297, "logps/rejected": -141.35601806640625, "loss": 0.2334, "rewards/accuracies": 0.96875, "rewards/chosen": 2.587134599685669, "rewards/margins": 8.289716720581055, "rewards/rejected": -5.702581882476807, "step": 626 }, { "epoch": 1.8927426955702167, "grad_norm": 4.678775787353516, "learning_rate": 8.108761329305136e-07, "logits/chosen": -2.5641369819641113, "logits/rejected": -2.5426025390625, "logps/chosen": -38.13385772705078, "logps/rejected": -122.11064910888672, "loss": 0.3907, "rewards/accuracies": 0.9375, "rewards/chosen": 2.074162483215332, "rewards/margins": 6.435022830963135, "rewards/rejected": -4.360860824584961, "step": 627 }, { "epoch": 1.8957587181903865, "grad_norm": 6.166533470153809, "learning_rate": 8.105740181268882e-07, "logits/chosen": -2.654564380645752, "logits/rejected": -2.542783737182617, "logps/chosen": -48.24378204345703, "logps/rejected": -149.3909149169922, "loss": 0.3702, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2617013454437256, "rewards/margins": 7.387240409851074, "rewards/rejected": -6.125539779663086, "step": 628 }, { "epoch": 1.898774740810556, "grad_norm": 4.268416881561279, "learning_rate": 8.102719033232627e-07, "logits/chosen": -2.515092611312866, "logits/rejected": -2.4592318534851074, "logps/chosen": -33.916500091552734, "logps/rejected": -132.12326049804688, "loss": 0.3384, "rewards/accuracies": 0.96875, "rewards/chosen": 2.283944845199585, "rewards/margins": 7.382139205932617, "rewards/rejected": -5.098194599151611, "step": 629 }, { "epoch": 1.9017907634307258, "grad_norm": 3.4491827487945557, "learning_rate": 8.099697885196374e-07, "logits/chosen": -2.597116470336914, "logits/rejected": -2.5413217544555664, "logps/chosen": -36.58485412597656, "logps/rejected": -151.4737548828125, "loss": 0.2688, "rewards/accuracies": 1.0, "rewards/chosen": 2.479546546936035, "rewards/margins": 8.769434928894043, "rewards/rejected": -6.28988790512085, "step": 630 }, { "epoch": 1.9048067860508953, "grad_norm": 5.3820037841796875, "learning_rate": 8.096676737160121e-07, "logits/chosen": -2.6374263763427734, "logits/rejected": -2.5580568313598633, "logps/chosen": -37.15319061279297, "logps/rejected": -144.4940948486328, "loss": 0.3441, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7803126573562622, "rewards/margins": 7.620912551879883, "rewards/rejected": -5.840600490570068, "step": 631 }, { "epoch": 1.9078228086710651, "grad_norm": 5.143025875091553, "learning_rate": 8.093655589123867e-07, "logits/chosen": -2.6480555534362793, "logits/rejected": -2.532674551010132, "logps/chosen": -34.743804931640625, "logps/rejected": -141.03887939453125, "loss": 0.3825, "rewards/accuracies": 0.9375, "rewards/chosen": 2.1869378089904785, "rewards/margins": 7.832551956176758, "rewards/rejected": -5.645614147186279, "step": 632 }, { "epoch": 1.9108388312912346, "grad_norm": 3.56280517578125, "learning_rate": 8.090634441087612e-07, "logits/chosen": -2.639831066131592, "logits/rejected": -2.591881513595581, "logps/chosen": -35.8949089050293, "logps/rejected": -138.73907470703125, "loss": 0.292, "rewards/accuracies": 0.96875, "rewards/chosen": 2.001603841781616, "rewards/margins": 7.6454877853393555, "rewards/rejected": -5.64388370513916, "step": 633 }, { "epoch": 1.9138548539114044, "grad_norm": 5.071743965148926, "learning_rate": 8.08761329305136e-07, "logits/chosen": -2.667919158935547, "logits/rejected": -2.582090377807617, "logps/chosen": -42.15703582763672, "logps/rejected": -145.14727783203125, "loss": 0.3706, "rewards/accuracies": 1.0, "rewards/chosen": 2.1279311180114746, "rewards/margins": 7.703741550445557, "rewards/rejected": -5.57581090927124, "step": 634 }, { "epoch": 1.9168708765315738, "grad_norm": 5.201121807098389, "learning_rate": 8.084592145015106e-07, "logits/chosen": -2.5213100910186768, "logits/rejected": -2.468045949935913, "logps/chosen": -33.274925231933594, "logps/rejected": -134.68650817871094, "loss": 0.3332, "rewards/accuracies": 1.0, "rewards/chosen": 2.1178317070007324, "rewards/margins": 7.248419761657715, "rewards/rejected": -5.130587577819824, "step": 635 }, { "epoch": 1.9198868991517437, "grad_norm": 2.8298418521881104, "learning_rate": 8.081570996978851e-07, "logits/chosen": -2.614720344543457, "logits/rejected": -2.529386520385742, "logps/chosen": -31.217571258544922, "logps/rejected": -134.5762176513672, "loss": 0.3345, "rewards/accuracies": 0.9375, "rewards/chosen": 2.6177003383636475, "rewards/margins": 7.678170680999756, "rewards/rejected": -5.0604705810546875, "step": 636 }, { "epoch": 1.9229029217719131, "grad_norm": 4.666352272033691, "learning_rate": 8.078549848942597e-07, "logits/chosen": -2.602982759475708, "logits/rejected": -2.532914161682129, "logps/chosen": -29.392614364624023, "logps/rejected": -125.1301040649414, "loss": 0.2915, "rewards/accuracies": 1.0, "rewards/chosen": 2.8358869552612305, "rewards/margins": 7.499133110046387, "rewards/rejected": -4.663247108459473, "step": 637 }, { "epoch": 1.925918944392083, "grad_norm": 4.25257682800293, "learning_rate": 8.075528700906345e-07, "logits/chosen": -2.62113094329834, "logits/rejected": -2.553269624710083, "logps/chosen": -29.451290130615234, "logps/rejected": -127.69596862792969, "loss": 0.3247, "rewards/accuracies": 1.0, "rewards/chosen": 3.0354151725769043, "rewards/margins": 7.7324538230896, "rewards/rejected": -4.697038650512695, "step": 638 }, { "epoch": 1.9289349670122526, "grad_norm": 5.204339027404785, "learning_rate": 8.07250755287009e-07, "logits/chosen": -2.591057062149048, "logits/rejected": -2.530442714691162, "logps/chosen": -35.90325164794922, "logps/rejected": -138.71324157714844, "loss": 0.3632, "rewards/accuracies": 0.9375, "rewards/chosen": 2.191315174102783, "rewards/margins": 7.4493865966796875, "rewards/rejected": -5.258070945739746, "step": 639 }, { "epoch": 1.9319509896324223, "grad_norm": 3.7518808841705322, "learning_rate": 8.069486404833836e-07, "logits/chosen": -2.688203811645508, "logits/rejected": -2.5549230575561523, "logps/chosen": -30.905860900878906, "logps/rejected": -141.18841552734375, "loss": 0.2366, "rewards/accuracies": 1.0, "rewards/chosen": 2.8436174392700195, "rewards/margins": 8.463417053222656, "rewards/rejected": -5.619799613952637, "step": 640 }, { "epoch": 1.934967012252592, "grad_norm": 5.687093257904053, "learning_rate": 8.066465256797583e-07, "logits/chosen": -2.5071911811828613, "logits/rejected": -2.52005934715271, "logps/chosen": -43.102394104003906, "logps/rejected": -141.85809326171875, "loss": 0.3829, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6165684461593628, "rewards/margins": 7.165586948394775, "rewards/rejected": -5.549018383026123, "step": 641 }, { "epoch": 1.9379830348727616, "grad_norm": 4.477337837219238, "learning_rate": 8.063444108761329e-07, "logits/chosen": -2.6116719245910645, "logits/rejected": -2.54403018951416, "logps/chosen": -29.049020767211914, "logps/rejected": -135.65682983398438, "loss": 0.334, "rewards/accuracies": 0.96875, "rewards/chosen": 2.686342477798462, "rewards/margins": 7.827935218811035, "rewards/rejected": -5.141592979431152, "step": 642 }, { "epoch": 1.9409990574929312, "grad_norm": 4.533672332763672, "learning_rate": 8.060422960725075e-07, "logits/chosen": -2.666107654571533, "logits/rejected": -2.605492115020752, "logps/chosen": -27.7521915435791, "logps/rejected": -112.30829620361328, "loss": 0.3931, "rewards/accuracies": 0.96875, "rewards/chosen": 2.575455665588379, "rewards/margins": 6.140267372131348, "rewards/rejected": -3.564812183380127, "step": 643 }, { "epoch": 1.9440150801131009, "grad_norm": 4.529966831207275, "learning_rate": 8.057401812688821e-07, "logits/chosen": -2.5659289360046387, "logits/rejected": -2.5023348331451416, "logps/chosen": -34.118385314941406, "logps/rejected": -136.15963745117188, "loss": 0.3017, "rewards/accuracies": 1.0, "rewards/chosen": 2.1770737171173096, "rewards/margins": 7.744212627410889, "rewards/rejected": -5.567138195037842, "step": 644 }, { "epoch": 1.9470311027332705, "grad_norm": 4.530795097351074, "learning_rate": 8.054380664652568e-07, "logits/chosen": -2.559563636779785, "logits/rejected": -2.467134952545166, "logps/chosen": -32.089324951171875, "logps/rejected": -135.76296997070312, "loss": 0.3282, "rewards/accuracies": 0.9375, "rewards/chosen": 2.5019617080688477, "rewards/margins": 7.653574466705322, "rewards/rejected": -5.151612758636475, "step": 645 }, { "epoch": 1.9500471253534402, "grad_norm": 6.219491481781006, "learning_rate": 8.051359516616314e-07, "logits/chosen": -2.633960247039795, "logits/rejected": -2.6035397052764893, "logps/chosen": -27.170166015625, "logps/rejected": -109.47976684570312, "loss": 0.3606, "rewards/accuracies": 1.0, "rewards/chosen": 3.007132053375244, "rewards/margins": 6.581594944000244, "rewards/rejected": -3.574463129043579, "step": 646 }, { "epoch": 1.9530631479736098, "grad_norm": 5.005630016326904, "learning_rate": 8.04833836858006e-07, "logits/chosen": -2.58880615234375, "logits/rejected": -2.540952682495117, "logps/chosen": -36.47801208496094, "logps/rejected": -141.13491821289062, "loss": 0.3369, "rewards/accuracies": 0.96875, "rewards/chosen": 1.857330322265625, "rewards/margins": 7.558420181274414, "rewards/rejected": -5.701090335845947, "step": 647 }, { "epoch": 1.9560791705937794, "grad_norm": 7.566233158111572, "learning_rate": 8.045317220543805e-07, "logits/chosen": -2.5718870162963867, "logits/rejected": -2.5368809700012207, "logps/chosen": -30.309301376342773, "logps/rejected": -135.01548767089844, "loss": 0.3039, "rewards/accuracies": 1.0, "rewards/chosen": 2.6134798526763916, "rewards/margins": 7.939877986907959, "rewards/rejected": -5.326397895812988, "step": 648 }, { "epoch": 1.959095193213949, "grad_norm": 6.624753475189209, "learning_rate": 8.042296072507553e-07, "logits/chosen": -2.592090606689453, "logits/rejected": -2.511251449584961, "logps/chosen": -33.38520812988281, "logps/rejected": -120.79405975341797, "loss": 0.414, "rewards/accuracies": 1.0, "rewards/chosen": 2.052351713180542, "rewards/margins": 6.212430000305176, "rewards/rejected": -4.160078525543213, "step": 649 }, { "epoch": 1.9621112158341187, "grad_norm": 6.169089317321777, "learning_rate": 8.039274924471299e-07, "logits/chosen": -2.671034812927246, "logits/rejected": -2.565246343612671, "logps/chosen": -26.957366943359375, "logps/rejected": -130.53358459472656, "loss": 0.2931, "rewards/accuracies": 0.96875, "rewards/chosen": 2.6354217529296875, "rewards/margins": 7.61710262298584, "rewards/rejected": -4.981680393218994, "step": 650 }, { "epoch": 1.9651272384542884, "grad_norm": 4.589166641235352, "learning_rate": 8.036253776435044e-07, "logits/chosen": -2.573855400085449, "logits/rejected": -2.555612564086914, "logps/chosen": -38.53700256347656, "logps/rejected": -124.1783676147461, "loss": 0.4492, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7216639518737793, "rewards/margins": 6.458789348602295, "rewards/rejected": -4.737125873565674, "step": 651 }, { "epoch": 1.968143261074458, "grad_norm": 6.933979511260986, "learning_rate": 8.033232628398791e-07, "logits/chosen": -2.5912363529205322, "logits/rejected": -2.5049123764038086, "logps/chosen": -41.13655090332031, "logps/rejected": -123.64097595214844, "loss": 0.4313, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5722627639770508, "rewards/margins": 6.064598083496094, "rewards/rejected": -4.492335319519043, "step": 652 }, { "epoch": 1.9711592836946277, "grad_norm": 4.201910495758057, "learning_rate": 8.030211480362538e-07, "logits/chosen": -2.5402510166168213, "logits/rejected": -2.4612722396850586, "logps/chosen": -40.462860107421875, "logps/rejected": -140.89456176757812, "loss": 0.3089, "rewards/accuracies": 1.0, "rewards/chosen": 1.9044950008392334, "rewards/margins": 7.516531467437744, "rewards/rejected": -5.61203670501709, "step": 653 }, { "epoch": 1.9741753063147973, "grad_norm": 3.9542763233184814, "learning_rate": 8.027190332326284e-07, "logits/chosen": -2.6164793968200684, "logits/rejected": -2.5692293643951416, "logps/chosen": -28.460487365722656, "logps/rejected": -135.0167236328125, "loss": 0.2635, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5215585231781006, "rewards/margins": 7.7877020835876465, "rewards/rejected": -5.266143321990967, "step": 654 }, { "epoch": 1.977191328934967, "grad_norm": 6.295993804931641, "learning_rate": 8.024169184290029e-07, "logits/chosen": -2.692164182662964, "logits/rejected": -2.6073265075683594, "logps/chosen": -32.98140335083008, "logps/rejected": -121.47051239013672, "loss": 0.3781, "rewards/accuracies": 0.9375, "rewards/chosen": 2.1565628051757812, "rewards/margins": 6.534505367279053, "rewards/rejected": -4.37794303894043, "step": 655 }, { "epoch": 1.9802073515551366, "grad_norm": 4.77182674407959, "learning_rate": 8.021148036253777e-07, "logits/chosen": -2.6715478897094727, "logits/rejected": -2.5297553539276123, "logps/chosen": -45.14195251464844, "logps/rejected": -123.46617126464844, "loss": 0.4656, "rewards/accuracies": 1.0, "rewards/chosen": 1.5659868717193604, "rewards/margins": 5.70081901550293, "rewards/rejected": -4.134832382202148, "step": 656 }, { "epoch": 1.9832233741753065, "grad_norm": 4.773250102996826, "learning_rate": 8.018126888217523e-07, "logits/chosen": -2.6512253284454346, "logits/rejected": -2.5220046043395996, "logps/chosen": -30.166837692260742, "logps/rejected": -125.92536926269531, "loss": 0.3882, "rewards/accuracies": 1.0, "rewards/chosen": 2.393599510192871, "rewards/margins": 6.853672981262207, "rewards/rejected": -4.460073947906494, "step": 657 }, { "epoch": 1.986239396795476, "grad_norm": 6.489116191864014, "learning_rate": 8.015105740181268e-07, "logits/chosen": -2.5619592666625977, "logits/rejected": -2.578106164932251, "logps/chosen": -24.57448959350586, "logps/rejected": -131.63502502441406, "loss": 0.2489, "rewards/accuracies": 0.96875, "rewards/chosen": 3.1414871215820312, "rewards/margins": 8.215377807617188, "rewards/rejected": -5.073890686035156, "step": 658 }, { "epoch": 1.9892554194156458, "grad_norm": 4.6214070320129395, "learning_rate": 8.012084592145014e-07, "logits/chosen": -2.6089837551116943, "logits/rejected": -2.5328540802001953, "logps/chosen": -32.21451187133789, "logps/rejected": -133.2964630126953, "loss": 0.3091, "rewards/accuracies": 0.96875, "rewards/chosen": 2.333951950073242, "rewards/margins": 7.414024353027344, "rewards/rejected": -5.080072402954102, "step": 659 }, { "epoch": 1.9922714420358152, "grad_norm": 6.71843957901001, "learning_rate": 8.009063444108762e-07, "logits/chosen": -2.5947463512420654, "logits/rejected": -2.5766406059265137, "logps/chosen": -30.313858032226562, "logps/rejected": -124.83808898925781, "loss": 0.3717, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5574839115142822, "rewards/margins": 6.837207794189453, "rewards/rejected": -4.279723644256592, "step": 660 }, { "epoch": 1.995287464655985, "grad_norm": 5.539814472198486, "learning_rate": 8.006042296072507e-07, "logits/chosen": -2.558635950088501, "logits/rejected": -2.5710504055023193, "logps/chosen": -32.97739791870117, "logps/rejected": -130.0188446044922, "loss": 0.2673, "rewards/accuracies": 0.96875, "rewards/chosen": 2.245152711868286, "rewards/margins": 7.213390827178955, "rewards/rejected": -4.968238353729248, "step": 661 }, { "epoch": 1.9983034872761545, "grad_norm": 2.8360416889190674, "learning_rate": 8.003021148036253e-07, "logits/chosen": -2.589587450027466, "logits/rejected": -2.5493617057800293, "logps/chosen": -34.77857208251953, "logps/rejected": -126.39409637451172, "loss": 0.3483, "rewards/accuracies": 0.9375, "rewards/chosen": 2.072476625442505, "rewards/margins": 6.629177093505859, "rewards/rejected": -4.556700229644775, "step": 662 }, { "epoch": 2.00301602262017, "grad_norm": 5.369045257568359, "learning_rate": 8e-07, "logits/chosen": -2.6161246299743652, "logits/rejected": -2.577303171157837, "logps/chosen": -36.94430923461914, "logps/rejected": -137.82525634765625, "loss": 0.3396, "rewards/accuracies": 0.9722222089767456, "rewards/chosen": 2.43228816986084, "rewards/margins": 7.812192440032959, "rewards/rejected": -5.379903793334961, "step": 663 }, { "epoch": 2.0060320452403393, "grad_norm": 4.985427379608154, "learning_rate": 7.996978851963746e-07, "logits/chosen": -2.65378475189209, "logits/rejected": -2.5805623531341553, "logps/chosen": -38.19184875488281, "logps/rejected": -131.6090850830078, "loss": 0.378, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7245471477508545, "rewards/margins": 6.944536209106445, "rewards/rejected": -5.219989776611328, "step": 664 }, { "epoch": 2.009048067860509, "grad_norm": 4.27701473236084, "learning_rate": 7.993957703927492e-07, "logits/chosen": -2.542177200317383, "logits/rejected": -2.553853988647461, "logps/chosen": -38.321685791015625, "logps/rejected": -155.3750762939453, "loss": 0.304, "rewards/accuracies": 1.0, "rewards/chosen": 1.9336915016174316, "rewards/margins": 8.683610916137695, "rewards/rejected": -6.749919891357422, "step": 665 }, { "epoch": 2.0120640904806786, "grad_norm": 4.8042097091674805, "learning_rate": 7.990936555891238e-07, "logits/chosen": -2.6654205322265625, "logits/rejected": -2.6044859886169434, "logps/chosen": -26.297584533691406, "logps/rejected": -123.96743774414062, "loss": 0.3757, "rewards/accuracies": 0.96875, "rewards/chosen": 2.8181567192077637, "rewards/margins": 7.31463098526001, "rewards/rejected": -4.496474266052246, "step": 666 }, { "epoch": 2.0150801131008484, "grad_norm": 5.281298637390137, "learning_rate": 7.987915407854984e-07, "logits/chosen": -2.5530874729156494, "logits/rejected": -2.5084056854248047, "logps/chosen": -31.040607452392578, "logps/rejected": -130.0478515625, "loss": 0.3369, "rewards/accuracies": 0.96875, "rewards/chosen": 2.426583766937256, "rewards/margins": 7.376640319824219, "rewards/rejected": -4.950056552886963, "step": 667 }, { "epoch": 2.018096135721018, "grad_norm": 5.234290599822998, "learning_rate": 7.984894259818731e-07, "logits/chosen": -2.5366063117980957, "logits/rejected": -2.5047998428344727, "logps/chosen": -45.424827575683594, "logps/rejected": -147.46044921875, "loss": 0.3471, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4027591943740845, "rewards/margins": 7.788963317871094, "rewards/rejected": -6.386204242706299, "step": 668 }, { "epoch": 2.0211121583411877, "grad_norm": 4.658989429473877, "learning_rate": 7.981873111782477e-07, "logits/chosen": -2.5485987663269043, "logits/rejected": -2.5814430713653564, "logps/chosen": -36.455322265625, "logps/rejected": -141.4229736328125, "loss": 0.3449, "rewards/accuracies": 1.0, "rewards/chosen": 1.9773554801940918, "rewards/margins": 7.6906232833862305, "rewards/rejected": -5.713267803192139, "step": 669 }, { "epoch": 2.024128180961357, "grad_norm": 3.4848599433898926, "learning_rate": 7.978851963746223e-07, "logits/chosen": -2.5753872394561768, "logits/rejected": -2.562534809112549, "logps/chosen": -33.789955139160156, "logps/rejected": -145.01388549804688, "loss": 0.3006, "rewards/accuracies": 0.90625, "rewards/chosen": 2.1071369647979736, "rewards/margins": 8.072259902954102, "rewards/rejected": -5.965122699737549, "step": 670 }, { "epoch": 2.024128180961357, "eval_logits/chosen": -2.5407490730285645, "eval_logits/rejected": -2.4907314777374268, "eval_logps/chosen": -37.612361907958984, "eval_logps/rejected": -142.67782592773438, "eval_loss": 0.3060208559036255, "eval_rewards/accuracies": 0.9716494679450989, "eval_rewards/chosen": 2.1792328357696533, "eval_rewards/margins": 7.819851875305176, "eval_rewards/rejected": -5.640619277954102, "eval_runtime": 701.6271, "eval_samples_per_second": 0.552, "eval_steps_per_second": 0.277, "step": 670 }, { "epoch": 2.027144203581527, "grad_norm": 5.81572961807251, "learning_rate": 7.97583081570997e-07, "logits/chosen": -2.5035791397094727, "logits/rejected": -2.4679267406463623, "logps/chosen": -26.21890640258789, "logps/rejected": -129.11746215820312, "loss": 0.2548, "rewards/accuracies": 1.0, "rewards/chosen": 2.8596839904785156, "rewards/margins": 7.7753777503967285, "rewards/rejected": -4.915694236755371, "step": 671 }, { "epoch": 2.0301602262016964, "grad_norm": 4.9879045486450195, "learning_rate": 7.972809667673716e-07, "logits/chosen": -2.5768373012542725, "logits/rejected": -2.477611541748047, "logps/chosen": -31.684911727905273, "logps/rejected": -155.65098571777344, "loss": 0.2393, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5091896057128906, "rewards/margins": 9.119954109191895, "rewards/rejected": -6.610764980316162, "step": 672 }, { "epoch": 2.0331762488218663, "grad_norm": 8.30786418914795, "learning_rate": 7.969788519637461e-07, "logits/chosen": -2.599597930908203, "logits/rejected": -2.5234532356262207, "logps/chosen": -29.399824142456055, "logps/rejected": -159.2489776611328, "loss": 0.1911, "rewards/accuracies": 1.0, "rewards/chosen": 2.496750831604004, "rewards/margins": 9.28547477722168, "rewards/rejected": -6.788723945617676, "step": 673 }, { "epoch": 2.0361922714420357, "grad_norm": 5.512387752532959, "learning_rate": 7.966767371601208e-07, "logits/chosen": -2.5789523124694824, "logits/rejected": -2.487643003463745, "logps/chosen": -34.9536018371582, "logps/rejected": -129.67837524414062, "loss": 0.3753, "rewards/accuracies": 1.0, "rewards/chosen": 1.9897890090942383, "rewards/margins": 7.047908782958984, "rewards/rejected": -5.058119773864746, "step": 674 }, { "epoch": 2.0392082940622056, "grad_norm": 4.672743797302246, "learning_rate": 7.963746223564955e-07, "logits/chosen": -2.596385955810547, "logits/rejected": -2.503612995147705, "logps/chosen": -23.12519073486328, "logps/rejected": -121.13328552246094, "loss": 0.2939, "rewards/accuracies": 0.96875, "rewards/chosen": 3.0141654014587402, "rewards/margins": 7.490973949432373, "rewards/rejected": -4.476808547973633, "step": 675 }, { "epoch": 2.042224316682375, "grad_norm": 4.753532886505127, "learning_rate": 7.9607250755287e-07, "logits/chosen": -2.6193723678588867, "logits/rejected": -2.534262180328369, "logps/chosen": -30.363590240478516, "logps/rejected": -143.31155395507812, "loss": 0.3172, "rewards/accuracies": 0.96875, "rewards/chosen": 2.390141010284424, "rewards/margins": 8.290345191955566, "rewards/rejected": -5.900203704833984, "step": 676 }, { "epoch": 2.045240339302545, "grad_norm": 4.7799506187438965, "learning_rate": 7.957703927492446e-07, "logits/chosen": -2.7069857120513916, "logits/rejected": -2.6799445152282715, "logps/chosen": -26.575525283813477, "logps/rejected": -139.44918823242188, "loss": 0.2784, "rewards/accuracies": 1.0, "rewards/chosen": 2.7708847522735596, "rewards/margins": 8.393278121948242, "rewards/rejected": -5.622393608093262, "step": 677 }, { "epoch": 2.0482563619227143, "grad_norm": 5.424606800079346, "learning_rate": 7.954682779456193e-07, "logits/chosen": -2.6218557357788086, "logits/rejected": -2.4759254455566406, "logps/chosen": -43.21501159667969, "logps/rejected": -148.32371520996094, "loss": 0.3103, "rewards/accuracies": 0.96875, "rewards/chosen": 1.6606569290161133, "rewards/margins": 8.121703147888184, "rewards/rejected": -6.4610466957092285, "step": 678 }, { "epoch": 2.051272384542884, "grad_norm": 5.458117485046387, "learning_rate": 7.95166163141994e-07, "logits/chosen": -2.5924153327941895, "logits/rejected": -2.5285534858703613, "logps/chosen": -38.45952606201172, "logps/rejected": -127.86441040039062, "loss": 0.4472, "rewards/accuracies": 1.0, "rewards/chosen": 1.5668017864227295, "rewards/margins": 6.669527530670166, "rewards/rejected": -5.102726459503174, "step": 679 }, { "epoch": 2.0542884071630536, "grad_norm": 3.2662851810455322, "learning_rate": 7.948640483383685e-07, "logits/chosen": -2.5930089950561523, "logits/rejected": -2.5736160278320312, "logps/chosen": -31.404510498046875, "logps/rejected": -132.09715270996094, "loss": 0.2811, "rewards/accuracies": 0.96875, "rewards/chosen": 2.402240514755249, "rewards/margins": 7.615140914916992, "rewards/rejected": -5.212900638580322, "step": 680 }, { "epoch": 2.0573044297832235, "grad_norm": 4.902634620666504, "learning_rate": 7.945619335347432e-07, "logits/chosen": -2.679875373840332, "logits/rejected": -2.624401807785034, "logps/chosen": -31.573808670043945, "logps/rejected": -118.36007690429688, "loss": 0.3819, "rewards/accuracies": 0.9375, "rewards/chosen": 2.294935941696167, "rewards/margins": 6.298496246337891, "rewards/rejected": -4.0035600662231445, "step": 681 }, { "epoch": 2.060320452403393, "grad_norm": 6.443541526794434, "learning_rate": 7.942598187311178e-07, "logits/chosen": -2.5454843044281006, "logits/rejected": -2.4952447414398193, "logps/chosen": -45.04632568359375, "logps/rejected": -145.23574829101562, "loss": 0.3993, "rewards/accuracies": 1.0, "rewards/chosen": 1.5328664779663086, "rewards/margins": 7.383419990539551, "rewards/rejected": -5.850553512573242, "step": 682 }, { "epoch": 2.0633364750235628, "grad_norm": 5.956549644470215, "learning_rate": 7.939577039274924e-07, "logits/chosen": -2.654797315597534, "logits/rejected": -2.54665470123291, "logps/chosen": -35.04124069213867, "logps/rejected": -134.46563720703125, "loss": 0.3301, "rewards/accuracies": 1.0, "rewards/chosen": 2.199040412902832, "rewards/margins": 7.300161838531494, "rewards/rejected": -5.101121425628662, "step": 683 }, { "epoch": 2.066352497643732, "grad_norm": 4.354245662689209, "learning_rate": 7.93655589123867e-07, "logits/chosen": -2.625621795654297, "logits/rejected": -2.5235681533813477, "logps/chosen": -36.084625244140625, "logps/rejected": -135.1458282470703, "loss": 0.3967, "rewards/accuracies": 1.0, "rewards/chosen": 1.8788774013519287, "rewards/margins": 6.735532760620117, "rewards/rejected": -4.856655597686768, "step": 684 }, { "epoch": 2.069368520263902, "grad_norm": 3.897725820541382, "learning_rate": 7.933534743202417e-07, "logits/chosen": -2.5703494548797607, "logits/rejected": -2.5124683380126953, "logps/chosen": -34.936988830566406, "logps/rejected": -147.24685668945312, "loss": 0.2532, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3678832054138184, "rewards/margins": 8.559107780456543, "rewards/rejected": -6.191225051879883, "step": 685 }, { "epoch": 2.0723845428840715, "grad_norm": 4.208485126495361, "learning_rate": 7.930513595166163e-07, "logits/chosen": -2.5798680782318115, "logits/rejected": -2.4993932247161865, "logps/chosen": -32.590518951416016, "logps/rejected": -128.789306640625, "loss": 0.2737, "rewards/accuracies": 1.0, "rewards/chosen": 2.638434648513794, "rewards/margins": 7.487374305725098, "rewards/rejected": -4.848940849304199, "step": 686 }, { "epoch": 2.0754005655042413, "grad_norm": 8.216657638549805, "learning_rate": 7.927492447129909e-07, "logits/chosen": -2.5323901176452637, "logits/rejected": -2.4927256107330322, "logps/chosen": -35.52788162231445, "logps/rejected": -142.7589569091797, "loss": 0.3149, "rewards/accuracies": 0.96875, "rewards/chosen": 2.076787233352661, "rewards/margins": 8.071463584899902, "rewards/rejected": -5.994676113128662, "step": 687 }, { "epoch": 2.0784165881244108, "grad_norm": 5.828702449798584, "learning_rate": 7.924471299093656e-07, "logits/chosen": -2.595963716506958, "logits/rejected": -2.5515921115875244, "logps/chosen": -36.19797134399414, "logps/rejected": -139.81982421875, "loss": 0.274, "rewards/accuracies": 1.0, "rewards/chosen": 2.297197103500366, "rewards/margins": 7.841348171234131, "rewards/rejected": -5.544151306152344, "step": 688 }, { "epoch": 2.0814326107445806, "grad_norm": 5.931180953979492, "learning_rate": 7.921450151057401e-07, "logits/chosen": -2.609804391860962, "logits/rejected": -2.4691147804260254, "logps/chosen": -29.466310501098633, "logps/rejected": -140.49342346191406, "loss": 0.2653, "rewards/accuracies": 0.96875, "rewards/chosen": 2.4000473022460938, "rewards/margins": 7.954187870025635, "rewards/rejected": -5.554141044616699, "step": 689 }, { "epoch": 2.08444863336475, "grad_norm": 4.587561130523682, "learning_rate": 7.918429003021148e-07, "logits/chosen": -2.5718324184417725, "logits/rejected": -2.507877826690674, "logps/chosen": -30.870769500732422, "logps/rejected": -149.50599670410156, "loss": 0.2461, "rewards/accuracies": 0.9375, "rewards/chosen": 2.477968454360962, "rewards/margins": 8.62702465057373, "rewards/rejected": -6.149056434631348, "step": 690 }, { "epoch": 2.08746465598492, "grad_norm": 5.4503679275512695, "learning_rate": 7.915407854984894e-07, "logits/chosen": -2.680358648300171, "logits/rejected": -2.6032004356384277, "logps/chosen": -21.82211685180664, "logps/rejected": -131.80816650390625, "loss": 0.2622, "rewards/accuracies": 1.0, "rewards/chosen": 3.387860059738159, "rewards/margins": 8.505661964416504, "rewards/rejected": -5.117802143096924, "step": 691 }, { "epoch": 2.0904806786050893, "grad_norm": 4.823843002319336, "learning_rate": 7.91238670694864e-07, "logits/chosen": -2.5544567108154297, "logits/rejected": -2.5422401428222656, "logps/chosen": -26.082660675048828, "logps/rejected": -132.09796142578125, "loss": 0.2513, "rewards/accuracies": 1.0, "rewards/chosen": 2.9466400146484375, "rewards/margins": 8.089935302734375, "rewards/rejected": -5.143294334411621, "step": 692 }, { "epoch": 2.093496701225259, "grad_norm": 4.834394931793213, "learning_rate": 7.909365558912386e-07, "logits/chosen": -2.6219451427459717, "logits/rejected": -2.5011684894561768, "logps/chosen": -31.99724769592285, "logps/rejected": -125.62249755859375, "loss": 0.3388, "rewards/accuracies": 0.96875, "rewards/chosen": 2.566006660461426, "rewards/margins": 7.427153587341309, "rewards/rejected": -4.861146926879883, "step": 693 }, { "epoch": 2.0965127238454286, "grad_norm": 4.341009140014648, "learning_rate": 7.906344410876133e-07, "logits/chosen": -2.6671202182769775, "logits/rejected": -2.5647101402282715, "logps/chosen": -32.6768684387207, "logps/rejected": -139.608642578125, "loss": 0.2533, "rewards/accuracies": 1.0, "rewards/chosen": 2.2635345458984375, "rewards/margins": 7.682836055755615, "rewards/rejected": -5.419302463531494, "step": 694 }, { "epoch": 2.0995287464655985, "grad_norm": 5.1156721115112305, "learning_rate": 7.903323262839878e-07, "logits/chosen": -2.575071334838867, "logits/rejected": -2.478318452835083, "logps/chosen": -30.900949478149414, "logps/rejected": -141.44508361816406, "loss": 0.2715, "rewards/accuracies": 0.96875, "rewards/chosen": 2.695077657699585, "rewards/margins": 8.272974967956543, "rewards/rejected": -5.577897071838379, "step": 695 }, { "epoch": 2.102544769085768, "grad_norm": 4.544837951660156, "learning_rate": 7.900302114803625e-07, "logits/chosen": -2.5759408473968506, "logits/rejected": -2.505685329437256, "logps/chosen": -26.82032585144043, "logps/rejected": -128.47735595703125, "loss": 0.2988, "rewards/accuracies": 1.0, "rewards/chosen": 2.643441915512085, "rewards/margins": 7.254195213317871, "rewards/rejected": -4.610753059387207, "step": 696 }, { "epoch": 2.105560791705938, "grad_norm": 8.269925117492676, "learning_rate": 7.897280966767371e-07, "logits/chosen": -2.675978183746338, "logits/rejected": -2.573794364929199, "logps/chosen": -40.572879791259766, "logps/rejected": -118.01476287841797, "loss": 0.5001, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7170411348342896, "rewards/margins": 5.315271854400635, "rewards/rejected": -3.5982303619384766, "step": 697 }, { "epoch": 2.108576814326107, "grad_norm": 4.017389297485352, "learning_rate": 7.894259818731117e-07, "logits/chosen": -2.603283643722534, "logits/rejected": -2.59233021736145, "logps/chosen": -27.444114685058594, "logps/rejected": -141.2212371826172, "loss": 0.1939, "rewards/accuracies": 0.96875, "rewards/chosen": 2.8114845752716064, "rewards/margins": 8.684345245361328, "rewards/rejected": -5.872860908508301, "step": 698 }, { "epoch": 2.111592836946277, "grad_norm": 3.4700570106506348, "learning_rate": 7.891238670694864e-07, "logits/chosen": -2.627171516418457, "logits/rejected": -2.5366415977478027, "logps/chosen": -27.1461181640625, "logps/rejected": -129.55999755859375, "loss": 0.2301, "rewards/accuracies": 0.96875, "rewards/chosen": 2.740997552871704, "rewards/margins": 7.925649642944336, "rewards/rejected": -5.184651851654053, "step": 699 }, { "epoch": 2.114608859566447, "grad_norm": 5.477234363555908, "learning_rate": 7.88821752265861e-07, "logits/chosen": -2.57466721534729, "logits/rejected": -2.4900827407836914, "logps/chosen": -22.148174285888672, "logps/rejected": -135.19796752929688, "loss": 0.2739, "rewards/accuracies": 1.0, "rewards/chosen": 3.3973262310028076, "rewards/margins": 8.235843658447266, "rewards/rejected": -4.838517189025879, "step": 700 }, { "epoch": 2.1176248821866164, "grad_norm": 4.195428371429443, "learning_rate": 7.885196374622356e-07, "logits/chosen": -2.540585517883301, "logits/rejected": -2.450662851333618, "logps/chosen": -40.02479553222656, "logps/rejected": -137.95098876953125, "loss": 0.3458, "rewards/accuracies": 0.90625, "rewards/chosen": 1.8635375499725342, "rewards/margins": 7.151567459106445, "rewards/rejected": -5.28803014755249, "step": 701 }, { "epoch": 2.1206409048067862, "grad_norm": 4.943327903747559, "learning_rate": 7.882175226586102e-07, "logits/chosen": -2.5651071071624756, "logits/rejected": -2.500202178955078, "logps/chosen": -31.028480529785156, "logps/rejected": -147.18199157714844, "loss": 0.2962, "rewards/accuracies": 1.0, "rewards/chosen": 2.670618772506714, "rewards/margins": 8.246448516845703, "rewards/rejected": -5.575829982757568, "step": 702 }, { "epoch": 2.1236569274269557, "grad_norm": 4.9720611572265625, "learning_rate": 7.879154078549849e-07, "logits/chosen": -2.5912885665893555, "logits/rejected": -2.580808162689209, "logps/chosen": -39.30220031738281, "logps/rejected": -121.71363830566406, "loss": 0.4442, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6484832763671875, "rewards/margins": 6.187416076660156, "rewards/rejected": -4.538932800292969, "step": 703 }, { "epoch": 2.1266729500471255, "grad_norm": 5.722783088684082, "learning_rate": 7.876132930513594e-07, "logits/chosen": -2.531393051147461, "logits/rejected": -2.47128963470459, "logps/chosen": -42.65108871459961, "logps/rejected": -139.83213806152344, "loss": 0.3635, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6199971437454224, "rewards/margins": 7.0866289138793945, "rewards/rejected": -5.466631889343262, "step": 704 }, { "epoch": 2.129688972667295, "grad_norm": 4.041297435760498, "learning_rate": 7.873111782477341e-07, "logits/chosen": -2.5407156944274902, "logits/rejected": -2.517408609390259, "logps/chosen": -34.08020782470703, "logps/rejected": -130.99530029296875, "loss": 0.3081, "rewards/accuracies": 1.0, "rewards/chosen": 2.359210729598999, "rewards/margins": 7.387751579284668, "rewards/rejected": -5.028541088104248, "step": 705 }, { "epoch": 2.132704995287465, "grad_norm": 4.067626953125, "learning_rate": 7.870090634441088e-07, "logits/chosen": -2.638247013092041, "logits/rejected": -2.5304884910583496, "logps/chosen": -31.883045196533203, "logps/rejected": -139.71212768554688, "loss": 0.2436, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3397960662841797, "rewards/margins": 8.081787109375, "rewards/rejected": -5.741991996765137, "step": 706 }, { "epoch": 2.1357210179076342, "grad_norm": 3.2741098403930664, "learning_rate": 7.867069486404834e-07, "logits/chosen": -2.541262149810791, "logits/rejected": -2.552002191543579, "logps/chosen": -35.167930603027344, "logps/rejected": -129.70794677734375, "loss": 0.3701, "rewards/accuracies": 1.0, "rewards/chosen": 2.233910322189331, "rewards/margins": 7.219415664672852, "rewards/rejected": -4.985505104064941, "step": 707 }, { "epoch": 2.138737040527804, "grad_norm": 6.7039666175842285, "learning_rate": 7.864048338368579e-07, "logits/chosen": -2.5603675842285156, "logits/rejected": -2.5283243656158447, "logps/chosen": -41.5500602722168, "logps/rejected": -147.24755859375, "loss": 0.325, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4485605955123901, "rewards/margins": 7.515326499938965, "rewards/rejected": -6.066764831542969, "step": 708 }, { "epoch": 2.1417530631479735, "grad_norm": 4.917281150817871, "learning_rate": 7.861027190332326e-07, "logits/chosen": -2.616973876953125, "logits/rejected": -2.5795834064483643, "logps/chosen": -36.80385971069336, "logps/rejected": -138.8020782470703, "loss": 0.2921, "rewards/accuracies": 0.96875, "rewards/chosen": 2.178736686706543, "rewards/margins": 7.977259159088135, "rewards/rejected": -5.798521995544434, "step": 709 }, { "epoch": 2.1447690857681434, "grad_norm": 4.695956707000732, "learning_rate": 7.858006042296073e-07, "logits/chosen": -2.542243719100952, "logits/rejected": -2.4855728149414062, "logps/chosen": -27.016681671142578, "logps/rejected": -120.74553680419922, "loss": 0.2947, "rewards/accuracies": 1.0, "rewards/chosen": 2.856307029724121, "rewards/margins": 7.2723188400268555, "rewards/rejected": -4.416012287139893, "step": 710 }, { "epoch": 2.147785108388313, "grad_norm": 4.813088417053223, "learning_rate": 7.854984894259818e-07, "logits/chosen": -2.503382444381714, "logits/rejected": -2.541269063949585, "logps/chosen": -31.14584732055664, "logps/rejected": -139.57168579101562, "loss": 0.2833, "rewards/accuracies": 0.96875, "rewards/chosen": 2.059804677963257, "rewards/margins": 7.894047260284424, "rewards/rejected": -5.834242820739746, "step": 711 }, { "epoch": 2.1508011310084827, "grad_norm": 4.467756748199463, "learning_rate": 7.851963746223564e-07, "logits/chosen": -2.6077284812927246, "logits/rejected": -2.5347282886505127, "logps/chosen": -24.610633850097656, "logps/rejected": -147.9739227294922, "loss": 0.2496, "rewards/accuracies": 0.96875, "rewards/chosen": 3.0635247230529785, "rewards/margins": 9.296182632446289, "rewards/rejected": -6.232656955718994, "step": 712 }, { "epoch": 2.153817153628652, "grad_norm": 8.537026405334473, "learning_rate": 7.848942598187311e-07, "logits/chosen": -2.604848623275757, "logits/rejected": -2.4875309467315674, "logps/chosen": -39.19342803955078, "logps/rejected": -130.45687866210938, "loss": 0.3323, "rewards/accuracies": 0.96875, "rewards/chosen": 2.142069101333618, "rewards/margins": 6.864507675170898, "rewards/rejected": -4.722438812255859, "step": 713 }, { "epoch": 2.156833176248822, "grad_norm": 3.2182390689849854, "learning_rate": 7.845921450151057e-07, "logits/chosen": -2.570648193359375, "logits/rejected": -2.520806312561035, "logps/chosen": -25.6314640045166, "logps/rejected": -144.30453491210938, "loss": 0.2874, "rewards/accuracies": 1.0, "rewards/chosen": 2.716468334197998, "rewards/margins": 8.431318283081055, "rewards/rejected": -5.714849948883057, "step": 714 }, { "epoch": 2.1598491988689914, "grad_norm": 4.123035430908203, "learning_rate": 7.842900302114803e-07, "logits/chosen": -2.5752193927764893, "logits/rejected": -2.588292121887207, "logps/chosen": -25.87221908569336, "logps/rejected": -138.36318969726562, "loss": 0.2619, "rewards/accuracies": 0.96875, "rewards/chosen": 2.6301350593566895, "rewards/margins": 8.01308536529541, "rewards/rejected": -5.382950782775879, "step": 715 }, { "epoch": 2.1628652214891613, "grad_norm": 4.447164535522461, "learning_rate": 7.83987915407855e-07, "logits/chosen": -2.525400161743164, "logits/rejected": -2.5580546855926514, "logps/chosen": -31.012889862060547, "logps/rejected": -120.8952407836914, "loss": 0.3399, "rewards/accuracies": 0.96875, "rewards/chosen": 2.42549729347229, "rewards/margins": 6.582612991333008, "rewards/rejected": -4.157115459442139, "step": 716 }, { "epoch": 2.1658812441093307, "grad_norm": 8.957637786865234, "learning_rate": 7.836858006042296e-07, "logits/chosen": -2.61203932762146, "logits/rejected": -2.555516004562378, "logps/chosen": -36.458866119384766, "logps/rejected": -140.94601440429688, "loss": 0.3974, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6636452674865723, "rewards/margins": 7.494684219360352, "rewards/rejected": -5.831038475036621, "step": 717 }, { "epoch": 2.1688972667295006, "grad_norm": 4.323126792907715, "learning_rate": 7.833836858006042e-07, "logits/chosen": -2.5364720821380615, "logits/rejected": -2.53324818611145, "logps/chosen": -34.09047317504883, "logps/rejected": -136.21771240234375, "loss": 0.3172, "rewards/accuracies": 1.0, "rewards/chosen": 2.5626649856567383, "rewards/margins": 7.690735816955566, "rewards/rejected": -5.128071308135986, "step": 718 }, { "epoch": 2.17191328934967, "grad_norm": 5.505923271179199, "learning_rate": 7.830815709969788e-07, "logits/chosen": -2.6648669242858887, "logits/rejected": -2.5221564769744873, "logps/chosen": -37.119598388671875, "logps/rejected": -144.5633544921875, "loss": 0.3403, "rewards/accuracies": 1.0, "rewards/chosen": 1.8996591567993164, "rewards/margins": 7.693971157073975, "rewards/rejected": -5.794313430786133, "step": 719 }, { "epoch": 2.17492931196984, "grad_norm": 4.230947494506836, "learning_rate": 7.827794561933534e-07, "logits/chosen": -2.5977838039398193, "logits/rejected": -2.470259666442871, "logps/chosen": -29.0611572265625, "logps/rejected": -135.64027404785156, "loss": 0.3001, "rewards/accuracies": 1.0, "rewards/chosen": 2.826516628265381, "rewards/margins": 8.023014068603516, "rewards/rejected": -5.196497440338135, "step": 720 }, { "epoch": 2.1779453345900093, "grad_norm": 6.7707109451293945, "learning_rate": 7.824773413897281e-07, "logits/chosen": -2.589113712310791, "logits/rejected": -2.4817957878112793, "logps/chosen": -27.040042877197266, "logps/rejected": -126.8250732421875, "loss": 0.2738, "rewards/accuracies": 1.0, "rewards/chosen": 2.6810193061828613, "rewards/margins": 7.672147274017334, "rewards/rejected": -4.991128444671631, "step": 721 }, { "epoch": 2.180961357210179, "grad_norm": 4.719963073730469, "learning_rate": 7.821752265861027e-07, "logits/chosen": -2.655212163925171, "logits/rejected": -2.538633346557617, "logps/chosen": -31.322742462158203, "logps/rejected": -130.96449279785156, "loss": 0.2964, "rewards/accuracies": 1.0, "rewards/chosen": 2.313162088394165, "rewards/margins": 7.131417274475098, "rewards/rejected": -4.818255424499512, "step": 722 }, { "epoch": 2.1839773798303486, "grad_norm": 8.01333999633789, "learning_rate": 7.818731117824772e-07, "logits/chosen": -2.647284984588623, "logits/rejected": -2.569782257080078, "logps/chosen": -27.650890350341797, "logps/rejected": -117.11687469482422, "loss": 0.3784, "rewards/accuracies": 0.96875, "rewards/chosen": 2.776674270629883, "rewards/margins": 6.688091278076172, "rewards/rejected": -3.9114174842834473, "step": 723 }, { "epoch": 2.1869934024505184, "grad_norm": 3.9335319995880127, "learning_rate": 7.81570996978852e-07, "logits/chosen": -2.6574573516845703, "logits/rejected": -2.5450429916381836, "logps/chosen": -35.080684661865234, "logps/rejected": -137.64801025390625, "loss": 0.322, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2230188846588135, "rewards/margins": 7.898687362670898, "rewards/rejected": -5.675668239593506, "step": 724 }, { "epoch": 2.190009425070688, "grad_norm": 5.408119201660156, "learning_rate": 7.812688821752266e-07, "logits/chosen": -2.6481878757476807, "logits/rejected": -2.519139051437378, "logps/chosen": -45.60894775390625, "logps/rejected": -148.83949279785156, "loss": 0.3657, "rewards/accuracies": 1.0, "rewards/chosen": 1.3497437238693237, "rewards/margins": 7.031662940979004, "rewards/rejected": -5.681918621063232, "step": 725 }, { "epoch": 2.1930254476908577, "grad_norm": 3.981158971786499, "learning_rate": 7.809667673716011e-07, "logits/chosen": -2.5309698581695557, "logits/rejected": -2.4524989128112793, "logps/chosen": -31.77411460876465, "logps/rejected": -141.91856384277344, "loss": 0.2823, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3191635608673096, "rewards/margins": 7.9138407707214355, "rewards/rejected": -5.594677448272705, "step": 726 }, { "epoch": 2.196041470311027, "grad_norm": 4.616644382476807, "learning_rate": 7.806646525679757e-07, "logits/chosen": -2.583256721496582, "logits/rejected": -2.508214235305786, "logps/chosen": -37.052833557128906, "logps/rejected": -135.653564453125, "loss": 0.3364, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2571613788604736, "rewards/margins": 7.179500102996826, "rewards/rejected": -4.922338962554932, "step": 727 }, { "epoch": 2.199057492931197, "grad_norm": 7.332799434661865, "learning_rate": 7.803625377643505e-07, "logits/chosen": -2.503890037536621, "logits/rejected": -2.454500675201416, "logps/chosen": -24.20162010192871, "logps/rejected": -120.21561431884766, "loss": 0.3517, "rewards/accuracies": 1.0, "rewards/chosen": 2.970181941986084, "rewards/margins": 7.031965255737305, "rewards/rejected": -4.0617828369140625, "step": 728 }, { "epoch": 2.2020735155513664, "grad_norm": 4.673975467681885, "learning_rate": 7.80060422960725e-07, "logits/chosen": -2.506953716278076, "logits/rejected": -2.4679059982299805, "logps/chosen": -25.85749626159668, "logps/rejected": -119.0997085571289, "loss": 0.3999, "rewards/accuracies": 0.96875, "rewards/chosen": 3.0036659240722656, "rewards/margins": 7.248205184936523, "rewards/rejected": -4.244539260864258, "step": 729 }, { "epoch": 2.2050895381715363, "grad_norm": 5.891208171844482, "learning_rate": 7.797583081570996e-07, "logits/chosen": -2.6435353755950928, "logits/rejected": -2.5474343299865723, "logps/chosen": -27.356142044067383, "logps/rejected": -130.7992706298828, "loss": 0.3274, "rewards/accuracies": 1.0, "rewards/chosen": 2.8302178382873535, "rewards/margins": 7.7438836097717285, "rewards/rejected": -4.913665771484375, "step": 730 }, { "epoch": 2.208105560791706, "grad_norm": 3.281843900680542, "learning_rate": 7.794561933534742e-07, "logits/chosen": -2.5922257900238037, "logits/rejected": -2.5148794651031494, "logps/chosen": -32.80610275268555, "logps/rejected": -154.64801025390625, "loss": 0.2316, "rewards/accuracies": 0.96875, "rewards/chosen": 2.54756236076355, "rewards/margins": 8.776795387268066, "rewards/rejected": -6.229231834411621, "step": 731 }, { "epoch": 2.2111215834118756, "grad_norm": 4.631168365478516, "learning_rate": 7.79154078549849e-07, "logits/chosen": -2.667518377304077, "logits/rejected": -2.552748680114746, "logps/chosen": -30.500795364379883, "logps/rejected": -151.64654541015625, "loss": 0.2898, "rewards/accuracies": 1.0, "rewards/chosen": 2.1575140953063965, "rewards/margins": 8.184266090393066, "rewards/rejected": -6.02675199508667, "step": 732 }, { "epoch": 2.214137606032045, "grad_norm": 5.455662727355957, "learning_rate": 7.788519637462235e-07, "logits/chosen": -2.5338621139526367, "logits/rejected": -2.433610200881958, "logps/chosen": -25.03306007385254, "logps/rejected": -111.54065704345703, "loss": 0.3143, "rewards/accuracies": 1.0, "rewards/chosen": 3.0753347873687744, "rewards/margins": 6.841955184936523, "rewards/rejected": -3.766619920730591, "step": 733 }, { "epoch": 2.217153628652215, "grad_norm": 4.015118598937988, "learning_rate": 7.785498489425981e-07, "logits/chosen": -2.575103759765625, "logits/rejected": -2.541630744934082, "logps/chosen": -24.87664222717285, "logps/rejected": -136.60191345214844, "loss": 0.1698, "rewards/accuracies": 1.0, "rewards/chosen": 3.075737953186035, "rewards/margins": 8.509889602661133, "rewards/rejected": -5.434150695800781, "step": 734 }, { "epoch": 2.2201696512723847, "grad_norm": 3.5320518016815186, "learning_rate": 7.782477341389729e-07, "logits/chosen": -2.5340259075164795, "logits/rejected": -2.483365058898926, "logps/chosen": -32.31951904296875, "logps/rejected": -132.88967895507812, "loss": 0.3644, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3806538581848145, "rewards/margins": 7.3762078285217285, "rewards/rejected": -4.995553493499756, "step": 735 }, { "epoch": 2.223185673892554, "grad_norm": 4.154507637023926, "learning_rate": 7.779456193353474e-07, "logits/chosen": -2.554654836654663, "logits/rejected": -2.463589906692505, "logps/chosen": -24.83932113647461, "logps/rejected": -137.49244689941406, "loss": 0.2159, "rewards/accuracies": 1.0, "rewards/chosen": 2.838921546936035, "rewards/margins": 8.345649719238281, "rewards/rejected": -5.506728649139404, "step": 736 }, { "epoch": 2.226201696512724, "grad_norm": 4.188664436340332, "learning_rate": 7.77643504531722e-07, "logits/chosen": -2.618382453918457, "logits/rejected": -2.466057538986206, "logps/chosen": -24.50214958190918, "logps/rejected": -150.55755615234375, "loss": 0.2212, "rewards/accuracies": 1.0, "rewards/chosen": 3.1397829055786133, "rewards/margins": 9.10837459564209, "rewards/rejected": -5.968591213226318, "step": 737 }, { "epoch": 2.226201696512724, "eval_logits/chosen": -2.510319709777832, "eval_logits/rejected": -2.4634387493133545, "eval_logps/chosen": -35.681758880615234, "eval_logps/rejected": -140.94961547851562, "eval_loss": 0.2977093756198883, "eval_rewards/accuracies": 0.9664948582649231, "eval_rewards/chosen": 2.3722927570343018, "eval_rewards/margins": 7.840090274810791, "eval_rewards/rejected": -5.467798709869385, "eval_runtime": 699.8076, "eval_samples_per_second": 0.553, "eval_steps_per_second": 0.277, "step": 737 }, { "epoch": 2.2292177191328935, "grad_norm": 6.116767406463623, "learning_rate": 7.773413897280966e-07, "logits/chosen": -2.5256757736206055, "logits/rejected": -2.448972225189209, "logps/chosen": -33.99687194824219, "logps/rejected": -149.57073974609375, "loss": 0.2793, "rewards/accuracies": 1.0, "rewards/chosen": 2.0669233798980713, "rewards/margins": 8.34630012512207, "rewards/rejected": -6.279376029968262, "step": 738 }, { "epoch": 2.2322337417530633, "grad_norm": 4.603915691375732, "learning_rate": 7.770392749244713e-07, "logits/chosen": -2.6567466259002686, "logits/rejected": -2.550412654876709, "logps/chosen": -26.234806060791016, "logps/rejected": -132.92030334472656, "loss": 0.32, "rewards/accuracies": 0.96875, "rewards/chosen": 2.8778371810913086, "rewards/margins": 7.91096305847168, "rewards/rejected": -5.033125400543213, "step": 739 }, { "epoch": 2.2352497643732328, "grad_norm": 4.468626499176025, "learning_rate": 7.767371601208459e-07, "logits/chosen": -2.561432123184204, "logits/rejected": -2.5101075172424316, "logps/chosen": -27.36064910888672, "logps/rejected": -130.09524536132812, "loss": 0.2865, "rewards/accuracies": 1.0, "rewards/chosen": 2.7951579093933105, "rewards/margins": 7.962505340576172, "rewards/rejected": -5.167347431182861, "step": 740 }, { "epoch": 2.2382657869934026, "grad_norm": 5.469913959503174, "learning_rate": 7.764350453172205e-07, "logits/chosen": -2.5308797359466553, "logits/rejected": -2.4525532722473145, "logps/chosen": -39.289466857910156, "logps/rejected": -138.35263061523438, "loss": 0.4374, "rewards/accuracies": 1.0, "rewards/chosen": 1.957666039466858, "rewards/margins": 7.156403541564941, "rewards/rejected": -5.198737144470215, "step": 741 }, { "epoch": 2.241281809613572, "grad_norm": 6.370260238647461, "learning_rate": 7.761329305135951e-07, "logits/chosen": -2.569800615310669, "logits/rejected": -2.472764730453491, "logps/chosen": -30.517070770263672, "logps/rejected": -121.07527160644531, "loss": 0.3726, "rewards/accuracies": 0.9375, "rewards/chosen": 2.6465137004852295, "rewards/margins": 6.881099224090576, "rewards/rejected": -4.234585285186768, "step": 742 }, { "epoch": 2.244297832233742, "grad_norm": 3.197244644165039, "learning_rate": 7.758308157099698e-07, "logits/chosen": -2.566879987716675, "logits/rejected": -2.4769816398620605, "logps/chosen": -35.089210510253906, "logps/rejected": -138.43399047851562, "loss": 0.3309, "rewards/accuracies": 1.0, "rewards/chosen": 2.216611623764038, "rewards/margins": 7.726300239562988, "rewards/rejected": -5.509688377380371, "step": 743 }, { "epoch": 2.2473138548539113, "grad_norm": 4.413608551025391, "learning_rate": 7.755287009063444e-07, "logits/chosen": -2.5677711963653564, "logits/rejected": -2.4967732429504395, "logps/chosen": -35.57360076904297, "logps/rejected": -137.25831604003906, "loss": 0.3532, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9908114671707153, "rewards/margins": 7.17632532119751, "rewards/rejected": -5.185514450073242, "step": 744 }, { "epoch": 2.250329877474081, "grad_norm": 5.48082160949707, "learning_rate": 7.752265861027189e-07, "logits/chosen": -2.4882633686065674, "logits/rejected": -2.4387807846069336, "logps/chosen": -25.485549926757812, "logps/rejected": -118.41412353515625, "loss": 0.2862, "rewards/accuracies": 1.0, "rewards/chosen": 2.8542771339416504, "rewards/margins": 7.169381141662598, "rewards/rejected": -4.315104007720947, "step": 745 }, { "epoch": 2.2533459000942506, "grad_norm": 4.063022136688232, "learning_rate": 7.749244712990937e-07, "logits/chosen": -2.6131348609924316, "logits/rejected": -2.5308690071105957, "logps/chosen": -44.82879638671875, "logps/rejected": -154.15548706054688, "loss": 0.3369, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4135740995407104, "rewards/margins": 8.000736236572266, "rewards/rejected": -6.587162494659424, "step": 746 }, { "epoch": 2.2563619227144205, "grad_norm": 4.692275524139404, "learning_rate": 7.746223564954683e-07, "logits/chosen": -2.611088991165161, "logits/rejected": -2.507845640182495, "logps/chosen": -31.589628219604492, "logps/rejected": -147.53273010253906, "loss": 0.2125, "rewards/accuracies": 0.9375, "rewards/chosen": 2.6785295009613037, "rewards/margins": 8.703120231628418, "rewards/rejected": -6.024591445922852, "step": 747 }, { "epoch": 2.25937794533459, "grad_norm": 5.498006820678711, "learning_rate": 7.743202416918428e-07, "logits/chosen": -2.572216033935547, "logits/rejected": -2.4563510417938232, "logps/chosen": -48.364688873291016, "logps/rejected": -145.14320373535156, "loss": 0.4405, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9191293716430664, "rewards/margins": 7.093910217285156, "rewards/rejected": -6.174781322479248, "step": 748 }, { "epoch": 2.2623939679547598, "grad_norm": 5.7051262855529785, "learning_rate": 7.740181268882174e-07, "logits/chosen": -2.4318110942840576, "logits/rejected": -2.4150917530059814, "logps/chosen": -32.310760498046875, "logps/rejected": -142.35133361816406, "loss": 0.2797, "rewards/accuracies": 1.0, "rewards/chosen": 2.2488901615142822, "rewards/margins": 8.18148136138916, "rewards/rejected": -5.932591915130615, "step": 749 }, { "epoch": 2.265409990574929, "grad_norm": 7.37419319152832, "learning_rate": 7.737160120845922e-07, "logits/chosen": -2.5572123527526855, "logits/rejected": -2.568345546722412, "logps/chosen": -32.793434143066406, "logps/rejected": -126.65853881835938, "loss": 0.3212, "rewards/accuracies": 1.0, "rewards/chosen": 2.452143430709839, "rewards/margins": 7.324245452880859, "rewards/rejected": -4.872101783752441, "step": 750 }, { "epoch": 2.268426013195099, "grad_norm": 5.853213787078857, "learning_rate": 7.734138972809667e-07, "logits/chosen": -2.5976920127868652, "logits/rejected": -2.5171055793762207, "logps/chosen": -38.715415954589844, "logps/rejected": -154.0245819091797, "loss": 0.3013, "rewards/accuracies": 1.0, "rewards/chosen": 2.124885320663452, "rewards/margins": 8.64108657836914, "rewards/rejected": -6.516201019287109, "step": 751 }, { "epoch": 2.2714420358152685, "grad_norm": 4.684994220733643, "learning_rate": 7.731117824773413e-07, "logits/chosen": -2.592972755432129, "logits/rejected": -2.416616916656494, "logps/chosen": -38.87663650512695, "logps/rejected": -142.52362060546875, "loss": 0.3544, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9962743520736694, "rewards/margins": 7.461996078491211, "rewards/rejected": -5.465721130371094, "step": 752 }, { "epoch": 2.2744580584354384, "grad_norm": 5.459298133850098, "learning_rate": 7.72809667673716e-07, "logits/chosen": -2.50494122505188, "logits/rejected": -2.4564125537872314, "logps/chosen": -42.8108024597168, "logps/rejected": -147.10903930664062, "loss": 0.3766, "rewards/accuracies": 1.0, "rewards/chosen": 1.5974472761154175, "rewards/margins": 7.774496078491211, "rewards/rejected": -6.177048683166504, "step": 753 }, { "epoch": 2.277474081055608, "grad_norm": 4.74909782409668, "learning_rate": 7.725075528700907e-07, "logits/chosen": -2.4958176612854004, "logits/rejected": -2.4329631328582764, "logps/chosen": -25.45639991760254, "logps/rejected": -131.81448364257812, "loss": 0.2731, "rewards/accuracies": 1.0, "rewards/chosen": 3.0998809337615967, "rewards/margins": 7.881911277770996, "rewards/rejected": -4.7820305824279785, "step": 754 }, { "epoch": 2.2804901036757776, "grad_norm": 5.534731388092041, "learning_rate": 7.722054380664652e-07, "logits/chosen": -2.584073543548584, "logits/rejected": -2.53184175491333, "logps/chosen": -32.32643127441406, "logps/rejected": -141.575927734375, "loss": 0.2976, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5003256797790527, "rewards/margins": 8.422224044799805, "rewards/rejected": -5.921897888183594, "step": 755 }, { "epoch": 2.283506126295947, "grad_norm": 4.441895008087158, "learning_rate": 7.719033232628398e-07, "logits/chosen": -2.5669538974761963, "logits/rejected": -2.5511186122894287, "logps/chosen": -25.679012298583984, "logps/rejected": -131.01358032226562, "loss": 0.2851, "rewards/accuracies": 0.96875, "rewards/chosen": 2.9378204345703125, "rewards/margins": 8.116053581237793, "rewards/rejected": -5.1782331466674805, "step": 756 }, { "epoch": 2.286522148916117, "grad_norm": 4.295564651489258, "learning_rate": 7.716012084592145e-07, "logits/chosen": -2.4917874336242676, "logits/rejected": -2.436164617538452, "logps/chosen": -33.25555419921875, "logps/rejected": -128.94309997558594, "loss": 0.3905, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3735504150390625, "rewards/margins": 7.1734724044799805, "rewards/rejected": -4.799922466278076, "step": 757 }, { "epoch": 2.2895381715362864, "grad_norm": 4.019288063049316, "learning_rate": 7.712990936555891e-07, "logits/chosen": -2.6009159088134766, "logits/rejected": -2.497025489807129, "logps/chosen": -20.717443466186523, "logps/rejected": -147.1604461669922, "loss": 0.204, "rewards/accuracies": 0.96875, "rewards/chosen": 3.1866962909698486, "rewards/margins": 9.17694091796875, "rewards/rejected": -5.9902448654174805, "step": 758 }, { "epoch": 2.2925541941564562, "grad_norm": 3.085994243621826, "learning_rate": 7.709969788519637e-07, "logits/chosen": -2.5063021183013916, "logits/rejected": -2.495537281036377, "logps/chosen": -35.18728256225586, "logps/rejected": -149.30072021484375, "loss": 0.2486, "rewards/accuracies": 0.96875, "rewards/chosen": 2.0909669399261475, "rewards/margins": 8.553239822387695, "rewards/rejected": -6.462272644042969, "step": 759 }, { "epoch": 2.2955702167766256, "grad_norm": 5.151237487792969, "learning_rate": 7.706948640483384e-07, "logits/chosen": -2.563656806945801, "logits/rejected": -2.5344278812408447, "logps/chosen": -31.54369354248047, "logps/rejected": -138.9499969482422, "loss": 0.2214, "rewards/accuracies": 0.96875, "rewards/chosen": 2.432377815246582, "rewards/margins": 8.191303253173828, "rewards/rejected": -5.758924961090088, "step": 760 }, { "epoch": 2.2985862393967955, "grad_norm": 3.7236924171447754, "learning_rate": 7.70392749244713e-07, "logits/chosen": -2.535107135772705, "logits/rejected": -2.489755630493164, "logps/chosen": -26.465072631835938, "logps/rejected": -132.68414306640625, "loss": 0.2622, "rewards/accuracies": 1.0, "rewards/chosen": 2.8929738998413086, "rewards/margins": 8.046891212463379, "rewards/rejected": -5.15391731262207, "step": 761 }, { "epoch": 2.301602262016965, "grad_norm": 2.418396234512329, "learning_rate": 7.700906344410876e-07, "logits/chosen": -2.6036534309387207, "logits/rejected": -2.5088438987731934, "logps/chosen": -36.15151596069336, "logps/rejected": -145.50035095214844, "loss": 0.3117, "rewards/accuracies": 1.0, "rewards/chosen": 2.1318771839141846, "rewards/margins": 8.138557434082031, "rewards/rejected": -6.006680011749268, "step": 762 }, { "epoch": 2.304618284637135, "grad_norm": 5.492551326751709, "learning_rate": 7.697885196374622e-07, "logits/chosen": -2.5917510986328125, "logits/rejected": -2.54917049407959, "logps/chosen": -32.0612678527832, "logps/rejected": -125.93133544921875, "loss": 0.3187, "rewards/accuracies": 0.96875, "rewards/chosen": 2.5611021518707275, "rewards/margins": 7.166807174682617, "rewards/rejected": -4.605705261230469, "step": 763 }, { "epoch": 2.3076343072573042, "grad_norm": 6.077974796295166, "learning_rate": 7.694864048338368e-07, "logits/chosen": -2.514061212539673, "logits/rejected": -2.5046513080596924, "logps/chosen": -41.76094055175781, "logps/rejected": -151.15419006347656, "loss": 0.4136, "rewards/accuracies": 1.0, "rewards/chosen": 1.6531052589416504, "rewards/margins": 7.791615962982178, "rewards/rejected": -6.138510704040527, "step": 764 }, { "epoch": 2.310650329877474, "grad_norm": 6.469464302062988, "learning_rate": 7.691842900302115e-07, "logits/chosen": -2.5936710834503174, "logits/rejected": -2.511016845703125, "logps/chosen": -30.555089950561523, "logps/rejected": -131.954833984375, "loss": 0.3033, "rewards/accuracies": 1.0, "rewards/chosen": 2.575287342071533, "rewards/margins": 7.776882648468018, "rewards/rejected": -5.201595306396484, "step": 765 }, { "epoch": 2.313666352497644, "grad_norm": 6.909411907196045, "learning_rate": 7.688821752265861e-07, "logits/chosen": -2.5281240940093994, "logits/rejected": -2.4501171112060547, "logps/chosen": -31.92859649658203, "logps/rejected": -151.29837036132812, "loss": 0.3301, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2174971103668213, "rewards/margins": 8.347847938537598, "rewards/rejected": -6.1303510665893555, "step": 766 }, { "epoch": 2.3166823751178134, "grad_norm": 3.4648005962371826, "learning_rate": 7.685800604229606e-07, "logits/chosen": -2.6228766441345215, "logits/rejected": -2.501138210296631, "logps/chosen": -48.61634063720703, "logps/rejected": -138.3655548095703, "loss": 0.4578, "rewards/accuracies": 0.90625, "rewards/chosen": 1.233239769935608, "rewards/margins": 6.372829437255859, "rewards/rejected": -5.139590263366699, "step": 767 }, { "epoch": 2.319698397737983, "grad_norm": 5.099306583404541, "learning_rate": 7.682779456193353e-07, "logits/chosen": -2.583498001098633, "logits/rejected": -2.4961025714874268, "logps/chosen": -32.38690948486328, "logps/rejected": -133.95791625976562, "loss": 0.2752, "rewards/accuracies": 1.0, "rewards/chosen": 2.6952719688415527, "rewards/margins": 7.806670188903809, "rewards/rejected": -5.111398220062256, "step": 768 }, { "epoch": 2.3227144203581527, "grad_norm": 6.464587688446045, "learning_rate": 7.6797583081571e-07, "logits/chosen": -2.635291576385498, "logits/rejected": -2.5720794200897217, "logps/chosen": -27.328472137451172, "logps/rejected": -127.81161499023438, "loss": 0.2975, "rewards/accuracies": 0.96875, "rewards/chosen": 2.940856695175171, "rewards/margins": 7.442885398864746, "rewards/rejected": -4.50202751159668, "step": 769 }, { "epoch": 2.3257304429783225, "grad_norm": 4.447290420532227, "learning_rate": 7.676737160120845e-07, "logits/chosen": -2.5715644359588623, "logits/rejected": -2.5467748641967773, "logps/chosen": -39.90910720825195, "logps/rejected": -156.99986267089844, "loss": 0.2369, "rewards/accuracies": 0.96875, "rewards/chosen": 2.081202983856201, "rewards/margins": 8.596946716308594, "rewards/rejected": -6.515744209289551, "step": 770 }, { "epoch": 2.328746465598492, "grad_norm": 7.96176815032959, "learning_rate": 7.673716012084592e-07, "logits/chosen": -2.5608389377593994, "logits/rejected": -2.4996178150177, "logps/chosen": -32.937843322753906, "logps/rejected": -137.9285430908203, "loss": 0.3325, "rewards/accuracies": 1.0, "rewards/chosen": 2.5046403408050537, "rewards/margins": 7.718601703643799, "rewards/rejected": -5.213961601257324, "step": 771 }, { "epoch": 2.331762488218662, "grad_norm": 3.4897005558013916, "learning_rate": 7.670694864048338e-07, "logits/chosen": -2.626985549926758, "logits/rejected": -2.552399158477783, "logps/chosen": -30.94525146484375, "logps/rejected": -131.22174072265625, "loss": 0.4003, "rewards/accuracies": 0.9375, "rewards/chosen": 2.2330543994903564, "rewards/margins": 7.235734939575195, "rewards/rejected": -5.002681255340576, "step": 772 }, { "epoch": 2.3347785108388313, "grad_norm": 6.511521816253662, "learning_rate": 7.667673716012084e-07, "logits/chosen": -2.5206823348999023, "logits/rejected": -2.5122666358947754, "logps/chosen": -25.30220603942871, "logps/rejected": -135.39466857910156, "loss": 0.2367, "rewards/accuracies": 0.96875, "rewards/chosen": 2.8469481468200684, "rewards/margins": 8.403213500976562, "rewards/rejected": -5.556265354156494, "step": 773 }, { "epoch": 2.337794533459001, "grad_norm": 5.024219036102295, "learning_rate": 7.66465256797583e-07, "logits/chosen": -2.518627166748047, "logits/rejected": -2.463085651397705, "logps/chosen": -33.13779067993164, "logps/rejected": -124.7509765625, "loss": 0.3542, "rewards/accuracies": 0.96875, "rewards/chosen": 2.4177772998809814, "rewards/margins": 6.863794803619385, "rewards/rejected": -4.446017265319824, "step": 774 }, { "epoch": 2.3408105560791705, "grad_norm": 4.9951581954956055, "learning_rate": 7.661631419939577e-07, "logits/chosen": -2.412461280822754, "logits/rejected": -2.449836254119873, "logps/chosen": -31.54458236694336, "logps/rejected": -136.64340209960938, "loss": 0.2806, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3581931591033936, "rewards/margins": 7.6610260009765625, "rewards/rejected": -5.30283260345459, "step": 775 }, { "epoch": 2.3438265786993404, "grad_norm": 5.875415325164795, "learning_rate": 7.658610271903323e-07, "logits/chosen": -2.588286876678467, "logits/rejected": -2.521735191345215, "logps/chosen": -30.53354835510254, "logps/rejected": -136.11720275878906, "loss": 0.2958, "rewards/accuracies": 1.0, "rewards/chosen": 2.8208515644073486, "rewards/margins": 8.312536239624023, "rewards/rejected": -5.491684913635254, "step": 776 }, { "epoch": 2.34684260131951, "grad_norm": 4.636803150177002, "learning_rate": 7.655589123867069e-07, "logits/chosen": -2.6437692642211914, "logits/rejected": -2.5929605960845947, "logps/chosen": -33.31659698486328, "logps/rejected": -133.69839477539062, "loss": 0.3598, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3340506553649902, "rewards/margins": 7.5439324378967285, "rewards/rejected": -5.2098822593688965, "step": 777 }, { "epoch": 2.3498586239396797, "grad_norm": 8.843103408813477, "learning_rate": 7.652567975830815e-07, "logits/chosen": -2.423161506652832, "logits/rejected": -2.43424654006958, "logps/chosen": -33.69511032104492, "logps/rejected": -134.23464965820312, "loss": 0.3658, "rewards/accuracies": 0.96875, "rewards/chosen": 2.359110116958618, "rewards/margins": 7.539552211761475, "rewards/rejected": -5.180441856384277, "step": 778 }, { "epoch": 2.352874646559849, "grad_norm": 5.832704067230225, "learning_rate": 7.649546827794562e-07, "logits/chosen": -2.6190714836120605, "logits/rejected": -2.526205539703369, "logps/chosen": -31.881933212280273, "logps/rejected": -155.72068786621094, "loss": 0.2694, "rewards/accuracies": 1.0, "rewards/chosen": 2.3556325435638428, "rewards/margins": 9.123089790344238, "rewards/rejected": -6.767457485198975, "step": 779 }, { "epoch": 2.355890669180019, "grad_norm": 4.011137008666992, "learning_rate": 7.646525679758308e-07, "logits/chosen": -2.5777652263641357, "logits/rejected": -2.512984275817871, "logps/chosen": -37.23835372924805, "logps/rejected": -134.64413452148438, "loss": 0.3723, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9431743621826172, "rewards/margins": 7.3621320724487305, "rewards/rejected": -5.418957233428955, "step": 780 }, { "epoch": 2.3589066918001884, "grad_norm": 6.727613925933838, "learning_rate": 7.643504531722054e-07, "logits/chosen": -2.5573737621307373, "logits/rejected": -2.4020273685455322, "logps/chosen": -34.72820281982422, "logps/rejected": -163.0209503173828, "loss": 0.3349, "rewards/accuracies": 1.0, "rewards/chosen": 1.8008733987808228, "rewards/margins": 9.028814315795898, "rewards/rejected": -7.227941036224365, "step": 781 }, { "epoch": 2.3619227144203583, "grad_norm": 6.731988430023193, "learning_rate": 7.640483383685801e-07, "logits/chosen": -2.5292556285858154, "logits/rejected": -2.5170958042144775, "logps/chosen": -26.497438430786133, "logps/rejected": -130.93067932128906, "loss": 0.2706, "rewards/accuracies": 1.0, "rewards/chosen": 2.7481913566589355, "rewards/margins": 7.678438663482666, "rewards/rejected": -4.930246829986572, "step": 782 }, { "epoch": 2.3649387370405277, "grad_norm": 6.738046646118164, "learning_rate": 7.637462235649546e-07, "logits/chosen": -2.4773566722869873, "logits/rejected": -2.442448854446411, "logps/chosen": -38.454593658447266, "logps/rejected": -140.57847595214844, "loss": 0.3667, "rewards/accuracies": 1.0, "rewards/chosen": 1.8071329593658447, "rewards/margins": 7.505188465118408, "rewards/rejected": -5.698056221008301, "step": 783 }, { "epoch": 2.3679547596606976, "grad_norm": 6.1454386711120605, "learning_rate": 7.634441087613293e-07, "logits/chosen": -2.532637357711792, "logits/rejected": -2.4634077548980713, "logps/chosen": -28.089340209960938, "logps/rejected": -141.2545623779297, "loss": 0.2599, "rewards/accuracies": 1.0, "rewards/chosen": 2.7714457511901855, "rewards/margins": 8.469285011291504, "rewards/rejected": -5.697839260101318, "step": 784 }, { "epoch": 2.370970782280867, "grad_norm": 3.7959163188934326, "learning_rate": 7.631419939577039e-07, "logits/chosen": -2.5202579498291016, "logits/rejected": -2.439754009246826, "logps/chosen": -39.45886993408203, "logps/rejected": -148.44937133789062, "loss": 0.3337, "rewards/accuracies": 1.0, "rewards/chosen": 1.6833637952804565, "rewards/margins": 8.046822547912598, "rewards/rejected": -6.36345911026001, "step": 785 }, { "epoch": 2.373986804901037, "grad_norm": 7.9997663497924805, "learning_rate": 7.628398791540785e-07, "logits/chosen": -2.5772757530212402, "logits/rejected": -2.5549325942993164, "logps/chosen": -28.58677101135254, "logps/rejected": -122.17756652832031, "loss": 0.3131, "rewards/accuracies": 0.96875, "rewards/chosen": 2.601351499557495, "rewards/margins": 7.319739818572998, "rewards/rejected": -4.718389511108398, "step": 786 }, { "epoch": 2.3770028275212063, "grad_norm": 4.471433639526367, "learning_rate": 7.625377643504531e-07, "logits/chosen": -2.5535449981689453, "logits/rejected": -2.503805637359619, "logps/chosen": -36.214080810546875, "logps/rejected": -124.62782287597656, "loss": 0.3755, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9940986633300781, "rewards/margins": 7.0862321853637695, "rewards/rejected": -5.092133522033691, "step": 787 }, { "epoch": 2.380018850141376, "grad_norm": 6.221755027770996, "learning_rate": 7.622356495468278e-07, "logits/chosen": -2.528292179107666, "logits/rejected": -2.447948694229126, "logps/chosen": -29.246355056762695, "logps/rejected": -136.59634399414062, "loss": 0.2919, "rewards/accuracies": 0.96875, "rewards/chosen": 2.272970676422119, "rewards/margins": 7.96284294128418, "rewards/rejected": -5.689871788024902, "step": 788 }, { "epoch": 2.3830348727615456, "grad_norm": 4.565267562866211, "learning_rate": 7.619335347432024e-07, "logits/chosen": -2.494962215423584, "logits/rejected": -2.5176093578338623, "logps/chosen": -41.067832946777344, "logps/rejected": -143.57843017578125, "loss": 0.323, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9134732484817505, "rewards/margins": 7.217465400695801, "rewards/rejected": -5.303992748260498, "step": 789 }, { "epoch": 2.3860508953817154, "grad_norm": 5.188318252563477, "learning_rate": 7.61631419939577e-07, "logits/chosen": -2.4860353469848633, "logits/rejected": -2.402667999267578, "logps/chosen": -27.27822494506836, "logps/rejected": -133.4794158935547, "loss": 0.3491, "rewards/accuracies": 1.0, "rewards/chosen": 2.6857550144195557, "rewards/margins": 7.592535972595215, "rewards/rejected": -4.906781196594238, "step": 790 }, { "epoch": 2.389066918001885, "grad_norm": 4.240169048309326, "learning_rate": 7.613293051359517e-07, "logits/chosen": -2.537315607070923, "logits/rejected": -2.487727403640747, "logps/chosen": -34.6807861328125, "logps/rejected": -145.5005645751953, "loss": 0.2921, "rewards/accuracies": 0.9375, "rewards/chosen": 2.494469165802002, "rewards/margins": 8.19005298614502, "rewards/rejected": -5.695584297180176, "step": 791 }, { "epoch": 2.3920829406220547, "grad_norm": 4.882316589355469, "learning_rate": 7.610271903323262e-07, "logits/chosen": -2.559800386428833, "logits/rejected": -2.438450813293457, "logps/chosen": -23.943294525146484, "logps/rejected": -137.6796875, "loss": 0.2499, "rewards/accuracies": 0.96875, "rewards/chosen": 3.099241018295288, "rewards/margins": 8.800681114196777, "rewards/rejected": -5.701439380645752, "step": 792 }, { "epoch": 2.395098963242224, "grad_norm": 5.476734161376953, "learning_rate": 7.607250755287009e-07, "logits/chosen": -2.511197566986084, "logits/rejected": -2.45196533203125, "logps/chosen": -39.04254913330078, "logps/rejected": -142.54168701171875, "loss": 0.3685, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9087367057800293, "rewards/margins": 7.5605058670043945, "rewards/rejected": -5.651769161224365, "step": 793 }, { "epoch": 2.398114985862394, "grad_norm": 3.322291135787964, "learning_rate": 7.604229607250755e-07, "logits/chosen": -2.5500943660736084, "logits/rejected": -2.51304292678833, "logps/chosen": -27.567752838134766, "logps/rejected": -148.54385375976562, "loss": 0.2279, "rewards/accuracies": 1.0, "rewards/chosen": 2.8987932205200195, "rewards/margins": 8.857714653015137, "rewards/rejected": -5.958921432495117, "step": 794 }, { "epoch": 2.4011310084825634, "grad_norm": 4.347036838531494, "learning_rate": 7.601208459214501e-07, "logits/chosen": -2.5298800468444824, "logits/rejected": -2.4825305938720703, "logps/chosen": -28.19463539123535, "logps/rejected": -148.685302734375, "loss": 0.2719, "rewards/accuracies": 1.0, "rewards/chosen": 2.4640073776245117, "rewards/margins": 8.746015548706055, "rewards/rejected": -6.282007694244385, "step": 795 }, { "epoch": 2.4041470311027333, "grad_norm": 6.596168041229248, "learning_rate": 7.598187311178247e-07, "logits/chosen": -2.5730161666870117, "logits/rejected": -2.5088136196136475, "logps/chosen": -39.1646842956543, "logps/rejected": -139.54989624023438, "loss": 0.3593, "rewards/accuracies": 1.0, "rewards/chosen": 1.984837293624878, "rewards/margins": 7.791705131530762, "rewards/rejected": -5.806867599487305, "step": 796 }, { "epoch": 2.4071630537229027, "grad_norm": 5.914654731750488, "learning_rate": 7.595166163141994e-07, "logits/chosen": -2.6168839931488037, "logits/rejected": -2.564103364944458, "logps/chosen": -30.762798309326172, "logps/rejected": -112.25588989257812, "loss": 0.3508, "rewards/accuracies": 0.96875, "rewards/chosen": 2.7725830078125, "rewards/margins": 6.343110084533691, "rewards/rejected": -3.5705273151397705, "step": 797 }, { "epoch": 2.4101790763430726, "grad_norm": 6.317051410675049, "learning_rate": 7.592145015105739e-07, "logits/chosen": -2.6341452598571777, "logits/rejected": -2.481501817703247, "logps/chosen": -38.08799743652344, "logps/rejected": -132.64854431152344, "loss": 0.3154, "rewards/accuracies": 0.9375, "rewards/chosen": 2.097627639770508, "rewards/margins": 6.9302825927734375, "rewards/rejected": -4.83265495300293, "step": 798 }, { "epoch": 2.413195098963242, "grad_norm": 4.792420864105225, "learning_rate": 7.589123867069486e-07, "logits/chosen": -2.512363910675049, "logits/rejected": -2.4814207553863525, "logps/chosen": -37.090118408203125, "logps/rejected": -126.18080139160156, "loss": 0.4757, "rewards/accuracies": 0.9375, "rewards/chosen": 1.785768747329712, "rewards/margins": 6.410417556762695, "rewards/rejected": -4.6246490478515625, "step": 799 }, { "epoch": 2.416211121583412, "grad_norm": 4.334286212921143, "learning_rate": 7.586102719033233e-07, "logits/chosen": -2.471717596054077, "logits/rejected": -2.46914005279541, "logps/chosen": -38.63822937011719, "logps/rejected": -143.32687377929688, "loss": 0.3788, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9299731254577637, "rewards/margins": 7.6420745849609375, "rewards/rejected": -5.712100982666016, "step": 800 }, { "epoch": 2.4192271442035818, "grad_norm": 5.516719341278076, "learning_rate": 7.583081570996978e-07, "logits/chosen": -2.544977903366089, "logits/rejected": -2.501516580581665, "logps/chosen": -26.905811309814453, "logps/rejected": -113.09710693359375, "loss": 0.3945, "rewards/accuracies": 0.96875, "rewards/chosen": 2.9128007888793945, "rewards/margins": 6.725961208343506, "rewards/rejected": -3.813159465789795, "step": 801 }, { "epoch": 2.422243166823751, "grad_norm": 4.502244472503662, "learning_rate": 7.580060422960724e-07, "logits/chosen": -2.6236493587493896, "logits/rejected": -2.5214054584503174, "logps/chosen": -26.737964630126953, "logps/rejected": -125.43275451660156, "loss": 0.3324, "rewards/accuracies": 1.0, "rewards/chosen": 2.824956178665161, "rewards/margins": 7.454288959503174, "rewards/rejected": -4.629332542419434, "step": 802 }, { "epoch": 2.4252591894439206, "grad_norm": 4.170332431793213, "learning_rate": 7.577039274924471e-07, "logits/chosen": -2.591916084289551, "logits/rejected": -2.5383644104003906, "logps/chosen": -35.16984939575195, "logps/rejected": -113.64840698242188, "loss": 0.435, "rewards/accuracies": 0.90625, "rewards/chosen": 1.999222993850708, "rewards/margins": 6.109658241271973, "rewards/rejected": -4.110435485839844, "step": 803 }, { "epoch": 2.4282752120640905, "grad_norm": 4.3592681884765625, "learning_rate": 7.574018126888218e-07, "logits/chosen": -2.638465404510498, "logits/rejected": -2.557246685028076, "logps/chosen": -37.56809616088867, "logps/rejected": -138.2155303955078, "loss": 0.358, "rewards/accuracies": 0.90625, "rewards/chosen": 2.012599229812622, "rewards/margins": 7.328717231750488, "rewards/rejected": -5.316118240356445, "step": 804 }, { "epoch": 2.4282752120640905, "eval_logits/chosen": -2.4831721782684326, "eval_logits/rejected": -2.439906120300293, "eval_logps/chosen": -35.3286018371582, "eval_logps/rejected": -140.76856994628906, "eval_loss": 0.29873764514923096, "eval_rewards/accuracies": 0.9664948582649231, "eval_rewards/chosen": 2.4076082706451416, "eval_rewards/margins": 7.857303619384766, "eval_rewards/rejected": -5.449695110321045, "eval_runtime": 697.6422, "eval_samples_per_second": 0.555, "eval_steps_per_second": 0.278, "step": 804 } ], "logging_steps": 1.0, "max_steps": 3310, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 67, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }