{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8085106382978724, "eval_steps": 500, "global_step": 33, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0851063829787234, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": -1.8645445108413696, "logits/rejected": 14.429821968078613, "logps/chosen": -346.043701171875, "logps/rejected": -212.6157684326172, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.1702127659574468, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": 0.5981572866439819, "logits/rejected": 11.581156730651855, "logps/chosen": -301.7901306152344, "logps/rejected": -188.81680297851562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.2553191489361702, "grad_norm": 77.4799575805664, "learning_rate": 1e-07, "logits/chosen": -0.6666683554649353, "logits/rejected": 13.3030424118042, "logps/chosen": -244.64401245117188, "logps/rejected": -135.6005096435547, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3 }, { "epoch": 0.3404255319148936, "grad_norm": 75.18358612060547, "learning_rate": 2e-07, "logits/chosen": -0.6529165506362915, "logits/rejected": 12.279073715209961, "logps/chosen": -260.230224609375, "logps/rejected": -151.9573974609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4 }, { "epoch": 0.425531914893617, "grad_norm": 80.90792846679688, "learning_rate": 3e-07, "logits/chosen": 0.008319228887557983, "logits/rejected": 17.30872917175293, "logps/chosen": -284.0721435546875, "logps/rejected": -131.707275390625, "loss": 0.6899, "rewards/accuracies": 0.90625, "rewards/chosen": 0.008991742506623268, "rewards/margins": 0.007281172554939985, "rewards/rejected": 0.001710569835267961, "step": 5 }, { "epoch": 0.5106382978723404, "grad_norm": 70.98322296142578, "learning_rate": 4e-07, "logits/chosen": 1.6513853073120117, "logits/rejected": 10.607856750488281, "logps/chosen": -241.02496337890625, "logps/rejected": -187.40670776367188, "loss": 0.6637, "rewards/accuracies": 0.75, "rewards/chosen": 0.08284933865070343, "rewards/margins": 0.04663487523794174, "rewards/rejected": 0.03621446341276169, "step": 6 }, { "epoch": 0.5957446808510638, "grad_norm": 64.64167022705078, "learning_rate": 5e-07, "logits/chosen": -0.655532717704773, "logits/rejected": 13.186487197875977, "logps/chosen": -312.20770263671875, "logps/rejected": -185.97059631347656, "loss": 0.5792, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3591811954975128, "rewards/margins": 0.2757205367088318, "rewards/rejected": 0.08346068859100342, "step": 7 }, { "epoch": 0.6808510638297872, "grad_norm": 50.39228820800781, "learning_rate": 4.821428571428571e-07, "logits/chosen": -0.6116840243339539, "logits/rejected": 14.426715850830078, "logps/chosen": -278.035888671875, "logps/rejected": -141.1471710205078, "loss": 0.4993, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6650260090827942, "rewards/margins": 0.4626970589160919, "rewards/rejected": 0.20232899487018585, "step": 8 }, { "epoch": 0.7659574468085106, "grad_norm": 36.09687042236328, "learning_rate": 4.6428571428571427e-07, "logits/chosen": 1.258104681968689, "logits/rejected": 13.34419059753418, "logps/chosen": 
-207.1935577392578, "logps/rejected": -125.97917938232422, "loss": 0.3753, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0702375173568726, "rewards/margins": 0.9072185158729553, "rewards/rejected": 0.1630191206932068, "step": 9 }, { "epoch": 0.851063829787234, "grad_norm": 31.1262264251709, "learning_rate": 4.464285714285714e-07, "logits/chosen": 1.3978009223937988, "logits/rejected": 14.55895709991455, "logps/chosen": -271.84869384765625, "logps/rejected": -154.90167236328125, "loss": 0.3268, "rewards/accuracies": 0.8125, "rewards/chosen": 1.6433621644973755, "rewards/margins": 1.044425368309021, "rewards/rejected": 0.5989368557929993, "step": 10 }, { "epoch": 0.9361702127659575, "grad_norm": 36.019447326660156, "learning_rate": 4.285714285714285e-07, "logits/chosen": 2.2715587615966797, "logits/rejected": 11.973756790161133, "logps/chosen": -254.08721923828125, "logps/rejected": -192.8157501220703, "loss": 0.3231, "rewards/accuracies": 0.75, "rewards/chosen": 2.2909791469573975, "rewards/margins": 1.358994483947754, "rewards/rejected": 0.931984543800354, "step": 11 }, { "epoch": 1.0212765957446808, "grad_norm": 34.85441207885742, "learning_rate": 4.1071428571428566e-07, "logits/chosen": 2.2988595962524414, "logits/rejected": 11.726805686950684, "logps/chosen": -290.65106201171875, "logps/rejected": -231.5535125732422, "loss": 0.3005, "rewards/accuracies": 0.84375, "rewards/chosen": 2.6145758628845215, "rewards/margins": 2.101471424102783, "rewards/rejected": 0.5131043195724487, "step": 12 }, { "epoch": 1.1063829787234043, "grad_norm": 15.43303394317627, "learning_rate": 3.928571428571428e-07, "logits/chosen": 2.161435127258301, "logits/rejected": 15.173026084899902, "logps/chosen": -285.2391662597656, "logps/rejected": -175.35610961914062, "loss": 0.1156, "rewards/accuracies": 0.96875, "rewards/chosen": 3.8329055309295654, "rewards/margins": 3.6420764923095703, "rewards/rejected": 0.19082875549793243, "step": 13 }, { "epoch": 1.1914893617021276, "grad_norm": 22.640640258789062, "learning_rate": 3.75e-07, "logits/chosen": 0.34790876507759094, "logits/rejected": 16.288211822509766, "logps/chosen": -258.007080078125, "logps/rejected": -125.82489013671875, "loss": 0.1923, "rewards/accuracies": 0.90625, "rewards/chosen": 3.476907968521118, "rewards/margins": 3.2935471534729004, "rewards/rejected": 0.18336114287376404, "step": 14 }, { "epoch": 1.2765957446808511, "grad_norm": 17.350461959838867, "learning_rate": 3.5714285714285716e-07, "logits/chosen": 0.2184600532054901, "logits/rejected": 12.73766040802002, "logps/chosen": -201.4739532470703, "logps/rejected": -120.93484497070312, "loss": 0.1553, "rewards/accuracies": 0.9375, "rewards/chosen": 3.4015889167785645, "rewards/margins": 3.4224205017089844, "rewards/rejected": -0.020831629633903503, "step": 15 }, { "epoch": 1.3617021276595744, "grad_norm": 10.290063858032227, "learning_rate": 3.392857142857143e-07, "logits/chosen": 1.29380464553833, "logits/rejected": 17.044889450073242, "logps/chosen": -255.9779052734375, "logps/rejected": -129.52340698242188, "loss": 0.0961, "rewards/accuracies": 0.9375, "rewards/chosen": 4.283539295196533, "rewards/margins": 4.322422504425049, "rewards/rejected": -0.03888271749019623, "step": 16 }, { "epoch": 1.4468085106382977, "grad_norm": 12.951630592346191, "learning_rate": 3.2142857142857145e-07, "logits/chosen": 0.8126751780509949, "logits/rejected": 14.440966606140137, "logps/chosen": -242.29527282714844, "logps/rejected": -145.05775451660156, "loss": 0.0923, "rewards/accuracies": 0.96875, 
"rewards/chosen": 3.8756561279296875, "rewards/margins": 4.231058597564697, "rewards/rejected": -0.35540255904197693, "step": 17 }, { "epoch": 1.5319148936170213, "grad_norm": 19.92337417602539, "learning_rate": 3.0357142857142855e-07, "logits/chosen": 1.3031485080718994, "logits/rejected": 13.797933578491211, "logps/chosen": -248.54811096191406, "logps/rejected": -165.89808654785156, "loss": 0.0929, "rewards/accuracies": 1.0, "rewards/chosen": 4.139030456542969, "rewards/margins": 4.645418167114258, "rewards/rejected": -0.5063877701759338, "step": 18 }, { "epoch": 1.6170212765957448, "grad_norm": 9.488024711608887, "learning_rate": 2.857142857142857e-07, "logits/chosen": 2.991806983947754, "logits/rejected": 14.36280632019043, "logps/chosen": -278.69769287109375, "logps/rejected": -145.9281463623047, "loss": 0.0783, "rewards/accuracies": 0.9375, "rewards/chosen": 4.406310558319092, "rewards/margins": 5.150607109069824, "rewards/rejected": -0.7442967295646667, "step": 19 }, { "epoch": 1.702127659574468, "grad_norm": 12.83969497680664, "learning_rate": 2.6785714285714284e-07, "logits/chosen": 3.6287827491760254, "logits/rejected": 12.436114311218262, "logps/chosen": -210.47592163085938, "logps/rejected": -189.6756134033203, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": 3.7564351558685303, "rewards/margins": 4.538413047790527, "rewards/rejected": -0.7819780707359314, "step": 20 }, { "epoch": 1.7872340425531914, "grad_norm": 17.395404815673828, "learning_rate": 2.5e-07, "logits/chosen": 4.2808837890625, "logits/rejected": 13.777887344360352, "logps/chosen": -271.89227294921875, "logps/rejected": -207.44837951660156, "loss": 0.1051, "rewards/accuracies": 0.96875, "rewards/chosen": 4.251194477081299, "rewards/margins": 5.237154006958008, "rewards/rejected": -0.9859597682952881, "step": 21 }, { "epoch": 1.872340425531915, "grad_norm": 23.122020721435547, "learning_rate": 2.3214285714285714e-07, "logits/chosen": 1.1441445350646973, "logits/rejected": 14.379843711853027, "logps/chosen": -235.24493408203125, "logps/rejected": -194.19224548339844, "loss": 0.1211, "rewards/accuracies": 0.9375, "rewards/chosen": 4.572199821472168, "rewards/margins": 5.295849800109863, "rewards/rejected": -0.7236496806144714, "step": 22 }, { "epoch": 1.9574468085106385, "grad_norm": 13.432173728942871, "learning_rate": 2.1428571428571426e-07, "logits/chosen": -0.5294728875160217, "logits/rejected": 16.71356201171875, "logps/chosen": -219.78480529785156, "logps/rejected": -111.86544036865234, "loss": 0.1006, "rewards/accuracies": 0.9375, "rewards/chosen": 3.635490894317627, "rewards/margins": 4.1884026527404785, "rewards/rejected": -0.5529115200042725, "step": 23 }, { "epoch": 2.0425531914893615, "grad_norm": 8.677014350891113, "learning_rate": 1.964285714285714e-07, "logits/chosen": 0.39843907952308655, "logits/rejected": 13.533981323242188, "logps/chosen": -216.69137573242188, "logps/rejected": -143.2519989013672, "loss": 0.056, "rewards/accuracies": 1.0, "rewards/chosen": 4.103562355041504, "rewards/margins": 4.519499778747559, "rewards/rejected": -0.4159368574619293, "step": 24 }, { "epoch": 2.127659574468085, "grad_norm": 3.03104305267334, "learning_rate": 1.7857142857142858e-07, "logits/chosen": 1.5668433904647827, "logits/rejected": 14.729446411132812, "logps/chosen": -191.61404418945312, "logps/rejected": -183.37631225585938, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 4.106316566467285, "rewards/margins": 5.793395042419434, "rewards/rejected": -1.6870781183242798, "step": 25 }, { 
"epoch": 2.2127659574468086, "grad_norm": 3.136233329772949, "learning_rate": 1.6071428571428573e-07, "logits/chosen": 0.05481068789958954, "logits/rejected": 14.857564926147461, "logps/chosen": -266.8812561035156, "logps/rejected": -150.13543701171875, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 4.614926338195801, "rewards/margins": 5.615174293518066, "rewards/rejected": -1.0002480745315552, "step": 26 }, { "epoch": 2.297872340425532, "grad_norm": 2.425645112991333, "learning_rate": 1.4285714285714285e-07, "logits/chosen": 1.4185882806777954, "logits/rejected": 12.52522087097168, "logps/chosen": -257.33148193359375, "logps/rejected": -225.2210235595703, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 4.434384822845459, "rewards/margins": 5.778439044952393, "rewards/rejected": -1.3440542221069336, "step": 27 }, { "epoch": 2.382978723404255, "grad_norm": 5.070924282073975, "learning_rate": 1.25e-07, "logits/chosen": 1.7797931432724, "logits/rejected": 12.562594413757324, "logps/chosen": -289.022216796875, "logps/rejected": -247.23553466796875, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": 4.724697113037109, "rewards/margins": 6.377914905548096, "rewards/rejected": -1.6532176733016968, "step": 28 }, { "epoch": 2.4680851063829787, "grad_norm": 2.1009552478790283, "learning_rate": 1.0714285714285713e-07, "logits/chosen": -0.9775732755661011, "logits/rejected": 14.321802139282227, "logps/chosen": -209.55625915527344, "logps/rejected": -115.63282775878906, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 4.2973175048828125, "rewards/margins": 5.245972156524658, "rewards/rejected": -0.9486544132232666, "step": 29 }, { "epoch": 2.5531914893617023, "grad_norm": 2.554861545562744, "learning_rate": 8.928571428571429e-08, "logits/chosen": 2.66330623626709, "logits/rejected": 11.6483154296875, "logps/chosen": -243.45645141601562, "logps/rejected": -232.2058563232422, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": 4.154801368713379, "rewards/margins": 6.131631851196289, "rewards/rejected": -1.9768304824829102, "step": 30 }, { "epoch": 2.6382978723404253, "grad_norm": 2.7131459712982178, "learning_rate": 7.142857142857142e-08, "logits/chosen": -0.194177508354187, "logits/rejected": 13.815250396728516, "logps/chosen": -218.8939208984375, "logps/rejected": -164.52902221679688, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 4.5105462074279785, "rewards/margins": 5.92357873916626, "rewards/rejected": -1.413031816482544, "step": 31 }, { "epoch": 2.723404255319149, "grad_norm": 3.911942481994629, "learning_rate": 5.3571428571428564e-08, "logits/chosen": -0.5970292091369629, "logits/rejected": 16.974449157714844, "logps/chosen": -247.49411010742188, "logps/rejected": -128.2796630859375, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 4.004292011260986, "rewards/margins": 5.12349271774292, "rewards/rejected": -1.1192007064819336, "step": 32 }, { "epoch": 2.8085106382978724, "grad_norm": 3.6512134075164795, "learning_rate": 3.571428571428571e-08, "logits/chosen": -1.6241520643234253, "logits/rejected": 12.453986167907715, "logps/chosen": -295.5976867675781, "logps/rejected": -223.0675811767578, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": 5.045155048370361, "rewards/margins": 6.366570472717285, "rewards/rejected": -1.321415901184082, "step": 33 } ], "logging_steps": 1.0, "max_steps": 33, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { 
"args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9958223708160.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }