{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9646365422396856, "eval_steps": 50, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09823182711198428, "grad_norm": 4.67578125, "learning_rate": 5e-07, "logits/chosen": -1.882656216621399, "logits/rejected": -2.0222558975219727, "logps/chosen": -629.1900024414062, "logps/rejected": -886.4169311523438, "loss": 0.6184, "rewards/accuracies": 0.39500001072883606, "rewards/chosen": 1.2086933851242065, "rewards/margins": 0.8173081278800964, "rewards/rejected": 0.39138519763946533, "step": 50 }, { "epoch": 0.09823182711198428, "eval_logits/chosen": -1.8351056575775146, "eval_logits/rejected": -1.898075819015503, "eval_logps/chosen": -827.69677734375, "eval_logps/rejected": -769.7338256835938, "eval_loss": 0.21162064373493195, "eval_rewards/accuracies": 0.7060185074806213, "eval_rewards/chosen": 4.598723888397217, "eval_rewards/margins": 3.7964937686920166, "eval_rewards/rejected": 0.802230179309845, "eval_runtime": 376.4446, "eval_samples_per_second": 1.145, "eval_steps_per_second": 0.574, "step": 50 }, { "epoch": 0.19646365422396855, "grad_norm": 1.43359375, "learning_rate": 1e-06, "logits/chosen": -1.9130176305770874, "logits/rejected": -2.028027296066284, "logps/chosen": -584.5574951171875, "logps/rejected": -919.9600219726562, "loss": 0.3184, "rewards/accuracies": 0.5550000071525574, "rewards/chosen": 4.297489166259766, "rewards/margins": 5.647652626037598, "rewards/rejected": -1.350163459777832, "step": 100 }, { "epoch": 0.19646365422396855, "eval_logits/chosen": -1.830168604850769, "eval_logits/rejected": -1.8947844505310059, "eval_logps/chosen": -808.8148193359375, "eval_logps/rejected": -775.2783813476562, "eval_loss": 0.20374441146850586, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.487063884735107, "eval_rewards/margins": 6.239492416381836, "eval_rewards/rejected": 0.24757163226604462, "eval_runtime": 376.9374, "eval_samples_per_second": 1.143, "eval_steps_per_second": 0.573, "step": 100 }, { "epoch": 0.29469548133595286, "grad_norm": 0.00019991397857666016, "learning_rate": 9.92403876506104e-07, "logits/chosen": -1.8983594179153442, "logits/rejected": -1.995449185371399, "logps/chosen": -539.44873046875, "logps/rejected": -860.8162231445312, "loss": 0.3339, "rewards/accuracies": 0.5274999737739563, "rewards/chosen": 4.254204273223877, "rewards/margins": 8.191503524780273, "rewards/rejected": -3.937298536300659, "step": 150 }, { "epoch": 0.29469548133595286, "eval_logits/chosen": -1.8110939264297485, "eval_logits/rejected": -1.8730604648590088, "eval_logps/chosen": -810.138916015625, "eval_logps/rejected": -787.7071533203125, "eval_loss": 0.20321600139141083, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.352499961853027, "eval_rewards/margins": 7.34998083114624, "eval_rewards/rejected": -0.9974803924560547, "eval_runtime": 381.3297, "eval_samples_per_second": 1.13, "eval_steps_per_second": 0.566, "step": 150 }, { "epoch": 0.3929273084479371, "grad_norm": 3.673828125, "learning_rate": 9.698463103929541e-07, "logits/chosen": -1.8827344179153442, "logits/rejected": NaN, "logps/chosen": -537.2374877929688, "logps/rejected": -964.1453857421875, "loss": 0.2992, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 4.266866683959961, "rewards/margins": 12.264365196228027, "rewards/rejected": -7.997498989105225, "step": 200 }, { "epoch": 0.3929273084479371, "eval_logits/chosen": -1.8244402408599854, "eval_logits/rejected": -1.88623046875, "eval_logps/chosen": -812.432861328125, "eval_logps/rejected": -809.0966186523438, "eval_loss": 0.20289792120456696, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.12615442276001, "eval_rewards/margins": 9.262307167053223, "eval_rewards/rejected": -3.1361522674560547, "eval_runtime": 375.9512, "eval_samples_per_second": 1.146, "eval_steps_per_second": 0.575, "step": 200 }, { "epoch": 0.4911591355599214, "grad_norm": 3.783203125, "learning_rate": 9.330127018922193e-07, "logits/chosen": -1.8917089700698853, "logits/rejected": -2.057480573654175, "logps/chosen": -548.9512329101562, "logps/rejected": -1068.0450439453125, "loss": 0.3062, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 4.461262226104736, "rewards/margins": 15.397392272949219, "rewards/rejected": -10.93613052368164, "step": 250 }, { "epoch": 0.4911591355599214, "eval_logits/chosen": -1.8248969316482544, "eval_logits/rejected": -1.8831199407577515, "eval_logps/chosen": -814.4699096679688, "eval_logps/rejected": -816.1018676757812, "eval_loss": 0.20285306870937347, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 5.92443323135376, "eval_rewards/margins": 9.760541915893555, "eval_rewards/rejected": -3.836108684539795, "eval_runtime": 382.044, "eval_samples_per_second": 1.128, "eval_steps_per_second": 0.565, "step": 250 }, { "epoch": 0.5893909626719057, "grad_norm": 0.0, "learning_rate": 8.83022221559489e-07, "logits/chosen": -1.8699413537979126, "logits/rejected": -2.034736394882202, "logps/chosen": -602.510009765625, "logps/rejected": -1128.844970703125, "loss": 0.2878, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 4.663626670837402, "rewards/margins": 18.277070999145508, "rewards/rejected": -13.613446235656738, "step": 300 }, { "epoch": 0.5893909626719057, "eval_logits/chosen": -1.8203803300857544, "eval_logits/rejected": -1.8756872415542603, "eval_logps/chosen": -814.0150756835938, "eval_logps/rejected": -814.901611328125, "eval_loss": 0.20328794419765472, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 5.968328475952148, "eval_rewards/margins": 9.685802459716797, "eval_rewards/rejected": -3.7174744606018066, "eval_runtime": 377.271, "eval_samples_per_second": 1.142, "eval_steps_per_second": 0.573, "step": 300 }, { "epoch": 0.68762278978389, "grad_norm": 0.0, "learning_rate": 8.213938048432696e-07, "logits/chosen": -1.8613183498382568, "logits/rejected": -2.010195255279541, "logps/chosen": -560.01123046875, "logps/rejected": -1054.34130859375, "loss": 0.2965, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 4.663458347320557, "rewards/margins": 17.340160369873047, "rewards/rejected": -12.676701545715332, "step": 350 }, { "epoch": 0.68762278978389, "eval_logits/chosen": -1.7953965663909912, "eval_logits/rejected": -1.8488408327102661, "eval_logps/chosen": -810.86572265625, "eval_logps/rejected": -815.11572265625, "eval_loss": 0.2027529776096344, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.281754016876221, "eval_rewards/margins": 10.01964282989502, "eval_rewards/rejected": -3.7378885746002197, "eval_runtime": 376.3789, "eval_samples_per_second": 1.145, "eval_steps_per_second": 0.574, "step": 350 }, { "epoch": 0.7858546168958742, "grad_norm": 0.0, "learning_rate": 7.5e-07, "logits/chosen": -1.8544628620147705, "logits/rejected": -1.9972070455551147, "logps/chosen": -550.8362426757812, "logps/rejected": -1053.2850341796875, "loss": 0.3138, "rewards/accuracies": 0.5475000143051147, "rewards/chosen": 4.41979455947876, "rewards/margins": 17.060997009277344, "rewards/rejected": -12.641203880310059, "step": 400 }, { "epoch": 0.7858546168958742, "eval_logits/chosen": -1.7978651523590088, "eval_logits/rejected": -1.85009765625, "eval_logps/chosen": -816.5775756835938, "eval_logps/rejected": -824.407958984375, "eval_loss": 0.20284250378608704, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 5.713149070739746, "eval_rewards/margins": 10.380072593688965, "eval_rewards/rejected": -4.666924953460693, "eval_runtime": 376.6783, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.573, "step": 400 }, { "epoch": 0.8840864440078585, "grad_norm": 0.0, "learning_rate": 6.710100716628344e-07, "logits/chosen": -1.8560644388198853, "logits/rejected": -1.9615429639816284, "logps/chosen": -552.2062377929688, "logps/rejected": -1005.7550048828125, "loss": 0.3279, "rewards/accuracies": 0.5350000262260437, "rewards/chosen": 4.51744270324707, "rewards/margins": 16.83458137512207, "rewards/rejected": -12.317138671875, "step": 450 }, { "epoch": 0.8840864440078585, "eval_logits/chosen": -1.7807526588439941, "eval_logits/rejected": -1.833708643913269, "eval_logps/chosen": -808.5011596679688, "eval_logps/rejected": -816.6145629882812, "eval_loss": 0.20271265506744385, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.52016544342041, "eval_rewards/margins": 10.406230926513672, "eval_rewards/rejected": -3.8860654830932617, "eval_runtime": 376.9509, "eval_samples_per_second": 1.143, "eval_steps_per_second": 0.573, "step": 450 }, { "epoch": 0.9823182711198428, "grad_norm": 0.2420654296875, "learning_rate": 5.868240888334652e-07, "logits/chosen": -1.8405078649520874, "logits/rejected": -1.9955663681030273, "logps/chosen": -563.3287353515625, "logps/rejected": -1027.811279296875, "loss": 0.293, "rewards/accuracies": 0.5774999856948853, "rewards/chosen": 4.939021587371826, "rewards/margins": 16.828310012817383, "rewards/rejected": -11.889289855957031, "step": 500 }, { "epoch": 0.9823182711198428, "eval_logits/chosen": -1.776493787765503, "eval_logits/rejected": -1.8292959928512573, "eval_logps/chosen": -803.6666870117188, "eval_logps/rejected": -814.2019653320312, "eval_loss": 0.20270462334156036, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 7.002051830291748, "eval_rewards/margins": 10.646075248718262, "eval_rewards/rejected": -3.6440227031707764, "eval_runtime": 375.946, "eval_samples_per_second": 1.146, "eval_steps_per_second": 0.575, "step": 500 }, { "epoch": 1.080550098231827, "grad_norm": 0.0, "learning_rate": 5e-07, "logits/chosen": -1.845117211341858, "logits/rejected": NaN, "logps/chosen": -590.0087280273438, "logps/rejected": -1017.6812744140625, "loss": 0.3064, "rewards/accuracies": 0.5583333373069763, "rewards/chosen": 5.3995184898376465, "rewards/margins": 17.224445343017578, "rewards/rejected": -11.82492446899414, "step": 550 }, { "epoch": 1.080550098231827, "eval_logits/chosen": -1.7842611074447632, "eval_logits/rejected": -1.837895154953003, "eval_logps/chosen": -803.8020629882812, "eval_logps/rejected": -816.7523193359375, "eval_loss": 0.20272213220596313, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.988828182220459, "eval_rewards/margins": 10.88884162902832, "eval_rewards/rejected": -3.900012254714966, "eval_runtime": 378.7944, "eval_samples_per_second": 1.138, "eval_steps_per_second": 0.57, "step": 550 }, { "epoch": 1.1787819253438114, "grad_norm": 0.0, "learning_rate": 4.131759111665348e-07, "logits/chosen": -1.8424999713897705, "logits/rejected": -1.9918944835662842, "logps/chosen": -574.5387573242188, "logps/rejected": -1103.74755859375, "loss": 0.2844, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 5.156400203704834, "rewards/margins": 17.812108993530273, "rewards/rejected": -12.655708312988281, "step": 600 }, { "epoch": 1.1787819253438114, "eval_logits/chosen": -1.7866662740707397, "eval_logits/rejected": -1.8396222591400146, "eval_logps/chosen": -805.9617919921875, "eval_logps/rejected": -820.7268676757812, "eval_loss": 0.202733114361763, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.773044586181641, "eval_rewards/margins": 11.070393562316895, "eval_rewards/rejected": -4.2973504066467285, "eval_runtime": 375.1497, "eval_samples_per_second": 1.149, "eval_steps_per_second": 0.576, "step": 600 }, { "epoch": 1.2770137524557956, "grad_norm": 0.0, "learning_rate": 3.2898992833716563e-07, "logits/chosen": -1.8421484231948853, "logits/rejected": -1.972021460533142, "logps/chosen": -555.9462280273438, "logps/rejected": -1028.8499755859375, "loss": 0.2972, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 5.088903903961182, "rewards/margins": 17.384765625, "rewards/rejected": -12.295862197875977, "step": 650 }, { "epoch": 1.2770137524557956, "eval_logits/chosen": -1.7830674648284912, "eval_logits/rejected": -1.8358244895935059, "eval_logps/chosen": -804.9791870117188, "eval_logps/rejected": -819.3784790039062, "eval_loss": 0.20274707674980164, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.87298583984375, "eval_rewards/margins": 11.037016868591309, "eval_rewards/rejected": -4.164031505584717, "eval_runtime": 375.3471, "eval_samples_per_second": 1.148, "eval_steps_per_second": 0.575, "step": 650 }, { "epoch": 1.37524557956778, "grad_norm": 0.0, "learning_rate": 2.500000000000001e-07, "logits/chosen": -1.8376269340515137, "logits/rejected": -1.9944921731948853, "logps/chosen": -595.8699951171875, "logps/rejected": -1099.8599853515625, "loss": 0.293, "rewards/accuracies": 0.5774999856948853, "rewards/chosen": 5.341933727264404, "rewards/margins": 18.64950180053711, "rewards/rejected": -13.307567596435547, "step": 700 }, { "epoch": 1.37524557956778, "eval_logits/chosen": -1.7823712825775146, "eval_logits/rejected": -1.8346353769302368, "eval_logps/chosen": -804.67822265625, "eval_logps/rejected": -819.5619506835938, "eval_loss": 0.20272904634475708, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.901559829711914, "eval_rewards/margins": 11.084138870239258, "eval_rewards/rejected": -4.1825785636901855, "eval_runtime": 377.6355, "eval_samples_per_second": 1.141, "eval_steps_per_second": 0.572, "step": 700 }, { "epoch": 1.4734774066797642, "grad_norm": 0.0, "learning_rate": 1.7860619515673032e-07, "logits/chosen": -1.8395702838897705, "logits/rejected": -1.969667911529541, "logps/chosen": -551.7537231445312, "logps/rejected": -1085.00537109375, "loss": 0.3, "rewards/accuracies": 0.5674999952316284, "rewards/chosen": 5.192094326019287, "rewards/margins": 18.057546615600586, "rewards/rejected": -12.86545181274414, "step": 750 }, { "epoch": 1.4734774066797642, "eval_logits/chosen": -1.7821271419525146, "eval_logits/rejected": -1.8342194557189941, "eval_logps/chosen": -804.69677734375, "eval_logps/rejected": -819.6371459960938, "eval_loss": 0.20272937417030334, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.901213645935059, "eval_rewards/margins": 11.090865135192871, "eval_rewards/rejected": -4.1896538734436035, "eval_runtime": 376.5424, "eval_samples_per_second": 1.145, "eval_steps_per_second": 0.574, "step": 750 }, { "epoch": 1.5717092337917484, "grad_norm": 0.180908203125, "learning_rate": 1.1697777844051104e-07, "logits/chosen": -1.8340917825698853, "logits/rejected": -1.9704101085662842, "logps/chosen": -571.333740234375, "logps/rejected": -1077.0687255859375, "loss": 0.2912, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 5.163873195648193, "rewards/margins": 17.815139770507812, "rewards/rejected": -12.651267051696777, "step": 800 }, { "epoch": 1.5717092337917484, "eval_logits/chosen": -1.7813811302185059, "eval_logits/rejected": -1.8334101438522339, "eval_logps/chosen": -804.3715209960938, "eval_logps/rejected": -819.3778686523438, "eval_loss": 0.202724888920784, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.932766437530518, "eval_rewards/margins": 11.097650527954102, "eval_rewards/rejected": -4.1648850440979, "eval_runtime": 375.1719, "eval_samples_per_second": 1.149, "eval_steps_per_second": 0.576, "step": 800 }, { "epoch": 1.6699410609037328, "grad_norm": 0.05718994140625, "learning_rate": 6.698729810778064e-08, "logits/chosen": -1.8348926305770874, "logits/rejected": -1.9697363376617432, "logps/chosen": -583.3875122070312, "logps/rejected": -1078.706298828125, "loss": 0.2827, "rewards/accuracies": 0.5924999713897705, "rewards/chosen": 5.512423515319824, "rewards/margins": 18.3432674407959, "rewards/rejected": -12.83084487915039, "step": 850 }, { "epoch": 1.6699410609037328, "eval_logits/chosen": -1.7811685800552368, "eval_logits/rejected": -1.8332293033599854, "eval_logps/chosen": -804.3367919921875, "eval_logps/rejected": -819.3900756835938, "eval_loss": 0.20272420346736908, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.935370445251465, "eval_rewards/margins": 11.101933479309082, "eval_rewards/rejected": -4.166563034057617, "eval_runtime": 376.8901, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.573, "step": 850 }, { "epoch": 1.768172888015717, "grad_norm": 0.0, "learning_rate": 3.015368960704584e-08, "logits/chosen": -1.837314486503601, "logits/rejected": -1.9836230278015137, "logps/chosen": -580.686279296875, "logps/rejected": -1060.800048828125, "loss": 0.2912, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 5.313009738922119, "rewards/margins": 17.764554977416992, "rewards/rejected": -12.451545715332031, "step": 900 }, { "epoch": 1.768172888015717, "eval_logits/chosen": -1.781037449836731, "eval_logits/rejected": -1.8330711126327515, "eval_logps/chosen": -804.3738403320312, "eval_logps/rejected": -819.3929443359375, "eval_loss": 0.20272405445575714, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.9330267906188965, "eval_rewards/margins": 11.098607063293457, "eval_rewards/rejected": -4.165579795837402, "eval_runtime": 376.8443, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.573, "step": 900 }, { "epoch": 1.8664047151277012, "grad_norm": 0.33740234375, "learning_rate": 7.59612349389599e-09, "logits/chosen": -1.8312207460403442, "logits/rejected": -1.980654239654541, "logps/chosen": -517.9837646484375, "logps/rejected": -973.3825073242188, "loss": 0.3137, "rewards/accuracies": 0.5475000143051147, "rewards/chosen": 5.046032905578613, "rewards/margins": 17.248321533203125, "rewards/rejected": -12.202287673950195, "step": 950 }, { "epoch": 1.8664047151277012, "eval_logits/chosen": -1.7810285091400146, "eval_logits/rejected": -1.833039402961731, "eval_logps/chosen": -804.3425903320312, "eval_logps/rejected": -819.4027709960938, "eval_loss": 0.20272395014762878, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.937338829040527, "eval_rewards/margins": 11.10338020324707, "eval_rewards/rejected": -4.166042327880859, "eval_runtime": 376.6269, "eval_samples_per_second": 1.144, "eval_steps_per_second": 0.574, "step": 950 }, { "epoch": 1.9646365422396856, "grad_norm": 0.0003845691680908203, "learning_rate": 0.0, "logits/chosen": -1.833642601966858, "logits/rejected": -1.9818944931030273, "logps/chosen": -566.6400146484375, "logps/rejected": -1054.8111572265625, "loss": 0.2922, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 4.9063005447387695, "rewards/margins": 17.938020706176758, "rewards/rejected": -13.031720161437988, "step": 1000 }, { "epoch": 1.9646365422396856, "eval_logits/chosen": -1.7809877395629883, "eval_logits/rejected": -1.8330711126327515, "eval_logps/chosen": -804.3726806640625, "eval_logps/rejected": -819.4230346679688, "eval_loss": 0.20272374153137207, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": 6.933157444000244, "eval_rewards/margins": 11.101253509521484, "eval_rewards/rejected": -4.168097019195557, "eval_runtime": 376.4188, "eval_samples_per_second": 1.145, "eval_steps_per_second": 0.574, "step": 1000 }, { "epoch": 1.9646365422396856, "step": 1000, "total_flos": 0.0, "train_loss": 0.31735729217529296, "train_runtime": 210247.589, "train_samples_per_second": 0.038, "train_steps_per_second": 0.005 } ], "logging_steps": 50, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }