|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.11611030478955008,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 724.58984375,
      "epoch": 0.0011611030478955006,
      "grad_norm": 0.7112110585443041,
      "kl": 0.0009012222290039062,
      "learning_rate": 3.846153846153846e-08,
      "loss": 0.0,
      "reward": 0.046875,
      "reward_std": 0.06733439117670059,
      "rewards/correctness_reward_func": 0.0390625,
      "rewards/strict_format_reward_func": 0.0078125,
      "step": 1
    },
    {
      "completion_length": 708.56640625,
      "epoch": 0.0023222060957910013,
      "grad_norm": 0.12068406687948798,
      "kl": 0.000453948974609375,
      "learning_rate": 7.692307692307692e-08,
      "loss": 0.0,
      "reward": 0.033203125,
      "reward_std": 0.06640625,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.001953125,
      "step": 2
    },
    {
      "completion_length": 742.65625,
      "epoch": 0.0034833091436865023,
      "grad_norm": 0.7976659386050398,
      "kl": 0.0007343292236328125,
      "learning_rate": 1.1538461538461539e-07,
      "loss": 0.0,
      "reward": 0.0234375,
      "reward_std": 0.046875,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.0078125,
      "step": 3
    },
    {
      "completion_length": 808.21875,
      "epoch": 0.0046444121915820025,
      "grad_norm": 0.40112277888275455,
      "kl": 0.00063323974609375,
      "learning_rate": 1.5384615384615385e-07,
      "loss": 0.0,
      "reward": 0.060546875,
      "reward_std": 0.1078859455883503,
      "rewards/correctness_reward_func": 0.0546875,
      "rewards/strict_format_reward_func": 0.005859375,
      "step": 4
    },
    {
      "completion_length": 654.234375,
      "epoch": 0.005805515239477504,
      "grad_norm": 0.39725912723012635,
      "kl": 0.0004980564117431641,
      "learning_rate": 1.9230769230769231e-07,
      "loss": 0.0,
      "reward": 0.044921875,
      "reward_std": 0.08984375,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.013671875,
      "step": 5
    },
    {
      "completion_length": 772.68359375,
      "epoch": 0.006966618287373005,
      "grad_norm": 0.14294736924830026,
      "kl": 0.0004153251647949219,
      "learning_rate": 2.3076923076923078e-07,
      "loss": 0.0,
      "reward": 0.046875,
      "reward_std": 0.0805421955883503,
      "rewards/correctness_reward_func": 0.0390625,
      "rewards/strict_format_reward_func": 0.0078125,
      "step": 6
    },
    {
      "completion_length": 745.203125,
      "epoch": 0.008127721335268505,
      "grad_norm": 0.16815060965909404,
      "kl": 0.0009188652038574219,
      "learning_rate": 2.692307692307692e-07,
      "loss": 0.0,
      "reward": 0.021484375,
      "reward_std": 0.029760945588350296,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.005859375,
      "step": 7
    },
    {
      "completion_length": 744.265625,
      "epoch": 0.009288824383164005,
      "grad_norm": 12.779442066715234,
      "kl": 0.00357818603515625,
      "learning_rate": 3.076923076923077e-07,
      "loss": 0.0001,
      "reward": 0.03125,
      "reward_std": 0.049292195588350296,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.0,
      "step": 8
    },
    {
      "completion_length": 704.51953125,
      "epoch": 0.010449927431059507,
      "grad_norm": 0.2475578183097461,
      "kl": 0.0035419464111328125,
      "learning_rate": 3.461538461538461e-07,
      "loss": 0.0001,
      "reward": 0.060546875,
      "reward_std": 0.1078859455883503,
      "rewards/correctness_reward_func": 0.0546875,
      "rewards/strict_format_reward_func": 0.005859375,
      "step": 9
    },
    {
      "completion_length": 759.2890625,
      "epoch": 0.011611030478955007,
      "grad_norm": 0.3357083329970443,
      "kl": 0.00310516357421875,
      "learning_rate": 3.8461538461538463e-07,
      "loss": 0.0001,
      "reward": 0.00390625,
      "reward_std": 0.0078125,
      "rewards/correctness_reward_func": 0.0,
      "rewards/strict_format_reward_func": 0.00390625,
      "step": 10
    },
    {
      "completion_length": 746.0703125,
      "epoch": 0.012772133526850507,
      "grad_norm": 0.2657165785401948,
      "kl": 0.00334930419921875,
      "learning_rate": 4.2307692307692304e-07,
      "loss": 0.0001,
      "reward": 0.017578125,
      "reward_std": 0.03515625,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.001953125,
      "step": 11
    },
    {
      "completion_length": 702.21875,
      "epoch": 0.01393323657474601,
      "grad_norm": 1.2656634353615759,
      "kl": 0.0019159317016601562,
      "learning_rate": 4.6153846153846156e-07,
      "loss": 0.0001,
      "reward": 0.033203125,
      "reward_std": 0.06640625,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.001953125,
      "step": 12
    },
    {
      "completion_length": 735.78515625,
      "epoch": 0.01509433962264151,
      "grad_norm": 0.5357650576255997,
      "kl": 0.005596160888671875,
      "learning_rate": 5e-07,
      "loss": 0.0002,
      "reward": 0.06640625,
      "reward_std": 0.1280700732022524,
      "rewards/correctness_reward_func": 0.0546875,
      "rewards/strict_format_reward_func": 0.01171875,
      "step": 13
    },
    {
      "completion_length": 802.51953125,
      "epoch": 0.01625544267053701,
      "grad_norm": 1.9685839556774372,
      "kl": 0.00266265869140625,
      "learning_rate": 5.384615384615384e-07,
      "loss": 0.0001,
      "reward": 0.03125,
      "reward_std": 0.059198048897087574,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.015625,
      "step": 14
    },
    {
      "completion_length": 702.47265625,
      "epoch": 0.01741654571843251,
      "grad_norm": 0.3022926592468717,
      "kl": 0.00206756591796875,
      "learning_rate": 5.769230769230768e-07,
      "loss": 0.0001,
      "reward": 0.03125,
      "reward_std": 0.049292195588350296,
      "rewards/correctness_reward_func": 0.0234375,
      "rewards/strict_format_reward_func": 0.0078125,
      "step": 15
    },
    {
      "completion_length": 714.55859375,
      "epoch": 0.01857764876632801,
      "grad_norm": 0.67295700611606,
      "kl": 0.006526947021484375,
      "learning_rate": 6.153846153846154e-07,
      "loss": 0.0003,
      "reward": 0.0546875,
      "reward_std": 0.04213257320225239,
      "rewards/correctness_reward_func": 0.046875,
      "rewards/strict_format_reward_func": 0.0078125,
      "step": 16
    },
    {
      "completion_length": 728.94921875,
      "epoch": 0.019738751814223514,
      "grad_norm": 0.12683035812623522,
      "kl": 0.003253936767578125,
      "learning_rate": 6.538461538461538e-07,
      "loss": 0.0001,
      "reward": 0.041015625,
      "reward_std": 0.0688234455883503,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.009765625,
      "step": 17
    },
    {
      "completion_length": 752.4609375,
      "epoch": 0.020899854862119014,
      "grad_norm": 0.7572918382078203,
      "kl": 0.002063751220703125,
      "learning_rate": 6.923076923076922e-07,
      "loss": 0.0001,
      "reward": 0.01953125,
      "reward_std": 0.0390625,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.00390625,
      "step": 18
    },
    {
      "completion_length": 686.39453125,
      "epoch": 0.022060957910014514,
      "grad_norm": 0.366311042586646,
      "kl": 0.00254058837890625,
      "learning_rate": 7.307692307692307e-07,
      "loss": 0.0001,
      "reward": 0.037109375,
      "reward_std": 0.07421875,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.005859375,
      "step": 19
    },
    {
      "completion_length": 814.58203125,
      "epoch": 0.023222060957910014,
      "grad_norm": 0.39037265038987773,
      "kl": 0.0024509429931640625,
      "learning_rate": 7.692307692307693e-07,
      "loss": 0.0001,
      "reward": 0.056640625,
      "reward_std": 0.1000734455883503,
      "rewards/correctness_reward_func": 0.046875,
      "rewards/strict_format_reward_func": 0.009765625,
      "step": 20
    },
    {
      "completion_length": 790.58203125,
      "epoch": 0.024383164005805515,
      "grad_norm": 0.16449028843577287,
      "kl": 0.0017910003662109375,
      "learning_rate": 8.076923076923077e-07,
      "loss": 0.0001,
      "reward": 0.021484375,
      "reward_std": 0.04296875,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.005859375,
      "step": 21
    },
    {
      "completion_length": 720.109375,
      "epoch": 0.025544267053701015,
      "grad_norm": 0.16050190810020273,
      "kl": 0.000865936279296875,
      "learning_rate": 8.461538461538461e-07,
      "loss": 0.0,
      "reward": 0.041015625,
      "reward_std": 0.08203125,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.009765625,
      "step": 22
    },
    {
      "completion_length": 772.921875,
      "epoch": 0.026705370101596515,
      "grad_norm": 1.022693227541973,
      "kl": 0.010603904724121094,
      "learning_rate": 8.846153846153846e-07,
      "loss": 0.0004,
      "reward": 0.0234375,
      "reward_std": 0.046875,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.0078125,
      "step": 23
    },
    {
      "completion_length": 785.9296875,
      "epoch": 0.02786647314949202,
      "grad_norm": 0.4975098781878154,
      "kl": 0.0006268024444580078,
      "learning_rate": 9.230769230769231e-07,
      "loss": 0.0,
      "reward": 0.0546875,
      "reward_std": 0.10463257133960724,
      "rewards/correctness_reward_func": 0.046875,
      "rewards/strict_format_reward_func": 0.0078125,
      "step": 24
    },
    {
      "completion_length": 808.91796875,
      "epoch": 0.02902757619738752,
      "grad_norm": 0.07636536312105723,
      "kl": 0.0006952285766601562,
      "learning_rate": 9.615384615384615e-07,
      "loss": 0.0,
      "reward": 0.03515625,
      "reward_std": 0.055702777579426765,
      "rewards/correctness_reward_func": 0.0234375,
      "rewards/strict_format_reward_func": 0.01171875,
      "step": 25
    },
    {
      "completion_length": 677.65234375,
      "epoch": 0.03018867924528302,
      "grad_norm": 105.92855708885655,
      "kl": 0.048847198486328125,
      "learning_rate": 1e-06,
      "loss": 0.002,
      "reward": 0.033203125,
      "reward_std": 0.06640625,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.017578125,
      "step": 26
    },
    {
      "completion_length": 657.85546875,
      "epoch": 0.03134978229317852,
      "grad_norm": 0.0861458003290139,
      "kl": 0.004261970520019531,
      "learning_rate": 9.999964611162971e-07,
      "loss": 0.0002,
      "reward": 0.009765625,
      "reward_std": 0.01953125,
      "rewards/correctness_reward_func": 0.0078125,
      "rewards/strict_format_reward_func": 0.001953125,
      "step": 27
    },
    {
      "completion_length": 726.2421875,
      "epoch": 0.03251088534107402,
      "grad_norm": 0.14527394620380304,
      "kl": 0.00072479248046875,
      "learning_rate": 9.999858445152839e-07,
      "loss": 0.0,
      "reward": 0.033203125,
      "reward_std": 0.053198445588350296,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.001953125,
      "step": 28
    },
    {
      "completion_length": 646.53125,
      "epoch": 0.03367198838896952,
      "grad_norm": 0.4038815325919152,
      "kl": 0.008038520812988281,
      "learning_rate": 9.999681503472433e-07,
      "loss": 0.0003,
      "reward": 0.02734375,
      "reward_std": 0.043573048897087574,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.01171875,
      "step": 29
    },
    {
      "completion_length": 676.234375,
      "epoch": 0.03483309143686502,
      "grad_norm": 24.530460542475996,
      "kl": 0.08559513092041016,
      "learning_rate": 9.99943378862646e-07,
      "loss": 0.0034,
      "reward": 0.076171875,
      "reward_std": 0.1358339935541153,
      "rewards/correctness_reward_func": 0.0625,
      "rewards/strict_format_reward_func": 0.013671875,
      "step": 30
    },
    {
      "completion_length": 669.62890625,
      "epoch": 0.035994194484760524,
      "grad_norm": 12.948432597412872,
      "kl": 0.0016450881958007812,
      "learning_rate": 9.999115304121457e-07,
      "loss": 0.0001,
      "reward": 0.033203125,
      "reward_std": 0.06640625,
      "rewards/correctness_reward_func": 0.0234375,
      "rewards/strict_format_reward_func": 0.009765625,
      "step": 31
    },
    {
      "completion_length": 632.80078125,
      "epoch": 0.03715529753265602,
      "grad_norm": 0.14738843577567481,
      "kl": 0.0011644363403320312,
      "learning_rate": 9.998726054465744e-07,
      "loss": 0.0,
      "reward": 0.0546875,
      "reward_std": 0.106073047965765,
      "rewards/correctness_reward_func": 0.0390625,
      "rewards/strict_format_reward_func": 0.015625,
      "step": 32
    },
    {
      "completion_length": 686.90234375,
      "epoch": 0.038316400580551524,
      "grad_norm": 6.995419155886516,
      "kl": 0.056095123291015625,
      "learning_rate": 9.998266045169354e-07,
      "loss": 0.0022,
      "reward": 0.052734375,
      "reward_std": 0.0922609455883503,
      "rewards/correctness_reward_func": 0.0390625,
      "rewards/strict_format_reward_func": 0.013671875,
      "step": 33
    },
    {
      "completion_length": 655.90234375,
      "epoch": 0.03947750362844703,
      "grad_norm": 3.5735767009613224,
      "kl": 0.013265609741210938,
      "learning_rate": 9.997735282743968e-07,
      "loss": 0.0005,
      "reward": 0.076171875,
      "reward_std": 0.13439351692795753,
      "rewards/correctness_reward_func": 0.0625,
      "rewards/strict_format_reward_func": 0.013671875,
      "step": 34
    },
    {
      "completion_length": 639.82421875,
      "epoch": 0.040638606676342524,
      "grad_norm": 1.8475352137430245,
      "kl": 0.0025959014892578125,
      "learning_rate": 9.997133774702812e-07,
      "loss": 0.0001,
      "reward": 0.05078125,
      "reward_std": 0.09682007133960724,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.01953125,
      "step": 35
    },
    {
      "completion_length": 715.3125,
      "epoch": 0.04179970972423803,
      "grad_norm": 13.632018237363075,
      "kl": 0.21777725219726562,
      "learning_rate": 9.996461529560552e-07,
      "loss": 0.0087,
      "reward": 0.0390625,
      "reward_std": 0.078125,
      "rewards/correctness_reward_func": 0.0234375,
      "rewards/strict_format_reward_func": 0.015625,
      "step": 36
    },
    {
      "completion_length": 631.8203125,
      "epoch": 0.042960812772133525,
      "grad_norm": 10.015816840984483,
      "kl": 0.00942230224609375,
      "learning_rate": 9.995718556833178e-07,
      "loss": 0.0004,
      "reward": 0.072265625,
      "reward_std": 0.141229297965765,
      "rewards/correctness_reward_func": 0.046875,
      "rewards/strict_format_reward_func": 0.025390625,
      "step": 37
    },
    {
      "completion_length": 636.9140625,
      "epoch": 0.04412191582002903,
      "grad_norm": 0.18443450708075712,
      "kl": 0.001720428466796875,
      "learning_rate": 9.994904867037865e-07,
      "loss": 0.0001,
      "reward": 0.033203125,
      "reward_std": 0.046594543382525444,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.017578125,
      "step": 38
    },
    {
      "completion_length": 667.6484375,
      "epoch": 0.045283018867924525,
      "grad_norm": 0.7286107675191874,
      "kl": 0.027004241943359375,
      "learning_rate": 9.994020471692832e-07,
      "loss": 0.0011,
      "reward": 0.0625,
      "reward_std": 0.10044586285948753,
      "rewards/correctness_reward_func": 0.0390625,
      "rewards/strict_format_reward_func": 0.0234375,
      "step": 39
    },
    {
      "completion_length": 695.68359375,
      "epoch": 0.04644412191582003,
      "grad_norm": 0.7697487269128109,
      "kl": 0.002681732177734375,
      "learning_rate": 9.993065383317162e-07,
      "loss": 0.0001,
      "reward": 0.078125,
      "reward_std": 0.12407248839735985,
      "rewards/correctness_reward_func": 0.0546875,
      "rewards/strict_format_reward_func": 0.0234375,
      "step": 40
    },
    {
      "completion_length": 678.390625,
      "epoch": 0.04760522496371553,
      "grad_norm": 0.33385332935798095,
      "kl": 0.0028896331787109375,
      "learning_rate": 9.992039615430648e-07,
      "loss": 0.0001,
      "reward": 0.06640625,
      "reward_std": 0.1137621309608221,
      "rewards/correctness_reward_func": 0.046875,
      "rewards/strict_format_reward_func": 0.01953125,
      "step": 41
    },
    {
      "completion_length": 660.2421875,
      "epoch": 0.04876632801161103,
      "grad_norm": 5.2234598338711615,
      "kl": 0.03473663330078125,
      "learning_rate": 9.990943182553578e-07,
      "loss": 0.0014,
      "reward": 0.05859375,
      "reward_std": 0.11058359593153,
      "rewards/correctness_reward_func": 0.0234375,
      "rewards/strict_format_reward_func": 0.03515625,
      "step": 42
    },
    {
      "completion_length": 599.66015625,
      "epoch": 0.04992743105950653,
      "grad_norm": 0.3125631745813204,
      "kl": 0.0029144287109375,
      "learning_rate": 9.989776100206547e-07,
      "loss": 0.0001,
      "reward": 0.087890625,
      "reward_std": 0.1592714935541153,
      "rewards/correctness_reward_func": 0.0546875,
      "rewards/strict_format_reward_func": 0.033203125,
      "step": 43
    },
    {
      "completion_length": 695.41015625,
      "epoch": 0.05108853410740203,
      "grad_norm": 0.31129440427202076,
      "kl": 0.002468109130859375,
      "learning_rate": 9.98853838491023e-07,
      "loss": 0.0001,
      "reward": 0.046875,
      "reward_std": 0.08240367192775011,
      "rewards/correctness_reward_func": 0.0234375,
      "rewards/strict_format_reward_func": 0.0234375,
      "step": 44
    },
    {
      "completion_length": 594.67578125,
      "epoch": 0.05224963715529753,
      "grad_norm": 0.2962376329942141,
      "kl": 0.00382232666015625,
      "learning_rate": 9.98723005418515e-07,
      "loss": 0.0002,
      "reward": 0.091796875,
      "reward_std": 0.1253724191337824,
      "rewards/correctness_reward_func": 0.046875,
      "rewards/strict_format_reward_func": 0.044921875,
      "step": 45
    },
    {
      "completion_length": 672.73046875,
      "epoch": 0.05341074020319303,
      "grad_norm": 3.778170631942383,
      "kl": 0.077880859375,
      "learning_rate": 9.985851126551428e-07,
      "loss": 0.0031,
      "reward": 0.103515625,
      "reward_std": 0.1781556848436594,
      "rewards/correctness_reward_func": 0.0625,
      "rewards/strict_format_reward_func": 0.041015625,
      "step": 46
    },
    {
      "completion_length": 603.15625,
      "epoch": 0.054571843251088534,
      "grad_norm": 0.3069672598383264,
      "kl": 0.006290435791015625,
      "learning_rate": 9.98440162152852e-07,
      "loss": 0.0003,
      "reward": 0.08984375,
      "reward_std": 0.1551719233393669,
      "rewards/correctness_reward_func": 0.046875,
      "rewards/strict_format_reward_func": 0.04296875,
      "step": 47
    },
    {
      "completion_length": 632.2890625,
      "epoch": 0.05573294629898404,
      "grad_norm": 1.2667105724676624,
      "kl": 0.029102325439453125,
      "learning_rate": 9.982881559634946e-07,
      "loss": 0.0012,
      "reward": 0.08203125,
      "reward_std": 0.14755274169147015,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.05078125,
      "step": 48
    },
    {
      "completion_length": 571.69921875,
      "epoch": 0.056894049346879534,
      "grad_norm": 51.00112062221978,
      "kl": 0.9501419067382812,
      "learning_rate": 9.981290962387997e-07,
      "loss": 0.0379,
      "reward": 0.07421875,
      "reward_std": 0.12532384134829044,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.05859375,
      "step": 49
    },
    {
      "completion_length": 595.40234375,
      "epoch": 0.05805515239477504,
      "grad_norm": 1.04581231978298,
      "kl": 0.01190185546875,
      "learning_rate": 9.979629852303425e-07,
      "loss": 0.0005,
      "reward": 0.11328125,
      "reward_std": 0.18154333159327507,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.08203125,
      "step": 50
    },
    {
      "completion_length": 561.47265625,
      "epoch": 0.059216255442670535,
      "grad_norm": 0.7109293653864751,
      "kl": 0.0103302001953125,
      "learning_rate": 9.977898252895134e-07,
      "loss": 0.0004,
      "reward": 0.13671875,
      "reward_std": 0.2260015867650509,
      "rewards/correctness_reward_func": 0.0390625,
      "rewards/strict_format_reward_func": 0.09765625,
      "step": 51
    },
    {
      "completion_length": 589.95703125,
      "epoch": 0.06037735849056604,
      "grad_norm": 2.9062465124154735,
      "kl": 0.00753021240234375,
      "learning_rate": 9.976096188674836e-07,
      "loss": 0.0003,
      "reward": 0.12890625,
      "reward_std": 0.19242635369300842,
      "rewards/correctness_reward_func": 0.0234375,
      "rewards/strict_format_reward_func": 0.10546875,
      "step": 52
    },
    {
      "completion_length": 537.0234375,
      "epoch": 0.06153846153846154,
      "grad_norm": 22.409980149466673,
      "kl": 0.2718505859375,
      "learning_rate": 9.974223685151718e-07,
      "loss": 0.0109,
      "reward": 0.12890625,
      "reward_std": 0.1836371347308159,
      "rewards/correctness_reward_func": 0.0234375,
      "rewards/strict_format_reward_func": 0.10546875,
      "step": 53
    },
    {
      "completion_length": 531.80859375,
      "epoch": 0.06269956458635705,
      "grad_norm": 2.8826743442454297,
      "kl": 0.122955322265625,
      "learning_rate": 9.972280768832067e-07,
      "loss": 0.0049,
      "reward": 0.220703125,
      "reward_std": 0.3008614853024483,
      "rewards/correctness_reward_func": 0.09375,
      "rewards/strict_format_reward_func": 0.126953125,
      "step": 54
    },
    {
      "completion_length": 611.5,
      "epoch": 0.06386066763425254,
      "grad_norm": 2.352779269743852,
      "kl": 0.02130889892578125,
      "learning_rate": 9.970267467218902e-07,
      "loss": 0.0009,
      "reward": 0.232421875,
      "reward_std": 0.2863368093967438,
      "rewards/correctness_reward_func": 0.0546875,
      "rewards/strict_format_reward_func": 0.177734375,
      "step": 55
    },
    {
      "completion_length": 555.91796875,
      "epoch": 0.06502177068214804,
      "grad_norm": 0.9680034082980018,
      "kl": 0.01513671875,
      "learning_rate": 9.968183808811585e-07,
      "loss": 0.0006,
      "reward": 0.24609375,
      "reward_std": 0.3278784677386284,
      "rewards/correctness_reward_func": 0.0703125,
      "rewards/strict_format_reward_func": 0.17578125,
      "step": 56
    },
    {
      "completion_length": 561.3203125,
      "epoch": 0.06618287373004354,
      "grad_norm": 5.155662797346911,
      "kl": 0.1178741455078125,
      "learning_rate": 9.966029823105415e-07,
      "loss": 0.0047,
      "reward": 0.291015625,
      "reward_std": 0.3391275480389595,
      "rewards/correctness_reward_func": 0.09375,
      "rewards/strict_format_reward_func": 0.197265625,
      "step": 57
    },
    {
      "completion_length": 574.9453125,
      "epoch": 0.06734397677793905,
      "grad_norm": 3.5240176122453306,
      "kl": 0.0246734619140625,
      "learning_rate": 9.96380554059121e-07,
      "loss": 0.001,
      "reward": 0.291015625,
      "reward_std": 0.31832827627658844,
      "rewards/correctness_reward_func": 0.078125,
      "rewards/strict_format_reward_func": 0.212890625,
      "step": 58
    },
    {
      "completion_length": 583.93359375,
      "epoch": 0.06850507982583454,
      "grad_norm": 3.558262716521494,
      "kl": 0.0449981689453125,
      "learning_rate": 9.961510992754882e-07,
      "loss": 0.0018,
      "reward": 0.28515625,
      "reward_std": 0.26169369369745255,
      "rewards/correctness_reward_func": 0.0703125,
      "rewards/strict_format_reward_func": 0.21484375,
      "step": 59
    },
    {
      "completion_length": 498.65625,
      "epoch": 0.06966618287373004,
      "grad_norm": 2.811935806633887,
      "kl": 0.0194091796875,
      "learning_rate": 9.959146212076978e-07,
      "loss": 0.0008,
      "reward": 0.33984375,
      "reward_std": 0.314444862306118,
      "rewards/correctness_reward_func": 0.0625,
      "rewards/strict_format_reward_func": 0.27734375,
      "step": 60
    },
    {
      "completion_length": 478.53125,
      "epoch": 0.07082728592162554,
      "grad_norm": 4.984861468141312,
      "kl": 0.107757568359375,
      "learning_rate": 9.95671123203224e-07,
      "loss": 0.0043,
      "reward": 0.361328125,
      "reward_std": 0.33764002099633217,
      "rewards/correctness_reward_func": 0.09375,
      "rewards/strict_format_reward_func": 0.267578125,
      "step": 61
    },
    {
      "completion_length": 512.39453125,
      "epoch": 0.07198838896952105,
      "grad_norm": 1.007778929339388,
      "kl": 0.01971435546875,
      "learning_rate": 9.954206087089105e-07,
      "loss": 0.0008,
      "reward": 0.3203125,
      "reward_std": 0.2790137939155102,
      "rewards/correctness_reward_func": 0.046875,
      "rewards/strict_format_reward_func": 0.2734375,
      "step": 62
    },
    {
      "completion_length": 516.03515625,
      "epoch": 0.07314949201741655,
      "grad_norm": 0.5960945531101304,
      "kl": 0.0254974365234375,
      "learning_rate": 9.951630812709244e-07,
      "loss": 0.001,
      "reward": 0.373046875,
      "reward_std": 0.3191210627555847,
      "rewards/correctness_reward_func": 0.09375,
      "rewards/strict_format_reward_func": 0.279296875,
      "step": 63
    },
    {
      "completion_length": 478.65625,
      "epoch": 0.07431059506531204,
      "grad_norm": 3.3120058314917094,
      "kl": 0.027435302734375,
      "learning_rate": 9.948985445347044e-07,
      "loss": 0.0011,
      "reward": 0.396484375,
      "reward_std": 0.30096181109547615,
      "rewards/correctness_reward_func": 0.0703125,
      "rewards/strict_format_reward_func": 0.326171875,
      "step": 64
    },
    {
      "completion_length": 453.5390625,
      "epoch": 0.07547169811320754,
      "grad_norm": 10.565800054920043,
      "kl": 0.06463623046875,
      "learning_rate": 9.946270022449093e-07,
      "loss": 0.0026,
      "reward": 0.40234375,
      "reward_std": 0.30288316309452057,
      "rewards/correctness_reward_func": 0.0703125,
      "rewards/strict_format_reward_func": 0.33203125,
      "step": 65
    },
    {
      "completion_length": 467.4609375,
      "epoch": 0.07663280116110305,
      "grad_norm": 8.848702328863897,
      "kl": 0.026763916015625,
      "learning_rate": 9.94348458245365e-07,
      "loss": 0.0011,
      "reward": 0.484375,
      "reward_std": 0.3539407253265381,
      "rewards/correctness_reward_func": 0.1640625,
      "rewards/strict_format_reward_func": 0.3203125,
      "step": 66
    },
    {
      "completion_length": 460.51171875,
      "epoch": 0.07779390420899855,
      "grad_norm": 0.8276997968349737,
      "kl": 0.028564453125,
      "learning_rate": 9.940629164790118e-07,
      "loss": 0.0011,
      "reward": 0.390625,
      "reward_std": 0.2685689851641655,
      "rewards/correctness_reward_func": 0.0546875,
      "rewards/strict_format_reward_func": 0.3359375,
      "step": 67
    },
    {
      "completion_length": 432.30078125,
      "epoch": 0.07895500725689406,
      "grad_norm": 1.6501741381150759,
      "kl": 0.068115234375,
      "learning_rate": 9.937703809878454e-07,
      "loss": 0.0027,
      "reward": 0.447265625,
      "reward_std": 0.32472314685583115,
      "rewards/correctness_reward_func": 0.109375,
      "rewards/strict_format_reward_func": 0.337890625,
      "step": 68
    },
    {
      "completion_length": 433.5625,
      "epoch": 0.08011611030478955,
      "grad_norm": 5.548040313902584,
      "kl": 0.0538330078125,
      "learning_rate": 9.934708559128622e-07,
      "loss": 0.0022,
      "reward": 0.48046875,
      "reward_std": 0.3404357209801674,
      "rewards/correctness_reward_func": 0.1171875,
      "rewards/strict_format_reward_func": 0.36328125,
      "step": 69
    },
    {
      "completion_length": 411.9921875,
      "epoch": 0.08127721335268505,
      "grad_norm": 4.54760002369526,
      "kl": 0.0399169921875,
      "learning_rate": 9.931643454939998e-07,
      "loss": 0.0016,
      "reward": 0.505859375,
      "reward_std": 0.3411124534904957,
      "rewards/correctness_reward_func": 0.1484375,
      "rewards/strict_format_reward_func": 0.357421875,
      "step": 70
    },
    {
      "completion_length": 374.98828125,
      "epoch": 0.08243831640058055,
      "grad_norm": 0.7476462962662087,
      "kl": 0.06536865234375,
      "learning_rate": 9.928508540700772e-07,
      "loss": 0.0026,
      "reward": 0.458984375,
      "reward_std": 0.2434551902115345,
      "rewards/correctness_reward_func": 0.09375,
      "rewards/strict_format_reward_func": 0.365234375,
      "step": 71
    },
    {
      "completion_length": 371.80859375,
      "epoch": 0.08359941944847606,
      "grad_norm": 0.31697483250235303,
      "kl": 0.04022216796875,
      "learning_rate": 9.925303860787333e-07,
      "loss": 0.0016,
      "reward": 0.427734375,
      "reward_std": 0.24097520112991333,
      "rewards/correctness_reward_func": 0.0546875,
      "rewards/strict_format_reward_func": 0.373046875,
      "step": 72
    },
    {
      "completion_length": 408.953125,
      "epoch": 0.08476052249637156,
      "grad_norm": 2.342541131973969,
      "kl": 0.0394287109375,
      "learning_rate": 9.92202946056364e-07,
      "loss": 0.0016,
      "reward": 0.447265625,
      "reward_std": 0.2658774182200432,
      "rewards/correctness_reward_func": 0.0625,
      "rewards/strict_format_reward_func": 0.384765625,
      "step": 73
    },
    {
      "completion_length": 367.03125,
      "epoch": 0.08592162554426705,
      "grad_norm": 0.457004333832563,
      "kl": 0.05194091796875,
      "learning_rate": 9.918685386380572e-07,
      "loss": 0.0021,
      "reward": 0.478515625,
      "reward_std": 0.22598736733198166,
      "rewards/correctness_reward_func": 0.078125,
      "rewards/strict_format_reward_func": 0.400390625,
      "step": 74
    },
    {
      "completion_length": 423.35546875,
      "epoch": 0.08708272859216255,
      "grad_norm": 81.83417853597544,
      "kl": 0.247314453125,
      "learning_rate": 9.915271685575296e-07,
      "loss": 0.0099,
      "reward": 0.48828125,
      "reward_std": 0.3054451234638691,
      "rewards/correctness_reward_func": 0.1015625,
      "rewards/strict_format_reward_func": 0.38671875,
      "step": 75
    },
    {
      "completion_length": 312.484375,
      "epoch": 0.08824383164005806,
      "grad_norm": 1.780578379981416,
      "kl": 0.0880126953125,
      "learning_rate": 9.911788406470568e-07,
      "loss": 0.0035,
      "reward": 0.548828125,
      "reward_std": 0.32934778556227684,
      "rewards/correctness_reward_func": 0.125,
      "rewards/strict_format_reward_func": 0.423828125,
      "step": 76
    },
    {
      "completion_length": 338.2421875,
      "epoch": 0.08940493468795356,
      "grad_norm": 0.6544200585420813,
      "kl": 0.0731201171875,
      "learning_rate": 9.908235598374068e-07,
      "loss": 0.0029,
      "reward": 0.45703125,
      "reward_std": 0.1958453767001629,
      "rewards/correctness_reward_func": 0.0390625,
      "rewards/strict_format_reward_func": 0.41796875,
      "step": 77
    },
    {
      "completion_length": 268.171875,
      "epoch": 0.09056603773584905,
      "grad_norm": 0.5879416696261314,
      "kl": 0.108642578125,
      "learning_rate": 9.904613311577695e-07,
      "loss": 0.0044,
      "reward": 0.556640625,
      "reward_std": 0.22046153992414474,
      "rewards/correctness_reward_func": 0.140625,
      "rewards/strict_format_reward_func": 0.416015625,
      "step": 78
    },
    {
      "completion_length": 231.109375,
      "epoch": 0.09172714078374455,
      "grad_norm": 0.8976017470783989,
      "kl": 0.18408203125,
      "learning_rate": 9.900921597356855e-07,
      "loss": 0.0074,
      "reward": 0.515625,
      "reward_std": 0.20618988201022148,
      "rewards/correctness_reward_func": 0.0859375,
      "rewards/strict_format_reward_func": 0.4296875,
      "step": 79
    },
    {
      "completion_length": 196.59765625,
      "epoch": 0.09288824383164006,
      "grad_norm": 17.77907159035059,
      "kl": 0.263916015625,
      "learning_rate": 9.897160507969735e-07,
      "loss": 0.0106,
      "reward": 0.47265625,
      "reward_std": 0.19209052622318268,
      "rewards/correctness_reward_func": 0.0546875,
      "rewards/strict_format_reward_func": 0.41796875,
      "step": 80
    },
    {
      "completion_length": 110.4609375,
      "epoch": 0.09404934687953556,
      "grad_norm": 1.3276411102732406,
      "kl": 0.3662109375,
      "learning_rate": 9.893330096656573e-07,
      "loss": 0.0147,
      "reward": 0.578125,
      "reward_std": 0.27833693847060204,
      "rewards/correctness_reward_func": 0.140625,
      "rewards/strict_format_reward_func": 0.4375,
      "step": 81
    },
    {
      "completion_length": 77.88671875,
      "epoch": 0.09521044992743107,
      "grad_norm": 1.9379785150930775,
      "kl": 0.51953125,
      "learning_rate": 9.889430417638883e-07,
      "loss": 0.0208,
      "reward": 0.51171875,
      "reward_std": 0.1544804871082306,
      "rewards/correctness_reward_func": 0.0546875,
      "rewards/strict_format_reward_func": 0.45703125,
      "step": 82
    },
    {
      "completion_length": 73.9375,
      "epoch": 0.09637155297532655,
      "grad_norm": 0.8473473359273695,
      "kl": 0.48486328125,
      "learning_rate": 9.885461526118713e-07,
      "loss": 0.0194,
      "reward": 0.50390625,
      "reward_std": 0.14182795584201813,
      "rewards/correctness_reward_func": 0.046875,
      "rewards/strict_format_reward_func": 0.45703125,
      "step": 83
    },
    {
      "completion_length": 56.2109375,
      "epoch": 0.09753265602322206,
      "grad_norm": 2.281085454498393,
      "kl": 0.49072265625,
      "learning_rate": 9.88142347827784e-07,
      "loss": 0.0196,
      "reward": 0.4765625,
      "reward_std": 0.12851098738610744,
      "rewards/correctness_reward_func": 0.0234375,
      "rewards/strict_format_reward_func": 0.453125,
      "step": 84
    },
    {
      "completion_length": 37.875,
      "epoch": 0.09869375907111756,
      "grad_norm": 0.5788019582854786,
      "kl": 0.5927734375,
      "learning_rate": 9.877316331276993e-07,
      "loss": 0.0237,
      "reward": 0.470703125,
      "reward_std": 0.055291797965765,
      "rewards/correctness_reward_func": 0.0,
      "rewards/strict_format_reward_func": 0.470703125,
      "step": 85
    },
    {
      "completion_length": 33.50390625,
      "epoch": 0.09985486211901307,
      "grad_norm": 6.104259815482374,
      "kl": 0.5439453125,
      "learning_rate": 9.873140143255034e-07,
      "loss": 0.0217,
      "reward": 0.517578125,
      "reward_std": 0.07212539482861757,
      "rewards/correctness_reward_func": 0.046875,
      "rewards/strict_format_reward_func": 0.470703125,
      "step": 86
    },
    {
      "completion_length": 20.76171875,
      "epoch": 0.10101596516690857,
      "grad_norm": 0.507931311814518,
      "kl": 0.5947265625,
      "learning_rate": 9.86889497332814e-07,
      "loss": 0.0238,
      "reward": 0.486328125,
      "reward_std": 0.02734375,
      "rewards/correctness_reward_func": 0.0,
      "rewards/strict_format_reward_func": 0.486328125,
      "step": 87
    },
    {
      "completion_length": 16.58203125,
      "epoch": 0.10217706821480406,
      "grad_norm": 0.45325431756004353,
      "kl": 0.6630859375,
      "learning_rate": 9.864580881588958e-07,
      "loss": 0.0265,
      "reward": 0.501953125,
      "reward_std": 0.024041798897087574,
      "rewards/correctness_reward_func": 0.0078125,
      "rewards/strict_format_reward_func": 0.494140625,
      "step": 88
    },
    {
      "completion_length": 16.12109375,
      "epoch": 0.10333817126269956,
      "grad_norm": 0.38014072113379394,
      "kl": 0.6513671875,
      "learning_rate": 9.860197929105767e-07,
      "loss": 0.0261,
      "reward": 0.49609375,
      "reward_std": 0.0078125,
      "rewards/correctness_reward_func": 0.0,
      "rewards/strict_format_reward_func": 0.49609375,
      "step": 89
    },
    {
      "completion_length": 19.7265625,
      "epoch": 0.10449927431059507,
      "grad_norm": 3.246980620297076,
      "kl": 0.6064453125,
      "learning_rate": 9.8557461779216e-07,
      "loss": 0.0243,
      "reward": 0.48828125,
      "reward_std": 0.0234375,
      "rewards/correctness_reward_func": 0.0,
      "rewards/strict_format_reward_func": 0.48828125,
      "step": 90
    },
    {
      "completion_length": 15.8359375,
      "epoch": 0.10566037735849057,
      "grad_norm": 1.125767741710113,
      "kl": 0.6162109375,
      "learning_rate": 9.85122569105338e-07,
      "loss": 0.0247,
      "reward": 0.49609375,
      "reward_std": 0.0078125,
      "rewards/correctness_reward_func": 0.0,
      "rewards/strict_format_reward_func": 0.49609375,
      "step": 91
    },
    {
      "completion_length": 22.44921875,
      "epoch": 0.10682148040638606,
      "grad_norm": 0.44874987843781083,
      "kl": 0.6083984375,
      "learning_rate": 9.846636532491012e-07,
      "loss": 0.0243,
      "reward": 0.48828125,
      "reward_std": 0.0234375,
      "rewards/correctness_reward_func": 0.0,
      "rewards/strict_format_reward_func": 0.48828125,
      "step": 92
    },
    {
      "completion_length": 16.21484375,
      "epoch": 0.10798258345428156,
      "grad_norm": 2.7244778647827363,
      "kl": 0.796875,
      "learning_rate": 9.841978767196493e-07,
      "loss": 0.0319,
      "reward": 0.494140625,
      "reward_std": 0.01171875,
      "rewards/correctness_reward_func": 0.0,
      "rewards/strict_format_reward_func": 0.494140625,
      "step": 93
    },
    {
      "completion_length": 18.08984375,
      "epoch": 0.10914368650217707,
      "grad_norm": 0.900717061581606,
      "kl": 0.5712890625,
      "learning_rate": 9.83725246110298e-07,
      "loss": 0.0229,
      "reward": 0.49609375,
      "reward_std": 0.0234375,
      "rewards/correctness_reward_func": 0.0078125,
      "rewards/strict_format_reward_func": 0.48828125,
      "step": 94
    },
    {
      "completion_length": 19.921875,
      "epoch": 0.11030478955007257,
      "grad_norm": 0.2922094758318377,
      "kl": 0.62109375,
      "learning_rate": 9.832457681113865e-07,
      "loss": 0.0249,
      "reward": 0.51953125,
      "reward_std": 0.0546875,
      "rewards/correctness_reward_func": 0.03125,
      "rewards/strict_format_reward_func": 0.48828125,
      "step": 95
    },
    {
      "completion_length": 22.48828125,
      "epoch": 0.11146589259796807,
      "grad_norm": 1.5719055280514225,
      "kl": 0.5966796875,
      "learning_rate": 9.827594495101822e-07,
      "loss": 0.0239,
      "reward": 0.53125,
      "reward_std": 0.049292195588350296,
      "rewards/correctness_reward_func": 0.0390625,
      "rewards/strict_format_reward_func": 0.4921875,
      "step": 96
    },
    {
      "completion_length": 21.078125,
      "epoch": 0.11262699564586356,
      "grad_norm": 1.875138112811585,
      "kl": 0.62890625,
      "learning_rate": 9.822662971907852e-07,
      "loss": 0.0252,
      "reward": 0.505859375,
      "reward_std": 0.047479298897087574,
      "rewards/correctness_reward_func": 0.015625,
      "rewards/strict_format_reward_func": 0.490234375,
      "step": 97
    },
    {
      "completion_length": 20.328125,
      "epoch": 0.11378809869375907,
      "grad_norm": 0.46440948422543626,
      "kl": 0.5703125,
      "learning_rate": 9.8176631813403e-07,
      "loss": 0.0228,
      "reward": 0.5078125,
      "reward_std": 0.015625,
      "rewards/correctness_reward_func": 0.0078125,
      "rewards/strict_format_reward_func": 0.5,
      "step": 98
    },
    {
      "completion_length": 17.6953125,
      "epoch": 0.11494920174165457,
      "grad_norm": 2.350666772323524,
      "kl": 0.6298828125,
      "learning_rate": 9.812595194173874e-07,
      "loss": 0.0252,
      "reward": 0.50390625,
      "reward_std": 0.0234375,
      "rewards/correctness_reward_func": 0.0078125,
      "rewards/strict_format_reward_func": 0.49609375,
      "step": 99
    },
    {
      "completion_length": 30.88671875,
      "epoch": 0.11611030478955008,
      "grad_norm": 0.6769899791454503,
      "kl": 0.537109375,
      "learning_rate": 9.807459082148648e-07,
      "loss": 0.0215,
      "reward": 0.56640625,
      "reward_std": 0.0804273895919323,
      "rewards/correctness_reward_func": 0.078125,
      "rewards/strict_format_reward_func": 0.48828125,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 861,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}