diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3103 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.12002133712660028, + "eval_steps": 500, + "global_step": 450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 8.246321295801964e-05, + "clip_ratio/high_mean": 8.246321295801964e-05, + "clip_ratio/low_mean": 0.0001607200845481, + "clip_ratio/low_min": 0.0001607200845481, + "clip_ratio/region_mean": 0.00024318329847624733, + "completions/clipped_ratio": 0.4322916666666667, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 406.7740936279297, + "completions/mean_terminated_length": 326.5364532470703, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 1.220555803510878, + "epoch": 0.0013335704125177809, + "frac_reward_zero_std": 0.1510416716337204, + "grad_norm": 0.2492305040359497, + "kl": 0.00039684898996104797, + "learning_rate": 2.857142857142857e-07, + "loss": 0.0468, + "num_tokens": 859381.0, + "reward": 0.2200520932674408, + "reward_std": 0.390093058347702, + "rewards/equation_reward_func/mean": 0.03450520895421505, + "rewards/equation_reward_func/std": 0.18242325633764267, + "rewards/format_reward_func/mean": 0.185546875, + "rewards/format_reward_func/std": 0.3889496922492981, + "sampling/importance_sampling_ratio/max": 1.8975598812103271, + "sampling/importance_sampling_ratio/mean": 1.0000792145729065, + "sampling/importance_sampling_ratio/min": 0.30868129432201385, + "sampling/sampling_logp_difference/max": 1.182242512702942, + "sampling/sampling_logp_difference/mean": 0.020579097792506218, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0001745158997234992, + "clip_ratio/high_mean": 0.0001745158997234992, + "clip_ratio/low_mean": 0.0003991564800445404, + "clip_ratio/low_min": 0.0003991564800445404, + "clip_ratio/region_mean": 0.0005736723749174012, + "completions/clipped_ratio": 0.42708333333333337, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 403.5670623779297, + "completions/mean_terminated_length": 322.7135009765625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 1.2127860360675389, + "epoch": 0.0026671408250355617, + "frac_reward_zero_std": 0.1302083358168602, + "grad_norm": 0.21184705197811127, + "kl": 0.0008533940883353353, + "learning_rate": 6.428571428571429e-07, + "loss": 0.0457, + "num_tokens": 1713884.0, + "reward": 0.2552083358168602, + "reward_std": 0.4136742502450943, + "rewards/equation_reward_func/mean": 0.041015625, + "rewards/equation_reward_func/std": 0.19729218631982803, + "rewards/format_reward_func/mean": 0.2141927033662796, + "rewards/format_reward_func/std": 0.4104663133621216, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000714659690857, + "sampling/importance_sampling_ratio/min": 0.3451089859008789, + "sampling/sampling_logp_difference/max": 1.1921609044075012, + "sampling/sampling_logp_difference/mean": 0.02058501821011305, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0005565897760485919, + "clip_ratio/high_mean": 0.0005565897760485919, + "clip_ratio/low_mean": 0.0012271921059841084, + "clip_ratio/low_min": 0.0012271921059841084, + "clip_ratio/region_mean": 0.0017837818796073812, + "completions/clipped_ratio": 0.35416666666666663, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 390.6380310058594, + "completions/mean_terminated_length": 324.38136291503906, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 1.2096752087275187, + "epoch": 0.004000711237553343, + "frac_reward_zero_std": 0.015625, + "grad_norm": 0.2598620057106018, + "kl": 0.004041151764492194, + "learning_rate": 1e-06, + "loss": 0.0432, + "num_tokens": 2548784.0, + "reward": 0.4368489682674408, + "reward_std": 0.520995706319809, + "rewards/equation_reward_func/mean": 0.037109375, + "rewards/equation_reward_func/std": 0.18914515525102615, + "rewards/format_reward_func/mean": 0.3997395932674408, + "rewards/format_reward_func/std": 0.47988975048065186, + "sampling/importance_sampling_ratio/max": 1.832794189453125, + "sampling/importance_sampling_ratio/mean": 1.0000334680080414, + "sampling/importance_sampling_ratio/min": 0.3897475004196167, + "sampling/sampling_logp_difference/max": 0.9497977793216705, + "sampling/sampling_logp_difference/mean": 0.020455473102629185, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0007313882763911452, + "clip_ratio/high_mean": 0.0007313882763911452, + "clip_ratio/low_mean": 0.0007843806812565567, + "clip_ratio/low_min": 0.0007843806812565567, + "clip_ratio/region_mean": 0.001515768954737319, + "completions/clipped_ratio": 0.19921875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 344.3255310058594, + "completions/mean_terminated_length": 302.98789978027344, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 1.1884036090638903, + "epoch": 0.005334281650071123, + "frac_reward_zero_std": 0.031250000931322575, + "grad_norm": 0.2656697928905487, + "kl": 0.010231439417435064, + "learning_rate": 9.996755410126814e-07, + "loss": 0.068, + "num_tokens": 3311636.0, + "reward": 0.759765625, + "reward_std": 0.5190623104572296, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2539859116077423, + "rewards/format_reward_func/mean": 0.689453125, + "rewards/format_reward_func/std": 0.4570586085319519, + "sampling/importance_sampling_ratio/max": 1.9918479919433594, + "sampling/importance_sampling_ratio/mean": 0.9999858736991882, + "sampling/importance_sampling_ratio/min": 0.17882930487394333, + "sampling/sampling_logp_difference/max": 1.729017198085785, + "sampling/sampling_logp_difference/mean": 0.02019453700631857, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0008292854373899496, + "clip_ratio/high_mean": 0.0008292854373899496, + "clip_ratio/low_mean": 0.0005807053695510452, + "clip_ratio/low_min": 0.0005807053695510452, + "clip_ratio/region_mean": 0.0014099908009585407, + "completions/clipped_ratio": 0.095703125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.5, + "completions/mean_length": 294.95703125, + "completions/mean_terminated_length": 272.17376708984375, + "completions/min_length": 80.5, + "completions/min_terminated_length": 80.5, + "entropy": 1.1066096756193373, + "epoch": 0.006667852062588905, + "frac_reward_zero_std": 0.2135416716337204, + "grad_norm": 0.30067694187164307, + "kl": 0.017605406356354554, + "learning_rate": 9.987025851452636e-07, + "loss": 0.0595, + "num_tokens": 3999098.0, + "reward": 0.953125, + "reward_std": 0.35241255164146423, + "rewards/equation_reward_func/mean": 0.064453125, + "rewards/equation_reward_func/std": 0.24568618088960648, + "rewards/format_reward_func/mean": 0.888671875, + "rewards/format_reward_func/std": 0.3112259954214096, + "sampling/importance_sampling_ratio/max": 1.7943660616874695, + "sampling/importance_sampling_ratio/mean": 1.0000225603580475, + "sampling/importance_sampling_ratio/min": 0.376553475856781, + "sampling/sampling_logp_difference/max": 0.9779322147369385, + "sampling/sampling_logp_difference/mean": 0.01951777935028076, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0006231176602240237, + "clip_ratio/high_mean": 0.0006231176602240237, + "clip_ratio/low_mean": 0.0006414349638766403, + "clip_ratio/low_min": 0.0006414349638766403, + "clip_ratio/region_mean": 0.001264552614884451, + "completions/clipped_ratio": 0.02994791666666663, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.5, + "completions/mean_length": 246.296875, + "completions/mean_terminated_length": 238.16598510742188, + "completions/min_length": 82.5, + "completions/min_terminated_length": 82.5, + "entropy": 1.0370207269986471, + "epoch": 0.008001422475106686, + "frac_reward_zero_std": 0.4635416865348816, + "grad_norm": 0.22506149113178253, + "kl": 0.023923315438959335, + "learning_rate": 9.970823951348486e-07, + "loss": 0.0409, + "num_tokens": 4611506.0, + "reward": 1.0403646230697632, + "reward_std": 0.22080308943986893, + "rewards/equation_reward_func/mean": 0.07291666604578495, + "rewards/equation_reward_func/std": 0.2589513882994652, + "rewards/format_reward_func/mean": 0.9674479067325592, + "rewards/format_reward_func/std": 0.1769649013876915, + "sampling/importance_sampling_ratio/max": 1.9244984984397888, + "sampling/importance_sampling_ratio/mean": 0.9998869001865387, + "sampling/importance_sampling_ratio/min": 0.20804276317358017, + "sampling/sampling_logp_difference/max": 1.5718563795089722, + "sampling/sampling_logp_difference/mean": 0.01897315215319395, + "step": 30 + }, + { + "clip_ratio/high_max": 0.00027691970058690965, + "clip_ratio/high_mean": 0.00027691970058690965, + "clip_ratio/low_mean": 0.000560271245517975, + "clip_ratio/low_min": 0.000560271245517975, + "clip_ratio/region_mean": 0.0008371909491769556, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 201.0279998779297, + "completions/mean_terminated_length": 199.8256607055664, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.9397390564282735, + "epoch": 0.009334992887624467, + "frac_reward_zero_std": 0.5729166865348816, + "grad_norm": 0.22270287573337555, + "kl": 0.039403304706017175, + "learning_rate": 9.948170737222762e-07, + "loss": 0.0232, + "num_tokens": 5154565.0, + "reward": 1.0917969346046448, + "reward_std": 0.1866888925433159, + "rewards/equation_reward_func/mean": 0.1002604179084301, + "rewards/equation_reward_func/std": 0.30041730403900146, + "rewards/format_reward_func/mean": 0.9915364682674408, + "rewards/format_reward_func/std": 0.080092404037714, + "sampling/importance_sampling_ratio/max": 1.8699551224708557, + "sampling/importance_sampling_ratio/mean": 1.000175654888153, + "sampling/importance_sampling_ratio/min": 0.36644524335861206, + "sampling/sampling_logp_difference/max": 1.0192375183105469, + "sampling/sampling_logp_difference/mean": 0.018250481225550175, + "step": 35 + }, + { + "clip_ratio/high_max": 0.00014663741749245673, + "clip_ratio/high_mean": 0.00014663741749245673, + "clip_ratio/low_mean": 0.00064790764299687, + "clip_ratio/low_min": 0.00064790764299687, + "clip_ratio/region_mean": 0.0007945450572555678, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 178.76303100585938, + "completions/mean_terminated_length": 178.76303100585938, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.835021067990197, + "epoch": 0.010668563300142247, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.18460600078105927, + "kl": 0.05627463087439537, + "learning_rate": 9.919095609231123e-07, + "loss": 0.0099, + "num_tokens": 5408799.0, + "reward": 1.0911458730697632, + "reward_std": 0.15167692303657532, + "rewards/equation_reward_func/mean": 0.0911458358168602, + "rewards/equation_reward_func/std": 0.28800395131111145, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999470710754395, + "sampling/importance_sampling_ratio/min": 0.36619439721107483, + "sampling/sampling_logp_difference/max": 1.0045909881591797, + "sampling/sampling_logp_difference/mean": 0.017718330025672913, + "step": 40 + }, + { + "clip_ratio/high_max": 5.151105805351916e-05, + "clip_ratio/high_mean": 5.151105805351916e-05, + "clip_ratio/low_mean": 0.00028730963757779034, + "clip_ratio/low_min": 0.00028730963757779034, + "clip_ratio/region_mean": 0.0003388206953079336, + "completions/clipped_ratio": 0.0006510416666666852, + "completions/max_length": 472.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 178.38282012939453, + "completions/mean_terminated_length": 178.16500854492188, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.7936246474583943, + "epoch": 0.012002133712660028, + "frac_reward_zero_std": 0.5677083432674408, + "grad_norm": 0.2048202008008957, + "kl": 0.06592979356646538, + "learning_rate": 9.88363630211991e-07, + "loss": 0.0081, + "num_tokens": 5917251.0, + "reward": 1.09765625, + "reward_std": 0.18212821334600449, + "rewards/equation_reward_func/mean": 0.099609375, + "rewards/equation_reward_func/std": 0.29943445324897766, + "rewards/format_reward_func/mean": 0.998046875, + "rewards/format_reward_func/std": 0.04354107566177845, + "sampling/importance_sampling_ratio/max": 1.9906055927276611, + "sampling/importance_sampling_ratio/mean": 0.9997865557670593, + "sampling/importance_sampling_ratio/min": 0.44908395409584045, + "sampling/sampling_logp_difference/max": 1.0048612654209137, + "sampling/sampling_logp_difference/mean": 0.017231782898306847, + "step": 45 + }, + { + "clip_ratio/high_max": 0.00013182188785221014, + "clip_ratio/high_mean": 0.00013182188785221014, + "clip_ratio/low_mean": 0.0004938909819530737, + "clip_ratio/low_min": 0.0004938909819530737, + "clip_ratio/region_mean": 0.0006257128717455392, + "completions/clipped_ratio": 0.0013020833333333148, + "completions/max_length": 488.5, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 174.65755462646484, + "completions/mean_terminated_length": 174.21988677978516, + "completions/min_length": 76.5, + "completions/min_terminated_length": 76.5, + "entropy": 0.7289129985703362, + "epoch": 0.01333570412517781, + "frac_reward_zero_std": 0.4843750149011612, + "grad_norm": 0.23449285328388214, + "kl": 0.07896106764674186, + "learning_rate": 9.841838836252625e-07, + "loss": 0.0195, + "num_tokens": 6419477.0, + "reward": 1.142578125, + "reward_std": 0.22810514271259308, + "rewards/equation_reward_func/mean": 0.1458333358168602, + "rewards/equation_reward_func/std": 0.35293246805667877, + "rewards/format_reward_func/mean": 0.9967447817325592, + "rewards/format_reward_func/std": 0.056708112359046936, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999631345272064, + "sampling/importance_sampling_ratio/min": 0.3919448107481003, + "sampling/sampling_logp_difference/max": 1.066166639328003, + "sampling/sampling_logp_difference/mean": 0.016717251390218735, + "step": 50 + }, + { + "clip_ratio/high_max": 0.00028201885159230897, + "clip_ratio/high_mean": 0.00028201885159230897, + "clip_ratio/low_mean": 0.00038137449979937325, + "clip_ratio/low_min": 0.00038137449979937325, + "clip_ratio/region_mean": 0.0006633933484812991, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.5, + "completions/max_terminated_length": 429.5, + "completions/mean_length": 166.056640625, + "completions/mean_terminated_length": 166.056640625, + "completions/min_length": 67.5, + "completions/min_terminated_length": 67.5, + "entropy": 0.6839104798105028, + "epoch": 0.01466927453769559, + "frac_reward_zero_std": 0.5833333730697632, + "grad_norm": 0.27615708112716675, + "kl": 0.08441497402058708, + "learning_rate": 9.793757457883061e-07, + "loss": 0.0074, + "num_tokens": 6908932.0, + "reward": 1.1412761211395264, + "reward_std": 0.18976972252130508, + "rewards/equation_reward_func/mean": 0.1419270858168602, + "rewards/equation_reward_func/std": 0.3488829731941223, + "rewards/format_reward_func/mean": 0.9993489682674408, + "rewards/format_reward_func/std": 0.018042195588350296, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000046193599701, + "sampling/importance_sampling_ratio/min": 0.3581351041793823, + "sampling/sampling_logp_difference/max": 1.33011794090271, + "sampling/sampling_logp_difference/mean": 0.015749589540064335, + "step": 55 + }, + { + "clip_ratio/high_max": 0.00020543187339272764, + "clip_ratio/high_mean": 0.00020543187339272764, + "clip_ratio/low_mean": 0.0004257633095322591, + "clip_ratio/low_min": 0.0004257633095322591, + "clip_ratio/region_mean": 0.0006311951806613554, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.5, + "completions/max_terminated_length": 422.5, + "completions/mean_length": 156.6217498779297, + "completions/mean_terminated_length": 156.6217498779297, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.6679691672325134, + "epoch": 0.016002844950213372, + "frac_reward_zero_std": 0.520833358168602, + "grad_norm": 0.22341680526733398, + "kl": 0.10024641735686196, + "learning_rate": 9.739454568752555e-07, + "loss": 0.0117, + "num_tokens": 7383887.0, + "reward": 1.1484375596046448, + "reward_std": 0.2132711559534073, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.35439321398735046, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001911520957947, + "sampling/importance_sampling_ratio/min": 0.4817018210887909, + "sampling/sampling_logp_difference/max": 0.846564769744873, + "sampling/sampling_logp_difference/mean": 0.015754975378513336, + "step": 60 + }, + { + "clip_ratio/high_max": 0.00014940144287215338, + "clip_ratio/high_mean": 0.00014940144287215338, + "clip_ratio/low_mean": 0.0004307091339594788, + "clip_ratio/low_min": 0.0004307091339594788, + "clip_ratio/region_mean": 0.0005801105719809938, + "completions/clipped_ratio": 0.0006510416666666852, + "completions/max_length": 453.0, + "completions/max_terminated_length": 426.5, + "completions/mean_length": 155.27734375, + "completions/mean_terminated_length": 155.04510498046875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.6233775231573316, + "epoch": 0.01733641536273115, + "frac_reward_zero_std": 0.5104166865348816, + "grad_norm": 0.25647392868995667, + "kl": 0.10600344340006511, + "learning_rate": 9.67900064510277e-07, + "loss": 0.0084, + "num_tokens": 7856865.0, + "reward": 1.1516927480697632, + "reward_std": 0.22302059829235077, + "rewards/equation_reward_func/mean": 0.1529947891831398, + "rewards/equation_reward_func/std": 0.3602159321308136, + "rewards/format_reward_func/mean": 0.9986979365348816, + "rewards/format_reward_func/std": 0.03608439117670059, + "sampling/importance_sampling_ratio/max": 1.9309203624725342, + "sampling/importance_sampling_ratio/mean": 0.9999693632125854, + "sampling/importance_sampling_ratio/min": 0.4153195768594742, + "sampling/sampling_logp_difference/max": 0.8791846036911011, + "sampling/sampling_logp_difference/mean": 0.015379528980702162, + "step": 65 + }, + { + "clip_ratio/high_max": 0.00025970345886889846, + "clip_ratio/high_mean": 0.00025970345886889846, + "clip_ratio/low_mean": 0.0006601886904617358, + "clip_ratio/low_min": 0.0006601886904617358, + "clip_ratio/region_mean": 0.0009198921454501235, + "completions/clipped_ratio": 0.0006510416666666852, + "completions/max_length": 468.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 155.7721405029297, + "completions/mean_terminated_length": 155.54035186767578, + "completions/min_length": 73.5, + "completions/min_terminated_length": 73.5, + "entropy": 0.5942621601952447, + "epoch": 0.018669985775248935, + "frac_reward_zero_std": 0.5937500298023224, + "grad_norm": 0.2447100132703781, + "kl": 0.11355986578596963, + "learning_rate": 9.612474146209095e-07, + "loss": 0.0164, + "num_tokens": 8330091.0, + "reward": 1.1471354365348816, + "reward_std": 0.18823636323213577, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.3548363447189331, + "rewards/format_reward_func/mean": 0.9986979365348816, + "rewards/format_reward_func/std": 0.03608439117670059, + "sampling/importance_sampling_ratio/max": 1.752297282218933, + "sampling/importance_sampling_ratio/mean": 0.999990701675415, + "sampling/importance_sampling_ratio/min": 0.4324537664651871, + "sampling/sampling_logp_difference/max": 0.8426389098167419, + "sampling/sampling_logp_difference/mean": 0.015041533857584, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0001407271652068529, + "clip_ratio/high_mean": 0.0001407271652068529, + "clip_ratio/low_mean": 0.0005166083179776453, + "clip_ratio/low_min": 0.0005166083179776453, + "clip_ratio/region_mean": 0.0006573354848013776, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 156.1412811279297, + "completions/mean_terminated_length": 156.1412811279297, + "completions/min_length": 75.5, + "completions/min_terminated_length": 75.5, + "entropy": 0.5519994563526578, + "epoch": 0.020003556187766714, + "frac_reward_zero_std": 0.5260416865348816, + "grad_norm": 0.27088993787765503, + "kl": 0.12343557874361674, + "learning_rate": 9.539961412553374e-07, + "loss": 0.0088, + "num_tokens": 8804340.0, + "reward": 1.1705729365348816, + "reward_std": 0.2155926674604416, + "rewards/equation_reward_func/mean": 0.1705729141831398, + "rewards/equation_reward_func/std": 0.3762807846069336, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.941873550415039, + "sampling/importance_sampling_ratio/mean": 1.0001720190048218, + "sampling/importance_sampling_ratio/min": 0.4308120161294937, + "sampling/sampling_logp_difference/max": 0.8821389675140381, + "sampling/sampling_logp_difference/mean": 0.014702403452247381, + "step": 75 + }, + { + "clip_ratio/high_max": 0.00027237245586648996, + "clip_ratio/high_mean": 0.00027237245586648996, + "clip_ratio/low_mean": 0.0004707174133121346, + "clip_ratio/low_min": 0.0004707174133121346, + "clip_ratio/region_mean": 0.0007430898717656317, + "completions/clipped_ratio": 0.0026041666666666297, + "completions/max_length": 512.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 155.4244842529297, + "completions/mean_terminated_length": 154.49346923828125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.5304910063743591, + "epoch": 0.021337126600284494, + "frac_reward_zero_std": 0.5729166865348816, + "grad_norm": 0.2550235986709595, + "kl": 0.12983355240689384, + "learning_rate": 9.461556553768123e-07, + "loss": 0.0107, + "num_tokens": 9040818.0, + "reward": 1.1692708730697632, + "reward_std": 0.1947672814130783, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3775176405906677, + "rewards/format_reward_func/mean": 0.9973958134651184, + "rewards/format_reward_func/std": 0.05099776014685631, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001031160354614, + "sampling/importance_sampling_ratio/min": 0.49738985300064087, + "sampling/sampling_logp_difference/max": 0.8177399635314941, + "sampling/sampling_logp_difference/mean": 0.014317769557237625, + "step": 80 + }, + { + "clip_ratio/high_max": 0.00024086676744951142, + "clip_ratio/high_mean": 0.00024086676744951142, + "clip_ratio/low_mean": 0.00039953979139681903, + "clip_ratio/low_min": 0.00039953979139681903, + "clip_ratio/region_mean": 0.0006404065549658198, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.5, + "completions/max_terminated_length": 439.5, + "completions/mean_length": 153.61133575439453, + "completions/mean_terminated_length": 153.61133575439453, + "completions/min_length": 69.5, + "completions/min_terminated_length": 69.5, + "entropy": 0.5225983613067203, + "epoch": 0.022670697012802277, + "frac_reward_zero_std": 0.5468750149011612, + "grad_norm": 0.2504768967628479, + "kl": 0.12945938093794718, + "learning_rate": 9.377361326497673e-07, + "loss": 0.0096, + "num_tokens": 9511365.0, + "reward": 1.1777344346046448, + "reward_std": 0.21015208959579468, + "rewards/equation_reward_func/mean": 0.1783854216337204, + "rewards/equation_reward_func/std": 0.3818565607070923, + "rewards/format_reward_func/mean": 0.9993489682674408, + "rewards/format_reward_func/std": 0.018042195588350296, + "sampling/importance_sampling_ratio/max": 1.9827061891555786, + "sampling/importance_sampling_ratio/mean": 1.0000260472297668, + "sampling/importance_sampling_ratio/min": 0.2931542694568634, + "sampling/sampling_logp_difference/max": 1.2275865077972412, + "sampling/sampling_logp_difference/mean": 0.014332784339785576, + "step": 85 + }, + { + "clip_ratio/high_max": 0.00030845343135297296, + "clip_ratio/high_mean": 0.00030845343135297296, + "clip_ratio/low_mean": 0.0003940276859793812, + "clip_ratio/low_min": 0.0003940276859793812, + "clip_ratio/region_mean": 0.0007024811192726095, + "completions/clipped_ratio": 0.0006510416666666852, + "completions/max_length": 455.5, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 151.39258575439453, + "completions/mean_terminated_length": 151.15863800048828, + "completions/min_length": 79.5, + "completions/min_terminated_length": 79.5, + "entropy": 0.5152331637011633, + "epoch": 0.024004267425320056, + "frac_reward_zero_std": 0.453125, + "grad_norm": 0.24938349425792694, + "kl": 0.14415942314598296, + "learning_rate": 9.287485002334732e-07, + "loss": 0.0105, + "num_tokens": 9977992.0, + "reward": 1.2389323115348816, + "reward_std": 0.25067979097366333, + "rewards/equation_reward_func/mean": 0.2395833283662796, + "rewards/equation_reward_func/std": 0.4270091652870178, + "rewards/format_reward_func/mean": 0.9993489682674408, + "rewards/format_reward_func/std": 0.018042195588350296, + "sampling/importance_sampling_ratio/max": 1.9377199411392212, + "sampling/importance_sampling_ratio/mean": 1.0001115202903748, + "sampling/importance_sampling_ratio/min": 0.3048397898674011, + "sampling/sampling_logp_difference/max": 1.2129307985305786, + "sampling/sampling_logp_difference/mean": 0.014080267399549484, + "step": 90 + }, + { + "clip_ratio/high_max": 0.00033899308117623956, + "clip_ratio/high_mean": 0.00033899308117623956, + "clip_ratio/low_mean": 0.0004896878639960454, + "clip_ratio/low_min": 0.0004896878639960454, + "clip_ratio/region_mean": 0.000828680945495661, + "completions/clipped_ratio": 0.0013020833333333703, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 158.04622650146484, + "completions/mean_terminated_length": 157.58474731445312, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.5163041750590006, + "epoch": 0.02533783783783784, + "frac_reward_zero_std": 0.5468750149011612, + "grad_norm": 0.22510170936584473, + "kl": 0.1380500313308504, + "learning_rate": 9.192044226003788e-07, + "loss": 0.0117, + "num_tokens": 10455367.0, + "reward": 1.1953125596046448, + "reward_std": 0.21207982301712036, + "rewards/equation_reward_func/mean": 0.1966145858168602, + "rewards/equation_reward_func/std": 0.395397424697876, + "rewards/format_reward_func/mean": 0.9986979365348816, + "rewards/format_reward_func/std": 0.03608439117670059, + "sampling/importance_sampling_ratio/max": 1.8817728161811829, + "sampling/importance_sampling_ratio/mean": 0.9998800754547119, + "sampling/importance_sampling_ratio/min": 0.4095059782266617, + "sampling/sampling_logp_difference/max": 1.0675833225250244, + "sampling/sampling_logp_difference/mean": 0.013827220071107149, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0002426135088575797, + "clip_ratio/high_mean": 0.0002426135088575797, + "clip_ratio/low_mean": 0.0003667403675434697, + "clip_ratio/low_min": 0.0003667403675434697, + "clip_ratio/region_mean": 0.000609353871550411, + "completions/clipped_ratio": 0.0006510416666666852, + "completions/max_length": 453.0, + "completions/max_terminated_length": 446.5, + "completions/mean_length": 161.76953887939453, + "completions/mean_terminated_length": 161.54193878173828, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.5136602183183034, + "epoch": 0.02667140825035562, + "frac_reward_zero_std": 0.5416666865348816, + "grad_norm": 0.23747371137142181, + "kl": 0.14902293648984696, + "learning_rate": 9.091162863975388e-07, + "loss": 0.0058, + "num_tokens": 10938317.0, + "reward": 1.2154948115348816, + "reward_std": 0.21142307668924332, + "rewards/equation_reward_func/mean": 0.2161458358168602, + "rewards/equation_reward_func/std": 0.4118070900440216, + "rewards/format_reward_func/mean": 0.9993489682674408, + "rewards/format_reward_func/std": 0.018042195588350296, + "sampling/importance_sampling_ratio/max": 1.9759749174118042, + "sampling/importance_sampling_ratio/mean": 0.9999299943447113, + "sampling/importance_sampling_ratio/min": 0.39328936487436295, + "sampling/sampling_logp_difference/max": 1.0894970297813416, + "sampling/sampling_logp_difference/mean": 0.013999458402395248, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0003500140766846016, + "clip_ratio/high_mean": 0.0003500140766846016, + "clip_ratio/low_mean": 0.0005092643660544935, + "clip_ratio/low_min": 0.0005092643660544935, + "clip_ratio/region_mean": 0.0008592784450027264, + "completions/clipped_ratio": 0.0006510416666666852, + "completions/max_length": 484.0, + "completions/max_terminated_length": 441.5, + "completions/mean_length": 164.32227325439453, + "completions/mean_terminated_length": 164.0944061279297, + "completions/min_length": 77.5, + "completions/min_terminated_length": 77.5, + "entropy": 0.5040387398666806, + "epoch": 0.0280049786628734, + "frac_reward_zero_std": 0.4843750149011612, + "grad_norm": 0.2740892469882965, + "kl": 0.13950954443878597, + "learning_rate": 8.984971843707787e-07, + "loss": 0.0117, + "num_tokens": 11424860.0, + "reward": 1.2389323115348816, + "reward_std": 0.23391704261302948, + "rewards/equation_reward_func/mean": 0.240234375, + "rewards/equation_reward_func/std": 0.4267624020576477, + "rewards/format_reward_func/mean": 0.9986979067325592, + "rewards/format_reward_func/std": 0.025498881936073303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999798238277435, + "sampling/importance_sampling_ratio/min": 0.3396901339292526, + "sampling/sampling_logp_difference/max": 1.1812533736228943, + "sampling/sampling_logp_difference/mean": 0.01429820992052555, + "step": 105 + }, + { + "clip_ratio/high_max": 0.00027339755777373083, + "clip_ratio/high_mean": 0.00027339755777373083, + "clip_ratio/low_mean": 0.0005355036419738705, + "clip_ratio/low_min": 0.0005355036419738705, + "clip_ratio/region_mean": 0.0008089011958670906, + "completions/clipped_ratio": 0.002604166666666685, + "completions/max_length": 512.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 169.81446075439453, + "completions/mean_terminated_length": 168.92395782470703, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.48931105732917785, + "epoch": 0.02933854907539118, + "frac_reward_zero_std": 0.578125, + "grad_norm": 0.27575984597206116, + "kl": 0.1484737174378501, + "learning_rate": 8.873608983724579e-07, + "loss": 0.0173, + "num_tokens": 11919671.0, + "reward": 1.2278646230697632, + "reward_std": 0.18972523510456085, + "rewards/equation_reward_func/mean": 0.2304687574505806, + "rewards/equation_reward_func/std": 0.4191684126853943, + "rewards/format_reward_func/mean": 0.9973958432674408, + "rewards/format_reward_func/std": 0.04925142601132393, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999160468578339, + "sampling/importance_sampling_ratio/min": 0.2148948758840561, + "sampling/sampling_logp_difference/max": 1.5601493120193481, + "sampling/sampling_logp_difference/mean": 0.014369042590260506, + "step": 110 + }, + { + "clip_ratio/high_max": 0.00040300794805969215, + "clip_ratio/high_mean": 0.00040300794805969215, + "clip_ratio/low_mean": 0.0003567300795111805, + "clip_ratio/low_min": 0.0003567300795111805, + "clip_ratio/region_mean": 0.0007597380298345039, + "completions/clipped_ratio": 0.0013020833333333703, + "completions/max_length": 512.0, + "completions/max_terminated_length": 462.5, + "completions/mean_length": 169.3873748779297, + "completions/mean_terminated_length": 168.940673828125, + "completions/min_length": 80.5, + "completions/min_terminated_length": 80.5, + "entropy": 0.47813753684361776, + "epoch": 0.03067211948790896, + "frac_reward_zero_std": 0.5416666865348816, + "grad_norm": 0.2630886435508728, + "kl": 0.1524241354730394, + "learning_rate": 8.75721881474886e-07, + "loss": 0.0116, + "num_tokens": 12414266.0, + "reward": 1.2389323115348816, + "reward_std": 0.2096470445394516, + "rewards/equation_reward_func/mean": 0.2402343824505806, + "rewards/equation_reward_func/std": 0.4271441549062729, + "rewards/format_reward_func/mean": 0.9986979365348816, + "rewards/format_reward_func/std": 0.03608439117670059, + "sampling/importance_sampling_ratio/max": 1.891337275505066, + "sampling/importance_sampling_ratio/mean": 1.0000755190849304, + "sampling/importance_sampling_ratio/min": 0.42482201755046844, + "sampling/sampling_logp_difference/max": 0.8645262718200684, + "sampling/sampling_logp_difference/mean": 0.01372270192950964, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0004927822120306599, + "clip_ratio/high_mean": 0.0004927822120306599, + "clip_ratio/low_mean": 0.0007719357342769702, + "clip_ratio/low_min": 0.0007719357342769702, + "clip_ratio/region_mean": 0.0012647179375764811, + "completions/clipped_ratio": 0.0026041666666666297, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 174.0234375, + "completions/mean_terminated_length": 173.1409912109375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.45458437999089557, + "epoch": 0.032005689900426744, + "frac_reward_zero_std": 0.4791666865348816, + "grad_norm": 0.26756706833839417, + "kl": 0.1737965441412396, + "learning_rate": 8.635952392126071e-07, + "loss": 0.0122, + "num_tokens": 12665156.0, + "reward": 1.234375, + "reward_std": 0.2345321625471115, + "rewards/equation_reward_func/mean": 0.2369791716337204, + "rewards/equation_reward_func/std": 0.42550650238990784, + "rewards/format_reward_func/mean": 0.9973958134651184, + "rewards/format_reward_func/std": 0.05099776014685631, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999394416809082, + "sampling/importance_sampling_ratio/min": 0.5686211585998535, + "sampling/sampling_logp_difference/max": 0.7365884780883789, + "sampling/sampling_logp_difference/mean": 0.013807976618409157, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0004774133257645493, + "clip_ratio/high_mean": 0.0004774133257645493, + "clip_ratio/low_mean": 0.0005259135214146227, + "clip_ratio/low_min": 0.0005259135214146227, + "clip_ratio/region_mean": 0.001003326855910321, + "completions/clipped_ratio": 0.002604166666666685, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.5, + "completions/mean_length": 171.62891387939453, + "completions/mean_terminated_length": 170.74213409423828, + "completions/min_length": 85.5, + "completions/min_terminated_length": 85.5, + "entropy": 0.4290965742535061, + "epoch": 0.03333926031294452, + "frac_reward_zero_std": 0.5104166865348816, + "grad_norm": 0.2608935534954071, + "kl": 0.20678435928291744, + "learning_rate": 8.509967099778933e-07, + "loss": 0.0128, + "num_tokens": 13163034.0, + "reward": 1.2486979365348816, + "reward_std": 0.2202325016260147, + "rewards/equation_reward_func/mean": 0.2513020858168602, + "rewards/equation_reward_func/std": 0.43289557099342346, + "rewards/format_reward_func/mean": 0.9973958432674408, + "rewards/format_reward_func/std": 0.04925142601132393, + "sampling/importance_sampling_ratio/max": 1.8249186277389526, + "sampling/importance_sampling_ratio/mean": 0.9998769462108612, + "sampling/importance_sampling_ratio/min": 0.47857430577278137, + "sampling/sampling_logp_difference/max": 0.980915755033493, + "sampling/sampling_logp_difference/mean": 0.01346610626205802, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0005842774365899257, + "clip_ratio/high_mean": 0.0005842774365899257, + "clip_ratio/low_mean": 0.00032540460718640435, + "clip_ratio/low_min": 0.00032540460718640435, + "clip_ratio/region_mean": 0.0009096820379555639, + "completions/clipped_ratio": 0.005208333333333315, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.5, + "completions/mean_length": 171.0826873779297, + "completions/mean_terminated_length": 169.3041000366211, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.4244159738222758, + "epoch": 0.0346728307254623, + "frac_reward_zero_std": 0.536458358168602, + "grad_norm": 0.29272767901420593, + "kl": 0.18204551736513774, + "learning_rate": 8.379426445948932e-07, + "loss": 0.0148, + "num_tokens": 13659921.0, + "reward": 1.3020833730697632, + "reward_std": 0.2015741690993309, + "rewards/equation_reward_func/mean": 0.3072916567325592, + "rewards/equation_reward_func/std": 0.46159426867961884, + "rewards/format_reward_func/mean": 0.9947916567325592, + "rewards/format_reward_func/std": 0.06954876892268658, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000101923942566, + "sampling/importance_sampling_ratio/min": 0.38368818163871765, + "sampling/sampling_logp_difference/max": 1.3043135404586792, + "sampling/sampling_logp_difference/mean": 0.013248240575194359, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0005383453202537365, + "clip_ratio/high_mean": 0.0005383453202537365, + "clip_ratio/low_mean": 0.00044601258996408434, + "clip_ratio/low_min": 0.00044601258996408434, + "clip_ratio/region_mean": 0.0009843579073074377, + "completions/clipped_ratio": 0.004557291666666685, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 173.892578125, + "completions/mean_terminated_length": 172.34494018554688, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.4247790104813046, + "epoch": 0.036006401137980086, + "frac_reward_zero_std": 0.5364583730697632, + "grad_norm": 0.22071315348148346, + "kl": 0.1905387987693151, + "learning_rate": 8.244499850989451e-07, + "loss": 0.0182, + "num_tokens": 14161420.0, + "reward": 1.2727864980697632, + "reward_std": 0.2035394236445427, + "rewards/equation_reward_func/mean": 0.27734375, + "rewards/equation_reward_func/std": 0.44794152677059174, + "rewards/format_reward_func/mean": 0.9954427182674408, + "rewards/format_reward_func/std": 0.06722298264503479, + "sampling/importance_sampling_ratio/max": 1.9829902052879333, + "sampling/importance_sampling_ratio/mean": 1.0000203251838684, + "sampling/importance_sampling_ratio/min": 0.35485656559467316, + "sampling/sampling_logp_difference/max": 1.0371501445770264, + "sampling/sampling_logp_difference/mean": 0.012964142020791769, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0006448455293947417, + "clip_ratio/high_mean": 0.0006448455293947417, + "clip_ratio/low_mean": 0.00024130035211176922, + "clip_ratio/low_min": 0.00024130035211176922, + "clip_ratio/region_mean": 0.0008861458705117305, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 174.1354217529297, + "completions/mean_terminated_length": 172.1516571044922, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.42853086325857376, + "epoch": 0.03733997155049787, + "frac_reward_zero_std": 0.489583358168602, + "grad_norm": 0.2709422707557678, + "kl": 0.17643056445651584, + "learning_rate": 8.105362427485942e-07, + "loss": 0.0195, + "num_tokens": 14663308.0, + "reward": 1.3404948115348816, + "reward_std": 0.22940540313720703, + "rewards/equation_reward_func/mean": 0.3463541716337204, + "rewards/equation_reward_func/std": 0.4759208410978317, + "rewards/format_reward_func/mean": 0.994140625, + "rewards/format_reward_func/std": 0.07525911927223206, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000093936920166, + "sampling/importance_sampling_ratio/min": 0.22675791382789612, + "sampling/sampling_logp_difference/max": 1.5817719101905823, + "sampling/sampling_logp_difference/mean": 0.012891901191323996, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0006030157308689215, + "clip_ratio/high_mean": 0.0006030157308689215, + "clip_ratio/low_mean": 0.0002967870997963473, + "clip_ratio/low_min": 0.0002967870997963473, + "clip_ratio/region_mean": 0.0008998028254912546, + "completions/clipped_ratio": 0.003255208333333315, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 174.322265625, + "completions/mean_terminated_length": 173.2196807861328, + "completions/min_length": 76.5, + "completions/min_terminated_length": 76.5, + "entropy": 0.43246553474002414, + "epoch": 0.038673541963015645, + "frac_reward_zero_std": 0.5989583432674408, + "grad_norm": 0.22108739614486694, + "kl": 0.18742341233624352, + "learning_rate": 7.962194752988518e-07, + "loss": 0.0135, + "num_tokens": 15165235.0, + "reward": 1.2766927480697632, + "reward_std": 0.17926111817359924, + "rewards/equation_reward_func/mean": 0.2805989533662796, + "rewards/equation_reward_func/std": 0.44955602288246155, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.06151263229548931, + "sampling/importance_sampling_ratio/max": 1.8048723936080933, + "sampling/importance_sampling_ratio/mean": 0.9998882114887238, + "sampling/importance_sampling_ratio/min": 0.2515784278512001, + "sampling/sampling_logp_difference/max": 1.607212781906128, + "sampling/sampling_logp_difference/mean": 0.01319821598008275, + "step": 145 + }, + { + "clip_ratio/high_max": 0.000455771059043602, + "clip_ratio/high_mean": 0.000455771059043602, + "clip_ratio/low_mean": 0.0005551438691327349, + "clip_ratio/low_min": 0.0005551438691327349, + "clip_ratio/region_mean": 0.001010914930763344, + "completions/clipped_ratio": 0.00651041666666663, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.5, + "completions/mean_length": 172.05599212646484, + "completions/mean_terminated_length": 169.83926391601562, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4205744167168935, + "epoch": 0.04000711237553343, + "frac_reward_zero_std": 0.5677083432674408, + "grad_norm": 0.19048988819122314, + "kl": 0.19434802863332962, + "learning_rate": 7.815182635651912e-07, + "loss": 0.0188, + "num_tokens": 15663849.0, + "reward": 1.2884114980697632, + "reward_std": 0.18946316838264465, + "rewards/equation_reward_func/mean": 0.2949218675494194, + "rewards/equation_reward_func/std": 0.45279279351234436, + "rewards/format_reward_func/mean": 0.9934895634651184, + "rewards/format_reward_func/std": 0.07629651390016079, + "sampling/importance_sampling_ratio/max": 1.929445505142212, + "sampling/importance_sampling_ratio/mean": 0.9999913275241852, + "sampling/importance_sampling_ratio/min": 0.5050479918718338, + "sampling/sampling_logp_difference/max": 0.7063337564468384, + "sampling/sampling_logp_difference/mean": 0.012342919129878283, + "step": 150 + }, + { + "clip_ratio/high_max": 0.00041446864488534627, + "clip_ratio/high_mean": 0.00041446864488534627, + "clip_ratio/low_mean": 0.0003240321924547768, + "clip_ratio/low_min": 0.0003240321924547768, + "clip_ratio/region_mean": 0.0007385008425141375, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.5, + "completions/mean_length": 169.75130462646484, + "completions/mean_terminated_length": 167.73681640625, + "completions/min_length": 73.5, + "completions/min_terminated_length": 73.5, + "entropy": 0.4019563555717468, + "epoch": 0.04134068278805121, + "frac_reward_zero_std": 0.5312500298023224, + "grad_norm": 0.2859667241573334, + "kl": 0.2029052002562417, + "learning_rate": 7.664516873086987e-07, + "loss": 0.0258, + "num_tokens": 16159091.0, + "reward": 1.2975261211395264, + "reward_std": 0.2108873352408409, + "rewards/equation_reward_func/mean": 0.3033854216337204, + "rewards/equation_reward_func/std": 0.45988011360168457, + "rewards/format_reward_func/mean": 0.994140625, + "rewards/format_reward_func/std": 0.07625199481844902, + "sampling/importance_sampling_ratio/max": 1.8173683285713196, + "sampling/importance_sampling_ratio/mean": 0.9999800026416779, + "sampling/importance_sampling_ratio/min": 0.2708848789334297, + "sampling/sampling_logp_difference/max": 1.4767508506774902, + "sampling/sampling_logp_difference/mean": 0.012260426301509142, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0008528313479877802, + "clip_ratio/high_mean": 0.0008528313479877802, + "clip_ratio/low_mean": 0.0006209706496317975, + "clip_ratio/low_min": 0.0006209706496317975, + "clip_ratio/region_mean": 0.001473802014435124, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 169.5572967529297, + "completions/mean_terminated_length": 166.8608856201172, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.3816792335775163, + "epoch": 0.04267425320056899, + "frac_reward_zero_std": 0.5729166865348816, + "grad_norm": 0.24159470200538635, + "kl": 0.2158263640271293, + "learning_rate": 7.510393004736722e-07, + "loss": 0.0157, + "num_tokens": 16406399.0, + "reward": 1.3463542461395264, + "reward_std": 0.1807473599910736, + "rewards/equation_reward_func/mean": 0.3541666567325592, + "rewards/equation_reward_func/std": 0.47857171297073364, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.08809977769851685, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001572370529175, + "sampling/importance_sampling_ratio/min": 0.44540029764175415, + "sampling/sampling_logp_difference/max": 0.8087818622589111, + "sampling/sampling_logp_difference/mean": 0.012025379575788975, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0005206197885046196, + "clip_ratio/high_mean": 0.0005206197885046196, + "clip_ratio/low_mean": 0.0002294917636188782, + "clip_ratio/low_min": 0.0002294917636188782, + "clip_ratio/region_mean": 0.0007501115579442638, + "completions/clipped_ratio": 0.007161458333333315, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 170.11263275146484, + "completions/mean_terminated_length": 167.67625427246094, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, + "entropy": 0.37027359273698596, + "epoch": 0.04400782361308677, + "frac_reward_zero_std": 0.6250000298023224, + "grad_norm": 0.22267894446849823, + "kl": 0.22396905488438076, + "learning_rate": 7.353011058098103e-07, + "loss": 0.0203, + "num_tokens": 16902348.0, + "reward": 1.2955729365348816, + "reward_std": 0.15850768983364105, + "rewards/equation_reward_func/mean": 0.3027343675494194, + "rewards/equation_reward_func/std": 0.455763041973114, + "rewards/format_reward_func/mean": 0.9928385317325592, + "rewards/format_reward_func/std": 0.0793424490839243, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000206232070923, + "sampling/importance_sampling_ratio/min": 0.24150198698043823, + "sampling/sampling_logp_difference/max": 1.4618057608604431, + "sampling/sampling_logp_difference/mean": 0.011629619169980288, + "step": 165 + }, + { + "clip_ratio/high_max": 0.00039232027742804753, + "clip_ratio/high_mean": 0.00039232027742804753, + "clip_ratio/low_mean": 0.00025200080967301293, + "clip_ratio/low_min": 0.00025200080967301293, + "clip_ratio/region_mean": 0.0006443210874244364, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.5, + "completions/mean_length": 158.5260467529297, + "completions/mean_terminated_length": 157.13428497314453, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.35925259788831077, + "epoch": 0.04534139402560455, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2323993742465973, + "kl": 0.2927443004316754, + "learning_rate": 7.192575289119245e-07, + "loss": 0.0118, + "num_tokens": 17380188.0, + "reward": 1.33984375, + "reward_std": 0.1657690703868866, + "rewards/equation_reward_func/mean": 0.34375, + "rewards/equation_reward_func/std": 0.46854379773139954, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.06151263229548931, + "sampling/importance_sampling_ratio/max": 1.8994778394699097, + "sampling/importance_sampling_ratio/mean": 1.0000271797180176, + "sampling/importance_sampling_ratio/min": 0.32314135134220123, + "sampling/sampling_logp_difference/max": 1.1470708847045898, + "sampling/sampling_logp_difference/mean": 0.011732298880815506, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0005873265297850593, + "clip_ratio/high_mean": 0.0005873265297850593, + "clip_ratio/low_mean": 0.0002976627059979364, + "clip_ratio/low_min": 0.0002976627059979364, + "clip_ratio/region_mean": 0.0008849892380466271, + "completions/clipped_ratio": 0.005208333333333315, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.5, + "completions/mean_length": 157.38607025146484, + "completions/mean_terminated_length": 155.54173278808594, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.35415328476164076, + "epoch": 0.04667496443812233, + "frac_reward_zero_std": 0.5625000298023224, + "grad_norm": 0.20426678657531738, + "kl": 0.25026152034600574, + "learning_rate": 7.029293917108677e-07, + "loss": 0.0174, + "num_tokens": 17856317.0, + "reward": 1.4042969346046448, + "reward_std": 0.1873713582754135, + "rewards/equation_reward_func/mean": 0.4095052033662796, + "rewards/equation_reward_func/std": 0.491452157497406, + "rewards/format_reward_func/mean": 0.9947916567325592, + "rewards/format_reward_func/std": 0.06954877078533173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999100863933563, + "sampling/importance_sampling_ratio/min": 0.2912691496312618, + "sampling/sampling_logp_difference/max": 1.7345809936523438, + "sampling/sampling_logp_difference/mean": 0.011669775005429983, + "step": 175 + }, + { + "clip_ratio/high_max": 0.000228812032017029, + "clip_ratio/high_mean": 0.000228812032017029, + "clip_ratio/low_mean": 0.0005502028474842922, + "clip_ratio/low_min": 0.0005502028474842922, + "clip_ratio/region_mean": 0.0007790148814415766, + "completions/clipped_ratio": 0.00520833333333337, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.5, + "completions/mean_length": 160.9303436279297, + "completions/mean_terminated_length": 159.07701110839844, + "completions/min_length": 64.5, + "completions/min_terminated_length": 64.5, + "entropy": 0.3341843148072561, + "epoch": 0.04800853485064011, + "frac_reward_zero_std": 0.5781250298023224, + "grad_norm": 0.2191099375486374, + "kl": 0.23970166212982602, + "learning_rate": 6.863378854500845e-07, + "loss": 0.0214, + "num_tokens": 18337762.0, + "reward": 1.388671875, + "reward_std": 0.17967208474874496, + "rewards/equation_reward_func/mean": 0.39453125, + "rewards/equation_reward_func/std": 0.4879314750432968, + "rewards/format_reward_func/mean": 0.994140625, + "rewards/format_reward_func/std": 0.07304696924984455, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000572502613068, + "sampling/importance_sampling_ratio/min": 0.27417469397187233, + "sampling/sampling_logp_difference/max": 1.580506145954132, + "sampling/sampling_logp_difference/mean": 0.01102939760312438, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0004596881364705041, + "clip_ratio/high_mean": 0.0004596881364705041, + "clip_ratio/low_mean": 0.0004985270132440039, + "clip_ratio/low_min": 0.0004985270132440039, + "clip_ratio/region_mean": 0.0009582151532716428, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.5, + "completions/mean_length": 152.93489837646484, + "completions/mean_terminated_length": 151.52809143066406, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.3119810879230499, + "epoch": 0.049342105263157895, + "frac_reward_zero_std": 0.6510416865348816, + "grad_norm": 0.18948058784008026, + "kl": 0.25336053901248506, + "learning_rate": 6.695045431828524e-07, + "loss": 0.0152, + "num_tokens": 18806414.0, + "reward": 1.4388021230697632, + "reward_std": 0.15766192972660065, + "rewards/equation_reward_func/mean": 0.4427083283662796, + "rewards/equation_reward_func/std": 0.4956718683242798, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.06151263229548931, + "sampling/importance_sampling_ratio/max": 1.9822060465812683, + "sampling/importance_sampling_ratio/mean": 1.0000199973583221, + "sampling/importance_sampling_ratio/min": 0.41931191086769104, + "sampling/sampling_logp_difference/max": 0.9327619969844818, + "sampling/sampling_logp_difference/mean": 0.010495124384760857, + "step": 185 + }, + { + "clip_ratio/high_max": 0.00048528614052985276, + "clip_ratio/high_mean": 0.00048528614052985276, + "clip_ratio/low_mean": 0.0002456105589064666, + "clip_ratio/low_min": 0.0002456105589064666, + "clip_ratio/region_mean": 0.0007308966987895676, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 161.43489837646484, + "completions/mean_terminated_length": 160.05962371826172, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.30431248876783584, + "epoch": 0.05067567567567568, + "frac_reward_zero_std": 0.6510416865348816, + "grad_norm": 0.24482014775276184, + "kl": 0.23480852279398176, + "learning_rate": 6.524512118259121e-07, + "loss": 0.0146, + "num_tokens": 19288498.0, + "reward": 1.4355469346046448, + "reward_std": 0.15682842582464218, + "rewards/equation_reward_func/mean": 0.439453125, + "rewards/equation_reward_func/std": 0.49664008617401123, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.06151263415813446, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999767541885376, + "sampling/importance_sampling_ratio/min": 0.30531009286642075, + "sampling/sampling_logp_difference/max": 1.3271417617797852, + "sampling/sampling_logp_difference/mean": 0.01045646658167243, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0005238070214141367, + "clip_ratio/high_mean": 0.0005238070214141367, + "clip_ratio/low_mean": 0.00021611079120905036, + "clip_ratio/low_min": 0.00021611079120905036, + "clip_ratio/region_mean": 0.0007399178187673291, + "completions/clipped_ratio": 0.00716145833333337, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 170.7213592529297, + "completions/mean_terminated_length": 168.25975799560547, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, + "entropy": 0.31717623472213746, + "epoch": 0.052009246088193455, + "frac_reward_zero_std": 0.6458333730697632, + "grad_norm": 0.2376313954591751, + "kl": 0.23356608119275835, + "learning_rate": 6.352000238057539e-07, + "loss": 0.0197, + "num_tokens": 19785062.0, + "reward": 1.3964844346046448, + "reward_std": 0.14822185412049294, + "rewards/equation_reward_func/mean": 0.4055989533662796, + "rewards/equation_reward_func/std": 0.4893123656511307, + "rewards/format_reward_func/mean": 0.9908854067325592, + "rewards/format_reward_func/std": 0.094081811606884, + "sampling/importance_sampling_ratio/max": 1.8793203234672546, + "sampling/importance_sampling_ratio/mean": 1.000065565109253, + "sampling/importance_sampling_ratio/min": 0.3261307030916214, + "sampling/sampling_logp_difference/max": 1.2673670053482056, + "sampling/sampling_logp_difference/mean": 0.010490935295820236, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0005316998308343399, + "clip_ratio/high_mean": 0.0005316998308343399, + "clip_ratio/low_mean": 0.00033068071514005874, + "clip_ratio/low_min": 0.00033068071514005874, + "clip_ratio/region_mean": 0.0008623805492081576, + "completions/clipped_ratio": 0.00651041666666663, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 171.1197967529297, + "completions/mean_terminated_length": 168.885986328125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.31826107104619344, + "epoch": 0.05334281650071124, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.22351564466953278, + "kl": 0.26972466740343304, + "learning_rate": 6.177733683343578e-07, + "loss": 0.0163, + "num_tokens": 20033266.0, + "reward": 1.38671875, + "reward_std": 0.1559552103281021, + "rewards/equation_reward_func/mean": 0.3958333432674408, + "rewards/equation_reward_func/std": 0.4893476068973541, + "rewards/format_reward_func/mean": 0.9908854365348816, + "rewards/format_reward_func/std": 0.0950961783528328, + "sampling/importance_sampling_ratio/max": 1.8682427406311035, + "sampling/importance_sampling_ratio/mean": 1.0000337362289429, + "sampling/importance_sampling_ratio/min": 0.3602951467037201, + "sampling/sampling_logp_difference/max": 1.020831823348999, + "sampling/sampling_logp_difference/mean": 0.010743354447185993, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0004498787479759711, + "clip_ratio/high_mean": 0.0004498787479759711, + "clip_ratio/low_mean": 0.00022729498288956367, + "clip_ratio/low_min": 0.00022729498288956367, + "clip_ratio/region_mean": 0.000677173730218783, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 176.720703125, + "completions/mean_terminated_length": 173.40760803222656, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, + "entropy": 0.3147707760334015, + "epoch": 0.05467638691322902, + "frac_reward_zero_std": 0.6354166865348816, + "grad_norm": 0.1942840814590454, + "kl": 0.23799771898322636, + "learning_rate": 6.001938623516705e-07, + "loss": 0.0191, + "num_tokens": 20539277.0, + "reward": 1.4042969346046448, + "reward_std": 0.15899360924959183, + "rewards/equation_reward_func/mean": 0.4153645783662796, + "rewards/equation_reward_func/std": 0.4924657344818115, + "rewards/format_reward_func/mean": 0.9889323115348816, + "rewards/format_reward_func/std": 0.1005549281835556, + "sampling/importance_sampling_ratio/max": 1.8476455211639404, + "sampling/importance_sampling_ratio/mean": 1.000089704990387, + "sampling/importance_sampling_ratio/min": 0.4166345149278641, + "sampling/sampling_logp_difference/max": 0.9109713137149811, + "sampling/sampling_logp_difference/mean": 0.01037593511864543, + "step": 205 + }, + { + "clip_ratio/high_max": 0.00039852156704809103, + "clip_ratio/high_mean": 0.00039852156704809103, + "clip_ratio/low_mean": 0.0004050786943278379, + "clip_ratio/low_min": 0.0004050786943278379, + "clip_ratio/region_mean": 0.0008036002646096879, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.5, + "completions/mean_length": 172.04492950439453, + "completions/mean_terminated_length": 170.7117691040039, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, + "entropy": 0.308811209599177, + "epoch": 0.0560099573257468, + "frac_reward_zero_std": 0.5520833432674408, + "grad_norm": 0.20238249003887177, + "kl": 0.26445436477661133, + "learning_rate": 5.824843211725264e-07, + "loss": 0.0305, + "num_tokens": 21038050.0, + "reward": 1.419921875, + "reward_std": 0.1984238103032112, + "rewards/equation_reward_func/mean": 0.4244791716337204, + "rewards/equation_reward_func/std": 0.49348677694797516, + "rewards/format_reward_func/mean": 0.9954427182674408, + "rewards/format_reward_func/std": 0.06722298264503479, + "sampling/importance_sampling_ratio/max": 1.9434946775436401, + "sampling/importance_sampling_ratio/mean": 0.9999637007713318, + "sampling/importance_sampling_ratio/min": 0.26076772809028625, + "sampling/sampling_logp_difference/max": 1.7781074047088623, + "sampling/sampling_logp_difference/mean": 0.01050703413784504, + "step": 210 + }, + { + "clip_ratio/high_max": 0.00040974616648681045, + "clip_ratio/high_mean": 0.00040974616648681045, + "clip_ratio/low_mean": 0.0003410278324736282, + "clip_ratio/low_min": 0.0003410278324736282, + "clip_ratio/region_mean": 0.000750774004134453, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.5, + "completions/mean_length": 171.0084686279297, + "completions/mean_terminated_length": 168.99774169921875, + "completions/min_length": 67.5, + "completions/min_terminated_length": 67.5, + "entropy": 0.3029042184352875, + "epoch": 0.05734352773826458, + "frac_reward_zero_std": 0.6718750298023224, + "grad_norm": 0.16279266774654388, + "kl": 0.2774416406949361, + "learning_rate": 5.646677288761132e-07, + "loss": 0.0141, + "num_tokens": 21535151.0, + "reward": 1.4563802480697632, + "reward_std": 0.1416066288948059, + "rewards/equation_reward_func/mean": 0.4609375, + "rewards/equation_reward_func/std": 0.4987538307905197, + "rewards/format_reward_func/mean": 0.9954427182674408, + "rewards/format_reward_func/std": 0.06722298264503479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999966025352478, + "sampling/importance_sampling_ratio/min": 0.3917490094900131, + "sampling/sampling_logp_difference/max": 0.9386938810348511, + "sampling/sampling_logp_difference/mean": 0.010426444932818413, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0003467093590491762, + "clip_ratio/high_mean": 0.0003467093590491762, + "clip_ratio/low_mean": 0.00024782920599035506, + "clip_ratio/low_min": 0.00024782920599035506, + "clip_ratio/region_mean": 0.0005945385708602973, + "completions/clipped_ratio": 0.004557291666666685, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 177.0826873779297, + "completions/mean_terminated_length": 175.55155181884766, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.30562667846679686, + "epoch": 0.05867709815078236, + "frac_reward_zero_std": 0.6302083432674408, + "grad_norm": 0.23372629284858704, + "kl": 0.28943457702795666, + "learning_rate": 5.467672084764065e-07, + "loss": 0.0226, + "num_tokens": 22041174.0, + "reward": 1.4173177480697632, + "reward_std": 0.16590671986341476, + "rewards/equation_reward_func/mean": 0.4225260466337204, + "rewards/equation_reward_func/std": 0.494156077504158, + "rewards/format_reward_func/mean": 0.9947916865348816, + "rewards/format_reward_func/std": 0.07202750444412231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000020861625671, + "sampling/importance_sampling_ratio/min": 0.3635213226079941, + "sampling/sampling_logp_difference/max": 1.0139039754867554, + "sampling/sampling_logp_difference/mean": 0.01038840925320983, + "step": 220 + }, + { + "clip_ratio/high_max": 0.00038185345345280236, + "clip_ratio/high_mean": 0.00038185345345280236, + "clip_ratio/low_mean": 0.000233135441926101, + "clip_ratio/low_min": 0.000233135441926101, + "clip_ratio/region_mean": 0.0006149888976425346, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.5, + "completions/mean_length": 176.21419525146484, + "completions/mean_terminated_length": 174.90045166015625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.2975032581223382, + "epoch": 0.06001066856330014, + "frac_reward_zero_std": 0.6458333432674408, + "grad_norm": 0.22727879881858826, + "kl": 0.3511068628893958, + "learning_rate": 5.288059919122921e-07, + "loss": 0.0181, + "num_tokens": 22545999.0, + "reward": 1.4192708730697632, + "reward_std": 0.15288615226745605, + "rewards/equation_reward_func/mean": 0.4231770783662796, + "rewards/equation_reward_func/std": 0.493749737739563, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.06151263229548931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999711811542511, + "sampling/importance_sampling_ratio/min": 0.3541726917028427, + "sampling/sampling_logp_difference/max": 1.4118239283561707, + "sampling/sampling_logp_difference/mean": 0.010275719687342644, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0004903958845210986, + "clip_ratio/high_mean": 0.0004903958845210986, + "clip_ratio/low_mean": 0.0001953022573919346, + "clip_ratio/low_min": 0.0001953022573919346, + "clip_ratio/region_mean": 0.0006856981409429056, + "completions/clipped_ratio": 0.0026041666666666297, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 181.92188262939453, + "completions/mean_terminated_length": 181.06005096435547, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, + "entropy": 0.31225186255243087, + "epoch": 0.06134423897581792, + "frac_reward_zero_std": 0.6197916865348816, + "grad_norm": 0.2173406183719635, + "kl": 0.24050753911336262, + "learning_rate": 5.108073898963193e-07, + "loss": 0.0105, + "num_tokens": 23059759.0, + "reward": 1.4322916865348816, + "reward_std": 0.16636358946561813, + "rewards/equation_reward_func/mean": 0.4348958283662796, + "rewards/equation_reward_func/std": 0.49522319436073303, + "rewards/format_reward_func/mean": 0.9973958134651184, + "rewards/format_reward_func/std": 0.05099776014685631, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.1604004092514515, + "sampling/sampling_logp_difference/max": 1.9200037717819214, + "sampling/sampling_logp_difference/mean": 0.010590968187898397, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0004358871306370323, + "clip_ratio/high_mean": 0.0004358871306370323, + "clip_ratio/low_mean": 0.00021185120470666637, + "clip_ratio/low_min": 0.00021185120470666637, + "clip_ratio/region_mean": 0.0006477383418112165, + "completions/clipped_ratio": 0.0013020833333333148, + "completions/max_length": 502.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 180.7760467529297, + "completions/mean_terminated_length": 180.33740234375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.32192651364538405, + "epoch": 0.0626778093883357, + "frac_reward_zero_std": 0.6093750298023224, + "grad_norm": 0.16173921525478363, + "kl": 0.22371604475710127, + "learning_rate": 4.927947616612215e-07, + "loss": 0.0162, + "num_tokens": 23571695.0, + "reward": 1.4466146230697632, + "reward_std": 0.17118638008832932, + "rewards/equation_reward_func/mean": 0.44921875, + "rewards/equation_reward_func/std": 0.49735087156295776, + "rewards/format_reward_func/mean": 0.9973958432674408, + "rewards/format_reward_func/std": 0.03601375222206116, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.23550476133823395, + "sampling/sampling_logp_difference/max": 1.5325356721878052, + "sampling/sampling_logp_difference/mean": 0.010465701576322317, + "step": 235 + }, + { + "clip_ratio/high_max": 0.00034989016874331153, + "clip_ratio/high_mean": 0.00034989016874331153, + "clip_ratio/low_mean": 0.00038944559572781954, + "clip_ratio/low_min": 0.00038944559572781954, + "clip_ratio/region_mean": 0.000739335770615273, + "completions/clipped_ratio": 0.00651041666666663, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 188.44921875, + "completions/mean_terminated_length": 186.32896423339844, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.3212519837750329, + "epoch": 0.06401137980085349, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2004379779100418, + "kl": 0.2435829328166114, + "learning_rate": 4.747914846434627e-07, + "loss": 0.022, + "num_tokens": 23833520.0, + "reward": 1.4205729961395264, + "reward_std": 0.17257216572761536, + "rewards/equation_reward_func/mean": 0.4296875, + "rewards/equation_reward_func/std": 0.4953540563583374, + "rewards/format_reward_func/mean": 0.9908854365348816, + "rewards/format_reward_func/std": 0.09509618580341339, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999675154685974, + "sampling/importance_sampling_ratio/min": 0.4614628553390503, + "sampling/sampling_logp_difference/max": 1.3090758323669434, + "sampling/sampling_logp_difference/mean": 0.010545244440436363, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0004778647549553878, + "clip_ratio/high_mean": 0.0004778647549553878, + "clip_ratio/low_mean": 0.00032209758517435854, + "clip_ratio/low_min": 0.00032209758517435854, + "clip_ratio/region_mean": 0.0007999623430401294, + "completions/clipped_ratio": 0.008463541666666685, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.5, + "completions/mean_length": 189.0377655029297, + "completions/mean_terminated_length": 186.2826156616211, + "completions/min_length": 68.5, + "completions/min_terminated_length": 68.5, + "entropy": 0.31984990239143374, + "epoch": 0.06534495021337126, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2481832504272461, + "kl": 0.23841883838176728, + "learning_rate": 4.568209241431614e-07, + "loss": 0.0202, + "num_tokens": 24358202.0, + "reward": 1.4342448115348816, + "reward_std": 0.16434622555971146, + "rewards/equation_reward_func/mean": 0.4440104216337204, + "rewards/equation_reward_func/std": 0.49655450880527496, + "rewards/format_reward_func/mean": 0.990234375, + "rewards/format_reward_func/std": 0.09789345785975456, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999916851520538, + "sampling/importance_sampling_ratio/min": 0.43274274468421936, + "sampling/sampling_logp_difference/max": 1.0429438352584839, + "sampling/sampling_logp_difference/mean": 0.010600093752145767, + "step": 245 + }, + { + "clip_ratio/high_max": 0.00028033588993518305, + "clip_ratio/high_mean": 0.00028033588993518305, + "clip_ratio/low_mean": 0.0002272098383400589, + "clip_ratio/low_min": 0.0002272098383400589, + "clip_ratio/region_mean": 0.000507545731185625, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.5, + "completions/mean_length": 199.169921875, + "completions/mean_terminated_length": 197.94733428955078, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.3251332739988963, + "epoch": 0.06667852062588904, + "frac_reward_zero_std": 0.6354166865348816, + "grad_norm": 0.22253383696079254, + "kl": 0.2497336741950777, + "learning_rate": 4.389064029997634e-07, + "loss": 0.0233, + "num_tokens": 24898775.0, + "reward": 1.3880208730697632, + "reward_std": 0.16248539090156555, + "rewards/equation_reward_func/mean": 0.3919270783662796, + "rewards/equation_reward_func/std": 0.4877689331769943, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.05828043818473816, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008940696716, + "sampling/importance_sampling_ratio/min": 0.26857390999794006, + "sampling/sampling_logp_difference/max": 1.3815239071846008, + "sampling/sampling_logp_difference/mean": 0.01083392184227705, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0004061708960863244, + "clip_ratio/high_mean": 0.0004061708960863244, + "clip_ratio/low_mean": 0.0002619471727585834, + "clip_ratio/low_min": 0.0002619471727585834, + "clip_ratio/region_mean": 0.0006681180711085391, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.5, + "completions/mean_length": 202.4362030029297, + "completions/mean_terminated_length": 199.9986801147461, + "completions/min_length": 67.5, + "completions/min_terminated_length": 67.5, + "entropy": 0.32372507254282634, + "epoch": 0.06801209103840683, + "frac_reward_zero_std": 0.5937500298023224, + "grad_norm": 0.20556816458702087, + "kl": 0.24480950600571103, + "learning_rate": 4.21071171322823e-07, + "loss": 0.0225, + "num_tokens": 25444085.0, + "reward": 1.4010416865348816, + "reward_std": 0.17663030326366425, + "rewards/equation_reward_func/mean": 0.41015625, + "rewards/equation_reward_func/std": 0.4920380413532257, + "rewards/format_reward_func/mean": 0.9908854067325592, + "rewards/format_reward_func/std": 0.09484752267599106, + "sampling/importance_sampling_ratio/max": 1.8252553343772888, + "sampling/importance_sampling_ratio/mean": 0.9999038279056549, + "sampling/importance_sampling_ratio/min": 0.49288080632686615, + "sampling/sampling_logp_difference/max": 1.0243215560913086, + "sampling/sampling_logp_difference/mean": 0.010709302965551615, + "step": 255 + }, + { + "clip_ratio/high_max": 0.000378556484873924, + "clip_ratio/high_mean": 0.000378556484873924, + "clip_ratio/low_mean": 0.0002973765532563751, + "clip_ratio/low_min": 0.0002973765532563751, + "clip_ratio/region_mean": 0.0006759330378069232, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.5, + "completions/mean_length": 199.3697967529297, + "completions/mean_terminated_length": 196.93287658691406, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.31678102943632336, + "epoch": 0.0693456614509246, + "frac_reward_zero_std": 0.578125, + "grad_norm": 0.2536647617816925, + "kl": 0.25352691577540504, + "learning_rate": 4.0333837631717376e-07, + "loss": 0.024, + "num_tokens": 25984797.0, + "reward": 1.4283854365348816, + "reward_std": 0.1874813660979271, + "rewards/equation_reward_func/mean": 0.4361979216337204, + "rewards/equation_reward_func/std": 0.490349143743515, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.08778633177280426, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999091029167175, + "sampling/importance_sampling_ratio/min": 0.14155469788238406, + "sampling/sampling_logp_difference/max": 2.815498113632202, + "sampling/sampling_logp_difference/mean": 0.01065267063677311, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0005121126489636178, + "clip_ratio/high_mean": 0.0005121126489636178, + "clip_ratio/low_mean": 0.0003389536577742547, + "clip_ratio/low_min": 0.0003389536577742547, + "clip_ratio/region_mean": 0.0008510663102950073, + "completions/clipped_ratio": 0.003255208333333315, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.5, + "completions/mean_length": 180.0319061279297, + "completions/mean_terminated_length": 178.94473266601562, + "completions/min_length": 69.5, + "completions/min_terminated_length": 69.5, + "entropy": 0.29022469321886696, + "epoch": 0.0706792318634424, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.2507019639015198, + "kl": 0.36944893532329137, + "learning_rate": 3.8573103224165547e-07, + "loss": 0.0271, + "num_tokens": 26495342.0, + "reward": 1.5156250596046448, + "reward_std": 0.1780867874622345, + "rewards/equation_reward_func/mean": 0.5201822966337204, + "rewards/equation_reward_func/std": 0.49822771549224854, + "rewards/format_reward_func/mean": 0.9954427182674408, + "rewards/format_reward_func/std": 0.06722298264503479, + "sampling/importance_sampling_ratio/max": 1.8695436716079712, + "sampling/importance_sampling_ratio/mean": 1.0000428557395935, + "sampling/importance_sampling_ratio/min": 0.07395134610123932, + "sampling/sampling_logp_difference/max": 3.476148009300232, + "sampling/sampling_logp_difference/mean": 0.010271421167999506, + "step": 265 + }, + { + "clip_ratio/high_max": 0.00039898287860624904, + "clip_ratio/high_mean": 0.00039898287860624904, + "clip_ratio/low_mean": 0.00030372722928101816, + "clip_ratio/low_min": 0.00030372722928101816, + "clip_ratio/region_mean": 0.0007027101111210262, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.5, + "completions/mean_length": 194.29883575439453, + "completions/mean_terminated_length": 193.05294036865234, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.29568223887019685, + "epoch": 0.07201280227596017, + "frac_reward_zero_std": 0.6510416865348816, + "grad_norm": 0.1862553507089615, + "kl": 0.2528458439641529, + "learning_rate": 3.6827199054038036e-07, + "loss": 0.0174, + "num_tokens": 27028249.0, + "reward": 1.4173177480697632, + "reward_std": 0.15105794370174408, + "rewards/equation_reward_func/mean": 0.421875, + "rewards/equation_reward_func/std": 0.4940943270921707, + "rewards/format_reward_func/mean": 0.9954427182674408, + "rewards/format_reward_func/std": 0.06722298264503479, + "sampling/importance_sampling_ratio/max": 1.9082924723625183, + "sampling/importance_sampling_ratio/mean": 0.9999897181987762, + "sampling/importance_sampling_ratio/min": 0.2628078907728195, + "sampling/sampling_logp_difference/max": 1.3640477061271667, + "sampling/sampling_logp_difference/mean": 0.010448508895933628, + "step": 270 + }, + { + "clip_ratio/high_max": 0.00038676669226131503, + "clip_ratio/high_mean": 0.00038676669226131503, + "clip_ratio/low_mean": 0.00018815232971165744, + "clip_ratio/low_min": 0.00018815232971165744, + "clip_ratio/region_mean": 0.0005749190232664761, + "completions/clipped_ratio": 0.0013020833333333148, + "completions/max_length": 505.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 190.041015625, + "completions/mean_terminated_length": 189.63401794433594, + "completions/min_length": 70.5, + "completions/min_terminated_length": 70.5, + "entropy": 0.30600046979056467, + "epoch": 0.07334637268847795, + "frac_reward_zero_std": 0.6302083432674408, + "grad_norm": 0.18525779247283936, + "kl": 0.2684856136639913, + "learning_rate": 3.5098391018530813e-07, + "loss": 0.0169, + "num_tokens": 27554520.0, + "reward": 1.4667969346046448, + "reward_std": 0.16029469668865204, + "rewards/equation_reward_func/mean": 0.4680989682674408, + "rewards/equation_reward_func/std": 0.4967698007822037, + "rewards/format_reward_func/mean": 0.9986979067325592, + "rewards/format_reward_func/std": 0.025498880073428154, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998972415924072, + "sampling/importance_sampling_ratio/min": 0.31307777017354965, + "sampling/sampling_logp_difference/max": 1.2583473920822144, + "sampling/sampling_logp_difference/mean": 0.010472552385181189, + "step": 275 + }, + { + "clip_ratio/high_max": 0.00032300802866100435, + "clip_ratio/high_mean": 0.00032300802866100435, + "clip_ratio/low_mean": 0.0002205706091545936, + "clip_ratio/low_min": 0.0002205706091545936, + "clip_ratio/region_mean": 0.0005435786416961088, + "completions/clipped_ratio": 0.0013020833333333703, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 174.7877655029297, + "completions/mean_terminated_length": 174.3480987548828, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.2920326189862357, + "epoch": 0.07467994310099574, + "frac_reward_zero_std": 0.7916666865348816, + "grad_norm": 0.19834178686141968, + "kl": 0.26587820880942875, + "learning_rate": 3.3388922826861785e-07, + "loss": 0.0162, + "num_tokens": 27805821.0, + "reward": 1.5221354961395264, + "reward_std": 0.08995597064495087, + "rewards/equation_reward_func/mean": 0.5234375, + "rewards/equation_reward_func/std": 0.49977585673332214, + "rewards/format_reward_func/mean": 0.9986979365348816, + "rewards/format_reward_func/std": 0.03608439117670059, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000381469726562, + "sampling/importance_sampling_ratio/min": 0.24444493651390076, + "sampling/sampling_logp_difference/max": 1.408765196800232, + "sampling/sampling_logp_difference/mean": 0.009937557391822338, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0003733713551709014, + "clip_ratio/high_mean": 0.0003733713551709014, + "clip_ratio/low_mean": 0.00024178889418383025, + "clip_ratio/low_min": 0.00024178889418383025, + "clip_ratio/region_mean": 0.0006151602519417389, + "completions/clipped_ratio": 0.004557291666666685, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 194.90560150146484, + "completions/mean_terminated_length": 193.46009826660156, + "completions/min_length": 68.5, + "completions/min_terminated_length": 68.5, + "entropy": 0.30232903560002644, + "epoch": 0.07601351351351351, + "frac_reward_zero_std": 0.6302083432674408, + "grad_norm": 0.23240312933921814, + "kl": 0.264598982863956, + "learning_rate": 3.1701013088304206e-07, + "loss": 0.0185, + "num_tokens": 28339652.0, + "reward": 1.4537760615348816, + "reward_std": 0.1596420630812645, + "rewards/equation_reward_func/mean": 0.458984375, + "rewards/equation_reward_func/std": 0.4979875087738037, + "rewards/format_reward_func/mean": 0.9947916567325592, + "rewards/format_reward_func/std": 0.0714474730193615, + "sampling/importance_sampling_ratio/max": 1.934059500694275, + "sampling/importance_sampling_ratio/mean": 0.9999870955944061, + "sampling/importance_sampling_ratio/min": 0.18720961920917034, + "sampling/sampling_logp_difference/max": 2.4462757110595703, + "sampling/sampling_logp_difference/mean": 0.01020945655182004, + "step": 285 + }, + { + "clip_ratio/high_max": 0.00027480118248301245, + "clip_ratio/high_mean": 0.00027480118248301245, + "clip_ratio/low_mean": 0.00016693919961754646, + "clip_ratio/low_min": 0.00016693919961754646, + "clip_ratio/region_mean": 0.00044174038242393484, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.5, + "completions/mean_length": 188.88867950439453, + "completions/mean_terminated_length": 188.25939178466797, + "completions/min_length": 70.5, + "completions/min_terminated_length": 70.5, + "entropy": 0.2999881055619982, + "epoch": 0.07734708392603129, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.2126220464706421, + "kl": 0.24879435135258568, + "learning_rate": 3.003685243279592e-07, + "loss": 0.0216, + "num_tokens": 28864257.0, + "reward": 1.4648438096046448, + "reward_std": 0.1629902645945549, + "rewards/equation_reward_func/mean": 0.4674479216337204, + "rewards/equation_reward_func/std": 0.49728669226169586, + "rewards/format_reward_func/mean": 0.9973958432674408, + "rewards/format_reward_func/std": 0.04925142601132393, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000181794166565, + "sampling/importance_sampling_ratio/min": 0.350922629237175, + "sampling/sampling_logp_difference/max": 1.0541338324546814, + "sampling/sampling_logp_difference/mean": 0.010166732594370842, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0003630790345293159, + "clip_ratio/high_mean": 0.0003630790345293159, + "clip_ratio/low_mean": 0.00018830655284950303, + "clip_ratio/low_min": 0.00018830655284950303, + "clip_ratio/region_mean": 0.000551385587055443, + "completions/clipped_ratio": 0.004557291666666685, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 183.171875, + "completions/mean_terminated_length": 181.68539428710938, + "completions/min_length": 62.5, + "completions/min_terminated_length": 62.5, + "entropy": 0.28412733740276763, + "epoch": 0.07868065433854908, + "frac_reward_zero_std": 0.6822916865348816, + "grad_norm": 0.21540172398090363, + "kl": 0.24528321226437885, + "learning_rate": 2.839860066786103e-07, + "loss": 0.0148, + "num_tokens": 29379897.0, + "reward": 1.4720052480697632, + "reward_std": 0.13307339698076248, + "rewards/equation_reward_func/mean": 0.4765625, + "rewards/equation_reward_func/std": 0.49971458315849304, + "rewards/format_reward_func/mean": 0.9954427182674408, + "rewards/format_reward_func/std": 0.06209208443760872, + "sampling/importance_sampling_ratio/max": 1.8820261359214783, + "sampling/importance_sampling_ratio/mean": 0.9998998045921326, + "sampling/importance_sampling_ratio/min": 0.36002135276794434, + "sampling/sampling_logp_difference/max": 1.1157363057136536, + "sampling/sampling_logp_difference/mean": 0.009611997287720442, + "step": 295 + }, + { + "clip_ratio/high_max": 0.00036827284849197086, + "clip_ratio/high_mean": 0.00036827284849197086, + "clip_ratio/low_mean": 0.0002555595329290049, + "clip_ratio/low_min": 0.0002555595329290049, + "clip_ratio/region_mean": 0.0006238323781872168, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.5, + "completions/mean_length": 182.7154998779297, + "completions/mean_terminated_length": 181.42316436767578, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.26423416799969146, + "epoch": 0.08001422475106686, + "frac_reward_zero_std": 0.6302083432674408, + "grad_norm": 0.18316447734832764, + "kl": 0.2664960814846887, + "learning_rate": 2.6788383975533993e-07, + "loss": 0.0181, + "num_tokens": 29895396.0, + "reward": 1.4759114980697632, + "reward_std": 0.1575167253613472, + "rewards/equation_reward_func/mean": 0.4798177033662796, + "rewards/equation_reward_func/std": 0.4993356764316559, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.06151263229548931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000330805778503, + "sampling/importance_sampling_ratio/min": 0.34555216133594513, + "sampling/sampling_logp_difference/max": 1.087525725364685, + "sampling/sampling_logp_difference/mean": 0.009254928212612867, + "step": 300 + }, + { + "clip_ratio/high_max": 0.00029552211344707757, + "clip_ratio/high_mean": 0.00029552211344707757, + "clip_ratio/low_mean": 0.0001925515308458772, + "clip_ratio/low_min": 0.0001925515308458772, + "clip_ratio/region_mean": 0.0004880736417059476, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 179.11458587646484, + "completions/mean_terminated_length": 177.8103790283203, + "completions/min_length": 66.5, + "completions/min_terminated_length": 66.5, + "entropy": 0.2656091329124239, + "epoch": 0.08134779516358463, + "frac_reward_zero_std": 0.640625, + "grad_norm": 0.20923396944999695, + "kl": 0.26239980657895406, + "learning_rate": 2.520829215292426e-07, + "loss": 0.0146, + "num_tokens": 30404612.0, + "reward": 1.4960938096046448, + "reward_std": 0.15754638612270355, + "rewards/equation_reward_func/mean": 0.5006510466337204, + "rewards/equation_reward_func/std": 0.5000161230564117, + "rewards/format_reward_func/mean": 0.9954427182674408, + "rewards/format_reward_func/std": 0.06722298264503479, + "sampling/importance_sampling_ratio/max": 1.9263655543327332, + "sampling/importance_sampling_ratio/mean": 0.9999202489852905, + "sampling/importance_sampling_ratio/min": 0.2630079463124275, + "sampling/sampling_logp_difference/max": 1.4374548196792603, + "sampling/sampling_logp_difference/mean": 0.009449337143450975, + "step": 305 + }, + { + "clip_ratio/high_max": 0.00042231839421826104, + "clip_ratio/high_mean": 0.00042231839421826104, + "clip_ratio/low_mean": 0.00024206019330045414, + "clip_ratio/low_min": 0.00024206019330045414, + "clip_ratio/region_mean": 0.0006643785868719634, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.5, + "completions/max_terminated_length": 498.5, + "completions/mean_length": 178.81966400146484, + "completions/mean_terminated_length": 178.81966400146484, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.26296143333117167, + "epoch": 0.08268136557610242, + "frac_reward_zero_std": 0.5781250298023224, + "grad_norm": 0.22123926877975464, + "kl": 0.27026213506857555, + "learning_rate": 2.366037590000236e-07, + "loss": 0.0286, + "num_tokens": 30913383.0, + "reward": 1.5117188096046448, + "reward_std": 0.1835896149277687, + "rewards/equation_reward_func/mean": 0.5123698115348816, + "rewards/equation_reward_func/std": 0.5001688897609711, + "rewards/format_reward_func/mean": 0.9993489682674408, + "rewards/format_reward_func/std": 0.018042195588350296, + "sampling/importance_sampling_ratio/max": 1.910811960697174, + "sampling/importance_sampling_ratio/mean": 0.9999909698963165, + "sampling/importance_sampling_ratio/min": 0.2877357676625252, + "sampling/sampling_logp_difference/max": 1.254935622215271, + "sampling/sampling_logp_difference/mean": 0.009498789440840483, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0003104596676874078, + "clip_ratio/high_mean": 0.0003104596676874078, + "clip_ratio/low_mean": 0.00026212412986852644, + "clip_ratio/low_min": 0.00026212412986852644, + "clip_ratio/region_mean": 0.0005725837936754235, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 183.0631561279297, + "completions/mean_terminated_length": 181.7668685913086, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.2513858218987783, + "epoch": 0.0840149359886202, + "frac_reward_zero_std": 0.6250000298023224, + "grad_norm": 0.2222646325826645, + "kl": 0.25177126692401036, + "learning_rate": 2.2146644158127826e-07, + "loss": 0.0181, + "num_tokens": 31428904.0, + "reward": 1.5110677480697632, + "reward_std": 0.1636933758854866, + "rewards/equation_reward_func/mean": 0.5156250149011612, + "rewards/equation_reward_func/std": 0.4989316165447235, + "rewards/format_reward_func/mean": 0.9954427182674408, + "rewards/format_reward_func/std": 0.06722298264503479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000070631504059, + "sampling/importance_sampling_ratio/min": 0.1315157115459442, + "sampling/sampling_logp_difference/max": 2.5495256185531616, + "sampling/sampling_logp_difference/mean": 0.00900375097990036, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0003791713918973174, + "clip_ratio/high_mean": 0.0003791713918973174, + "clip_ratio/low_mean": 0.00039865842554718254, + "clip_ratio/low_min": 0.00039865842554718254, + "clip_ratio/region_mean": 0.0007778298158276205, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 178.9921875, + "completions/mean_terminated_length": 177.686279296875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.2471254113647673, + "epoch": 0.08534850640113797, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.23829446732997894, + "kl": 0.2550564338763555, + "learning_rate": 2.0669061502772772e-07, + "loss": 0.0176, + "num_tokens": 31683530.0, + "reward": 1.52734375, + "reward_std": 0.19769152998924255, + "rewards/equation_reward_func/mean": 0.5325520634651184, + "rewards/equation_reward_func/std": 0.4992643892765045, + "rewards/format_reward_func/mean": 0.9947916865348816, + "rewards/format_reward_func/std": 0.07202750444412231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001081228256226, + "sampling/importance_sampling_ratio/min": 0.3776777684688568, + "sampling/sampling_logp_difference/max": 1.2291953563690186, + "sampling/sampling_logp_difference/mean": 0.009398526512086391, + "step": 320 + }, + { + "clip_ratio/high_max": 0.00024981050357584736, + "clip_ratio/high_mean": 0.00024981050357584736, + "clip_ratio/low_mean": 0.0002185462550389477, + "clip_ratio/low_min": 0.0002185462550389477, + "clip_ratio/region_mean": 0.0004683567586147951, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.5, + "completions/max_terminated_length": 496.5, + "completions/mean_length": 189.79492950439453, + "completions/mean_terminated_length": 189.79492950439453, + "completions/min_length": 68.5, + "completions/min_terminated_length": 68.5, + "entropy": 0.24910557634300656, + "epoch": 0.08668207681365576, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.18343009054660797, + "kl": 0.3331004543436898, + "learning_rate": 1.9229545593825363e-07, + "loss": 0.014, + "num_tokens": 32209647.0, + "reward": 1.4921875596046448, + "reward_std": 0.13511116057634354, + "rewards/equation_reward_func/mean": 0.4928385466337204, + "rewards/equation_reward_func/std": 0.4982505291700363, + "rewards/format_reward_func/mean": 0.9993489682674408, + "rewards/format_reward_func/std": 0.018042195588350296, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999713003635406, + "sampling/importance_sampling_ratio/min": 0.3226901516318321, + "sampling/sampling_logp_difference/max": 1.4107424020767212, + "sampling/sampling_logp_difference/mean": 0.009088593069463968, + "step": 325 + }, + { + "clip_ratio/high_max": 0.00025329808090140835, + "clip_ratio/high_mean": 0.00025329808090140835, + "clip_ratio/low_mean": 0.00014644789658228142, + "clip_ratio/low_min": 0.00014644789658228142, + "clip_ratio/region_mean": 0.0003997459778070657, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.5, + "completions/mean_length": 188.43555450439453, + "completions/mean_terminated_length": 187.79903411865234, + "completions/min_length": 70.5, + "completions/min_terminated_length": 70.5, + "entropy": 0.25031254357761806, + "epoch": 0.08801564722617354, + "frac_reward_zero_std": 0.6979166865348816, + "grad_norm": 0.18205001950263977, + "kl": 0.23675336175494724, + "learning_rate": 1.782996468678179e-07, + "loss": 0.0156, + "num_tokens": 32733476.0, + "reward": 1.5117188096046448, + "reward_std": 0.1285114586353302, + "rewards/equation_reward_func/mean": 0.5136718600988388, + "rewards/equation_reward_func/std": 0.499781534075737, + "rewards/format_reward_func/mean": 0.998046875, + "rewards/format_reward_func/std": 0.04354107566177845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000253915786743, + "sampling/importance_sampling_ratio/min": 0.2096487432718277, + "sampling/sampling_logp_difference/max": 1.6379327774047852, + "sampling/sampling_logp_difference/mean": 0.008947435766458511, + "step": 330 + }, + { + "clip_ratio/high_max": 0.00036760022469227096, + "clip_ratio/high_mean": 0.00036760022469227096, + "clip_ratio/low_mean": 0.00019373813152520194, + "clip_ratio/low_min": 0.00019373813152520194, + "clip_ratio/region_mean": 0.0005613383539538417, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 192.64974212646484, + "completions/mean_terminated_length": 191.39713287353516, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.26695258451832665, + "epoch": 0.08934921763869132, + "frac_reward_zero_std": 0.7135416865348816, + "grad_norm": 0.1603485494852066, + "kl": 0.23771749205059475, + "learning_rate": 1.6472135208057125e-07, + "loss": 0.0127, + "num_tokens": 33263226.0, + "reward": 1.4791666865348816, + "reward_std": 0.1239507794380188, + "rewards/equation_reward_func/mean": 0.4830729067325592, + "rewards/equation_reward_func/std": 0.4993588328361511, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.062418460845947266, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000041127204895, + "sampling/importance_sampling_ratio/min": 0.3652953952550888, + "sampling/sampling_logp_difference/max": 1.011906623840332, + "sampling/sampling_logp_difference/mean": 0.009391121566295624, + "step": 335 + }, + { + "clip_ratio/high_max": 0.00027188220345932576, + "clip_ratio/high_mean": 0.00027188220345932576, + "clip_ratio/low_mean": 0.00012609862444353187, + "clip_ratio/low_min": 0.00012609862444353187, + "clip_ratio/region_mean": 0.0003979808295197371, + "completions/clipped_ratio": 0.002604166666666685, + "completions/max_length": 504.5, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 182.22200775146484, + "completions/mean_terminated_length": 181.35569763183594, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.25910536878638796, + "epoch": 0.0906827880512091, + "frac_reward_zero_std": 0.6770833432674408, + "grad_norm": 0.23770348727703094, + "kl": 0.2541147106223636, + "learning_rate": 1.515781939756186e-07, + "loss": 0.0146, + "num_tokens": 33777583.0, + "reward": 1.5520833730697632, + "reward_std": 0.13701710104942322, + "rewards/equation_reward_func/mean": 0.5553385317325592, + "rewards/equation_reward_func/std": 0.496296688914299, + "rewards/format_reward_func/mean": 0.9967447817325592, + "rewards/format_reward_func/std": 0.04023824259638786, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999904036521912, + "sampling/importance_sampling_ratio/min": 0.2545197419822216, + "sampling/sampling_logp_difference/max": 1.6942999362945557, + "sampling/sampling_logp_difference/mean": 0.009073257446289062, + "step": 340 + }, + { + "clip_ratio/high_max": 0.00023049027594323787, + "clip_ratio/high_mean": 0.00023049027594323787, + "clip_ratio/low_mean": 0.00019442280172370373, + "clip_ratio/low_min": 0.00019442280172370373, + "clip_ratio/region_mean": 0.00042491307443318267, + "completions/clipped_ratio": 0.0026041666666666297, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 198.7076873779297, + "completions/mean_terminated_length": 197.88968658447266, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.2685669667190976, + "epoch": 0.09201635846372688, + "frac_reward_zero_std": 0.6302083432674408, + "grad_norm": 0.22922219336032867, + "kl": 0.2642299314339956, + "learning_rate": 1.3888723021603526e-07, + "loss": 0.0214, + "num_tokens": 34317222.0, + "reward": 1.4889323115348816, + "reward_std": 0.15422768890857697, + "rewards/equation_reward_func/mean": 0.4928385317325592, + "rewards/equation_reward_func/std": 0.499693363904953, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.062418460845947266, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999385178089142, + "sampling/importance_sampling_ratio/min": 0.2094741016626358, + "sampling/sampling_logp_difference/max": 1.6018545627593994, + "sampling/sampling_logp_difference/mean": 0.009481040760874748, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0005110090045491233, + "clip_ratio/high_mean": 0.0005110090045491233, + "clip_ratio/low_mean": 0.0002782518490372846, + "clip_ratio/low_min": 0.0002782518490372846, + "clip_ratio/region_mean": 0.0007892608542331598, + "completions/clipped_ratio": 0.007161458333333315, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.5, + "completions/mean_length": 200.57813262939453, + "completions/mean_terminated_length": 198.3343276977539, + "completions/min_length": 64.5, + "completions/min_terminated_length": 64.5, + "entropy": 0.2778268658452564, + "epoch": 0.09334992887624466, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.19151687622070312, + "kl": 0.2644747452603446, + "learning_rate": 1.2666493159081942e-07, + "loss": 0.0209, + "num_tokens": 34859326.0, + "reward": 1.4980469346046448, + "reward_std": 0.14211814105510712, + "rewards/equation_reward_func/mean": 0.5052083283662796, + "rewards/equation_reward_func/std": 0.49980807304382324, + "rewards/format_reward_func/mean": 0.9928385317325592, + "rewards/format_reward_func/std": 0.08428813144564629, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000106692314148, + "sampling/importance_sampling_ratio/min": 0.16273843869566917, + "sampling/sampling_logp_difference/max": 2.139746904373169, + "sampling/sampling_logp_difference/mean": 0.009429726749658585, + "step": 350 + }, + { + "clip_ratio/high_max": 0.00037550903733871464, + "clip_ratio/high_mean": 0.00037550903733871464, + "clip_ratio/low_mean": 0.00020875977093560828, + "clip_ratio/low_min": 0.00020875977093560828, + "clip_ratio/region_mean": 0.0005842688040704363, + "completions/clipped_ratio": 0.008463541666666685, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.5, + "completions/mean_length": 212.54036712646484, + "completions/mean_terminated_length": 210.00491333007812, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.2823557800716824, + "epoch": 0.09468349928876245, + "frac_reward_zero_std": 0.6510416865348816, + "grad_norm": 0.19097639620304108, + "kl": 0.2537163115210003, + "learning_rate": 1.1492716063850971e-07, + "loss": 0.0171, + "num_tokens": 35419812.0, + "reward": 1.4335938096046448, + "reward_std": 0.14987096190452576, + "rewards/equation_reward_func/mean": 0.44140625, + "rewards/equation_reward_func/std": 0.4967052489519119, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.08221758343279362, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999942183494568, + "sampling/importance_sampling_ratio/min": 0.22593369334936142, + "sampling/sampling_logp_difference/max": 1.5091421604156494, + "sampling/sampling_logp_difference/mean": 0.009899929631501436, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0004138908918119139, + "clip_ratio/high_mean": 0.0004138908918119139, + "clip_ratio/low_mean": 0.0002439305920334947, + "clip_ratio/low_min": 0.0002439305920334947, + "clip_ratio/region_mean": 0.0006578214861090398, + "completions/clipped_ratio": 0.00651041666666663, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 221.3619842529297, + "completions/mean_terminated_length": 219.45741271972656, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.2852799680497911, + "epoch": 0.09601706970128022, + "frac_reward_zero_std": 0.6458333730697632, + "grad_norm": 0.1906740814447403, + "kl": 0.25882800188329486, + "learning_rate": 1.0368915106021253e-07, + "loss": 0.0164, + "num_tokens": 35707154.0, + "reward": 1.4088542461395264, + "reward_std": 0.1536119282245636, + "rewards/equation_reward_func/mean": 0.4153645932674408, + "rewards/equation_reward_func/std": 0.4931059181690216, + "rewards/format_reward_func/mean": 0.9934895634651184, + "rewards/format_reward_func/std": 0.08047648519277573, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998905062675476, + "sampling/importance_sampling_ratio/min": 0.025155412033200264, + "sampling/sampling_logp_difference/max": 3.6826822757720947, + "sampling/sampling_logp_difference/mean": 0.010094322264194489, + "step": 360 + }, + { + "clip_ratio/high_max": 0.00036821790013669267, + "clip_ratio/high_mean": 0.00036821790013669267, + "clip_ratio/low_mean": 0.00020876210247580375, + "clip_ratio/low_min": 0.00020876210247580375, + "clip_ratio/region_mean": 0.0005769799971151062, + "completions/clipped_ratio": 0.005208333333333315, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.5, + "completions/mean_length": 192.52474975585938, + "completions/mean_terminated_length": 190.85196685791016, + "completions/min_length": 63.5, + "completions/min_terminated_length": 63.5, + "entropy": 0.27236475778950586, + "epoch": 0.09735064011379801, + "frac_reward_zero_std": 0.6979166865348816, + "grad_norm": 0.16606222093105316, + "kl": 0.2549555543396208, + "learning_rate": 9.296548794875658e-08, + "loss": 0.0205, + "num_tokens": 36236760.0, + "reward": 1.5397136211395264, + "reward_std": 0.1306811273097992, + "rewards/equation_reward_func/mean": 0.5462239682674408, + "rewards/equation_reward_func/std": 0.4980590343475342, + "rewards/format_reward_func/mean": 0.9934895932674408, + "rewards/format_reward_func/std": 0.08006364107131958, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000245869159698, + "sampling/importance_sampling_ratio/min": 0.32276323437690735, + "sampling/sampling_logp_difference/max": 1.1750080585479736, + "sampling/sampling_logp_difference/mean": 0.009734107181429863, + "step": 365 + }, + { + "clip_ratio/high_max": 0.00032508691203676993, + "clip_ratio/high_mean": 0.00032508691203676993, + "clip_ratio/low_mean": 0.0002096930091890196, + "clip_ratio/low_min": 0.0002096930091890196, + "clip_ratio/region_mean": 0.0005347799183154064, + "completions/clipped_ratio": 0.005208333333333315, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.5, + "completions/mean_length": 198.5768280029297, + "completions/mean_terminated_length": 196.9315643310547, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.269310344921218, + "epoch": 0.09868421052631579, + "frac_reward_zero_std": 0.6562500298023224, + "grad_norm": 0.186299130320549, + "kl": 0.2559140615993076, + "learning_rate": 8.277008885963593e-08, + "loss": 0.0203, + "num_tokens": 36776182.0, + "reward": 1.51171875, + "reward_std": 0.14977246522903442, + "rewards/equation_reward_func/mean": 0.5182291567325592, + "rewards/equation_reward_func/std": 0.49978746473789215, + "rewards/format_reward_func/mean": 0.9934895932674408, + "rewards/format_reward_func/std": 0.08006364107131958, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999904453754425, + "sampling/importance_sampling_ratio/min": 0.31694868206977844, + "sampling/sampling_logp_difference/max": 1.2207127809524536, + "sampling/sampling_logp_difference/mean": 0.009575647301971912, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0003482732174840445, + "clip_ratio/high_mean": 0.0003482732174840445, + "clip_ratio/low_mean": 0.00024789382182967127, + "clip_ratio/low_min": 0.00024789382182967127, + "clip_ratio/region_mean": 0.0005961670349481413, + "completions/clipped_ratio": 0.0013020833333333703, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 200.9010467529297, + "completions/mean_terminated_length": 200.4954376220703, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.2636648823817571, + "epoch": 0.10001778093883357, + "frac_reward_zero_std": 0.6510416865348816, + "grad_norm": 0.18824271857738495, + "kl": 0.256957537929217, + "learning_rate": 7.311618574830569e-08, + "loss": 0.0094, + "num_tokens": 37319254.0, + "reward": 1.5325521230697632, + "reward_std": 0.15159861743450165, + "rewards/equation_reward_func/mean": 0.5338541567325592, + "rewards/equation_reward_func/std": 0.49900682270526886, + "rewards/format_reward_func/mean": 0.9986979365348816, + "rewards/format_reward_func/std": 0.03608439117670059, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999472200870514, + "sampling/importance_sampling_ratio/min": 0.18117966502904892, + "sampling/sampling_logp_difference/max": 1.7293313145637512, + "sampling/sampling_logp_difference/mean": 0.00952440220862627, + "step": 375 + }, + { + "clip_ratio/high_max": 0.00036038666666071445, + "clip_ratio/high_mean": 0.00036038666666071445, + "clip_ratio/low_mean": 0.00022362041604032532, + "clip_ratio/low_min": 0.00022362041604032532, + "clip_ratio/region_mean": 0.0005840070839945434, + "completions/clipped_ratio": 0.005208333333333315, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 200.12239837646484, + "completions/mean_terminated_length": 198.49497985839844, + "completions/min_length": 70.5, + "completions/min_terminated_length": 70.5, + "entropy": 0.2741007298231125, + "epoch": 0.10135135135135136, + "frac_reward_zero_std": 0.6718750298023224, + "grad_norm": 0.22426468133926392, + "kl": 0.24386265277862548, + "learning_rate": 6.401630779727451e-08, + "loss": 0.0233, + "num_tokens": 37861010.0, + "reward": 1.5195313096046448, + "reward_std": 0.1441396027803421, + "rewards/equation_reward_func/mean": 0.5240885317325592, + "rewards/equation_reward_func/std": 0.499693363904953, + "rewards/format_reward_func/mean": 0.9954426884651184, + "rewards/format_reward_func/std": 0.06573712453246117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000188946723938, + "sampling/importance_sampling_ratio/min": 0.3154449015855789, + "sampling/sampling_logp_difference/max": 1.2225030064582825, + "sampling/sampling_logp_difference/mean": 0.00955261243507266, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0003885146663782911, + "clip_ratio/high_mean": 0.0003885146663782911, + "clip_ratio/low_mean": 0.00019978681618037323, + "clip_ratio/low_min": 0.00019978681618037323, + "clip_ratio/region_mean": 0.0005883014819119126, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 206.13411712646484, + "completions/mean_terminated_length": 203.11197662353516, + "completions/min_length": 69.5, + "completions/min_terminated_length": 69.5, + "entropy": 0.27878377073340943, + "epoch": 0.10268492176386913, + "frac_reward_zero_std": 0.6302083730697632, + "grad_norm": 0.25973591208457947, + "kl": 0.24229458239343432, + "learning_rate": 5.548226515528132e-08, + "loss": 0.0211, + "num_tokens": 38412264.0, + "reward": 1.4947916865348816, + "reward_std": 0.1535058617591858, + "rewards/equation_reward_func/mean": 0.5052083432674408, + "rewards/equation_reward_func/std": 0.500128984451294, + "rewards/format_reward_func/mean": 0.9895833432674408, + "rewards/format_reward_func/std": 0.10139166191220284, + "sampling/importance_sampling_ratio/max": 1.8118363618850708, + "sampling/importance_sampling_ratio/mean": 1.0000417232513428, + "sampling/importance_sampling_ratio/min": 0.1979382038116455, + "sampling/sampling_logp_difference/max": 1.7080631256103516, + "sampling/sampling_logp_difference/mean": 0.009640831500291824, + "step": 385 + }, + { + "clip_ratio/high_max": 0.00041090645568652286, + "clip_ratio/high_mean": 0.00041090645568652286, + "clip_ratio/low_mean": 0.00019087346606991358, + "clip_ratio/low_min": 0.00019087346606991358, + "clip_ratio/region_mean": 0.0006017799240200677, + "completions/clipped_ratio": 0.00716145833333337, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 201.58399200439453, + "completions/mean_terminated_length": 199.3389663696289, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.27267059452003906, + "epoch": 0.10401849217638691, + "frac_reward_zero_std": 0.6354166865348816, + "grad_norm": 0.20046281814575195, + "kl": 0.26228648126125337, + "learning_rate": 4.7525133609659484e-08, + "loss": 0.0256, + "num_tokens": 38956217.0, + "reward": 1.4928385615348816, + "reward_std": 0.16229917109012604, + "rewards/equation_reward_func/mean": 0.5013020783662796, + "rewards/equation_reward_func/std": 0.5003173649311066, + "rewards/format_reward_func/mean": 0.9915364384651184, + "rewards/format_reward_func/std": 0.0910358801484108, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999051094055176, + "sampling/importance_sampling_ratio/min": 0.2637256905436516, + "sampling/sampling_logp_difference/max": 1.6963087916374207, + "sampling/sampling_logp_difference/mean": 0.009614390321075916, + "step": 390 + }, + { + "clip_ratio/high_max": 0.00037535354333360574, + "clip_ratio/high_mean": 0.00037535354333360574, + "clip_ratio/low_mean": 0.00026431774798159796, + "clip_ratio/low_min": 0.00026431774798159796, + "clip_ratio/region_mean": 0.0006396712942255868, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 194.7447967529297, + "completions/mean_terminated_length": 192.86903381347656, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.2787840538554721, + "epoch": 0.1053520625889047, + "frac_reward_zero_std": 0.6458333730697632, + "grad_norm": 0.18311934173107147, + "kl": 0.3251284198628532, + "learning_rate": 4.015524021178196e-08, + "loss": 0.0225, + "num_tokens": 39489281.0, + "reward": 1.541015625, + "reward_std": 0.1530192792415619, + "rewards/equation_reward_func/mean": 0.5475260317325592, + "rewards/equation_reward_func/std": 0.4973372370004654, + "rewards/format_reward_func/mean": 0.9934895932674408, + "rewards/format_reward_func/std": 0.08006364107131958, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999671578407288, + "sampling/importance_sampling_ratio/min": 0.332446813583374, + "sampling/sampling_logp_difference/max": 1.13972207903862, + "sampling/sampling_logp_difference/mean": 0.009697155095636845, + "step": 395 + }, + { + "clip_ratio/high_max": 0.00041435547011335275, + "clip_ratio/high_mean": 0.00041435547011335275, + "clip_ratio/low_mean": 0.0002526627391085236, + "clip_ratio/low_min": 0.0002526627391085236, + "clip_ratio/region_mean": 0.0006670182037244861, + "completions/clipped_ratio": 0.0026041666666666297, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 210.89453125, + "completions/mean_terminated_length": 210.1083526611328, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.2867226378785239, + "epoch": 0.10668563300142248, + "frac_reward_zero_std": 0.5833333730697632, + "grad_norm": 0.22371231019496918, + "kl": 0.2524174325995975, + "learning_rate": 3.3382149874242814e-08, + "loss": 0.02, + "num_tokens": 39768280.0, + "reward": 1.4453125, + "reward_std": 0.1829361617565155, + "rewards/equation_reward_func/mean": 0.44921875, + "rewards/equation_reward_func/std": 0.49773871898651123, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.062418460845947266, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999879777431488, + "sampling/importance_sampling_ratio/min": 0.2876232862472534, + "sampling/sampling_logp_difference/max": 1.2461037635803223, + "sampling/sampling_logp_difference/mean": 0.010267886333167553, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0003471711728100975, + "clip_ratio/high_mean": 0.0003471711728100975, + "clip_ratio/low_mean": 0.0001415536454361346, + "clip_ratio/low_min": 0.0001415536454361346, + "clip_ratio/region_mean": 0.0004887248208332393, + "completions/clipped_ratio": 0.004557291666666685, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.5, + "completions/mean_length": 194.76107025146484, + "completions/mean_terminated_length": 193.36660766601562, + "completions/min_length": 64.5, + "completions/min_terminated_length": 64.5, + "entropy": 0.2709054324362013, + "epoch": 0.10801920341394025, + "frac_reward_zero_std": 0.6510416865348816, + "grad_norm": 0.20122139155864716, + "kl": 0.2460600412554211, + "learning_rate": 2.721465295716996e-08, + "loss": 0.0255, + "num_tokens": 40301697.0, + "reward": 1.5390625596046448, + "reward_std": 0.15056315809488297, + "rewards/equation_reward_func/mean": 0.5436197966337204, + "rewards/equation_reward_func/std": 0.492304265499115, + "rewards/format_reward_func/mean": 0.9954427182674408, + "rewards/format_reward_func/std": 0.06209208443760872, + "sampling/importance_sampling_ratio/max": 1.9082962274551392, + "sampling/importance_sampling_ratio/mean": 0.9999383985996246, + "sampling/importance_sampling_ratio/min": 0.164414182305336, + "sampling/sampling_logp_difference/max": 1.9235963821411133, + "sampling/sampling_logp_difference/mean": 0.00957494368776679, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0003728364956461721, + "clip_ratio/high_mean": 0.0003728364956461721, + "clip_ratio/low_mean": 0.00019873416749760509, + "clip_ratio/low_min": 0.00019873416749760509, + "clip_ratio/region_mean": 0.000571570666700912, + "completions/clipped_ratio": 0.008463541666666685, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.5, + "completions/mean_length": 205.55404663085938, + "completions/mean_terminated_length": 202.93463897705078, + "completions/min_length": 63.5, + "completions/min_terminated_length": 63.5, + "entropy": 0.2775467402405209, + "epoch": 0.10935277382645804, + "frac_reward_zero_std": 0.6302083730697632, + "grad_norm": 0.22816282510757446, + "kl": 0.23982396490044064, + "learning_rate": 2.1660753859779223e-08, + "loss": 0.0186, + "num_tokens": 40851468.0, + "reward": 1.4928385615348816, + "reward_std": 0.165056474506855, + "rewards/equation_reward_func/mean": 0.5026041716337204, + "rewards/equation_reward_func/std": 0.4994972199201584, + "rewards/format_reward_func/mean": 0.990234375, + "rewards/format_reward_func/std": 0.09834573045372963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999938011169434, + "sampling/importance_sampling_ratio/min": 0.1391434147953987, + "sampling/sampling_logp_difference/max": 2.167668402194977, + "sampling/sampling_logp_difference/mean": 0.009840988088399172, + "step": 410 + }, + { + "clip_ratio/high_max": 0.00048231933016925016, + "clip_ratio/high_mean": 0.00048231933016925016, + "clip_ratio/low_mean": 0.0002598730784181195, + "clip_ratio/low_min": 0.0002598730784181195, + "clip_ratio/region_mean": 0.0007421924050302349, + "completions/clipped_ratio": 0.0032552083333333703, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.5, + "completions/mean_length": 207.0319061279297, + "completions/mean_terminated_length": 206.03699493408203, + "completions/min_length": 69.5, + "completions/min_terminated_length": 69.5, + "entropy": 0.28412299984031253, + "epoch": 0.11068634423897582, + "frac_reward_zero_std": 0.5833333432674408, + "grad_norm": 0.21226766705513, + "kl": 0.2683076818784078, + "learning_rate": 1.672766063197789e-08, + "loss": 0.0162, + "num_tokens": 41403669.0, + "reward": 1.4954427480697632, + "reward_std": 0.18370190262794495, + "rewards/equation_reward_func/mean": 0.4993489533662796, + "rewards/equation_reward_func/std": 0.49986331164836884, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.06151263415813446, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000271797180176, + "sampling/importance_sampling_ratio/min": 0.2862170785665512, + "sampling/sampling_logp_difference/max": 1.4186863899230957, + "sampling/sampling_logp_difference/mean": 0.010202998295426369, + "step": 415 + }, + { + "clip_ratio/high_max": 0.00034577459122778643, + "clip_ratio/high_mean": 0.00034577459122778643, + "clip_ratio/low_mean": 0.00016661739921093816, + "clip_ratio/low_min": 0.00016661739921093816, + "clip_ratio/region_mean": 0.0005123919907621005, + "completions/clipped_ratio": 0.00716145833333337, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 198.767578125, + "completions/mean_terminated_length": 196.5046157836914, + "completions/min_length": 70.5, + "completions/min_terminated_length": 70.5, + "entropy": 0.2770514468352, + "epoch": 0.1120199146514936, + "frac_reward_zero_std": 0.6718750298023224, + "grad_norm": 0.22321927547454834, + "kl": 0.25876561204592385, + "learning_rate": 1.2421775619498199e-08, + "loss": 0.0233, + "num_tokens": 41943384.0, + "reward": 1.517578125, + "reward_std": 0.14652028679847717, + "rewards/equation_reward_func/mean": 0.5247395634651184, + "rewards/equation_reward_func/std": 0.4996977001428604, + "rewards/format_reward_func/mean": 0.9928385615348816, + "rewards/format_reward_func/std": 0.08356184139847755, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000275373458862, + "sampling/importance_sampling_ratio/min": 0.26883871108293533, + "sampling/sampling_logp_difference/max": 1.3706042766571045, + "sampling/sampling_logp_difference/mean": 0.009586183819919825, + "step": 420 + }, + { + "clip_ratio/high_max": 0.00036685404273965915, + "clip_ratio/high_mean": 0.00036685404273965915, + "clip_ratio/low_mean": 0.00020818886599348237, + "clip_ratio/low_min": 0.00020818886599348237, + "clip_ratio/region_mean": 0.0005750429059844464, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 200.52539825439453, + "completions/mean_terminated_length": 198.76984405517578, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, + "entropy": 0.27818418641885123, + "epoch": 0.11335348506401138, + "frac_reward_zero_std": 0.6666666865348816, + "grad_norm": 0.2058248221874237, + "kl": 0.22648685177167258, + "learning_rate": 8.748687154702672e-09, + "loss": 0.0295, + "num_tokens": 42485743.0, + "reward": 1.5045573115348816, + "reward_std": 0.14612428471446037, + "rewards/equation_reward_func/mean": 0.5110677033662796, + "rewards/equation_reward_func/std": 0.4901665598154068, + "rewards/format_reward_func/mean": 0.9934895932674408, + "rewards/format_reward_func/std": 0.07875731959939003, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016450881958, + "sampling/importance_sampling_ratio/min": 0.40848278999328613, + "sampling/sampling_logp_difference/max": 0.9087653458118439, + "sampling/sampling_logp_difference/mean": 0.009386220946907997, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0004233562154695392, + "clip_ratio/high_mean": 0.0004233562154695392, + "clip_ratio/low_mean": 0.00017712132636612903, + "clip_ratio/low_min": 0.00017712132636612903, + "clip_ratio/region_mean": 0.0006004775415122923, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.5, + "completions/mean_length": 199.7018280029297, + "completions/mean_terminated_length": 197.82736206054688, + "completions/min_length": 69.5, + "completions/min_terminated_length": 69.5, + "entropy": 0.26555215650134617, + "epoch": 0.11468705547652916, + "frac_reward_zero_std": 0.6822916865348816, + "grad_norm": 0.2425134927034378, + "kl": 0.24950815704133775, + "learning_rate": 5.713162303845886e-09, + "loss": 0.0217, + "num_tokens": 43026789.0, + "reward": 1.5162761211395264, + "reward_std": 0.13688313961029053, + "rewards/equation_reward_func/mean": 0.5214843899011612, + "rewards/equation_reward_func/std": 0.4958457350730896, + "rewards/format_reward_func/mean": 0.9947916567325592, + "rewards/format_reward_func/std": 0.06954876892268658, + "sampling/importance_sampling_ratio/max": 1.9392542839050293, + "sampling/importance_sampling_ratio/mean": 0.9998978078365326, + "sampling/importance_sampling_ratio/min": 0.285983182489872, + "sampling/sampling_logp_difference/max": 1.2714332342147827, + "sampling/sampling_logp_difference/mean": 0.009993265382945538, + "step": 430 + }, + { + "clip_ratio/high_max": 0.00034185098969222356, + "clip_ratio/high_mean": 0.00034185098969222356, + "clip_ratio/low_mean": 0.00019279461218199383, + "clip_ratio/low_min": 0.00019279461218199383, + "clip_ratio/region_mean": 0.000534645603167721, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.5, + "completions/mean_length": 197.5006561279297, + "completions/mean_terminated_length": 195.65191650390625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.2816057221757041, + "epoch": 0.11602062588904694, + "frac_reward_zero_std": 0.6822916865348816, + "grad_norm": 0.22995464503765106, + "kl": 0.26083281801806557, + "learning_rate": 3.3191406802041688e-09, + "loss": 0.0167, + "num_tokens": 43564542.0, + "reward": 1.5123698711395264, + "reward_std": 0.14350445568561554, + "rewards/equation_reward_func/mean": 0.5182291865348816, + "rewards/equation_reward_func/std": 0.4999932050704956, + "rewards/format_reward_func/mean": 0.994140625, + "rewards/format_reward_func/std": 0.07525911927223206, + "sampling/importance_sampling_ratio/max": 1.8215952515602112, + "sampling/importance_sampling_ratio/mean": 0.9999869465827942, + "sampling/importance_sampling_ratio/min": 0.1803101897239685, + "sampling/sampling_logp_difference/max": 2.213542938232422, + "sampling/sampling_logp_difference/mean": 0.009848698042333126, + "step": 435 + }, + { + "clip_ratio/high_max": 0.00033501201380406405, + "clip_ratio/high_mean": 0.00033501201380406405, + "clip_ratio/low_mean": 0.00020934161924136181, + "clip_ratio/low_min": 0.00020934161924136181, + "clip_ratio/region_mean": 0.0005443536314285464, + "completions/clipped_ratio": 0.0013020833333333703, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 204.40234375, + "completions/mean_terminated_length": 204.0012969970703, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.2727111210425695, + "epoch": 0.11735419630156473, + "frac_reward_zero_std": 0.6666666865348816, + "grad_norm": 0.17391209304332733, + "kl": 0.24492659701241387, + "learning_rate": 1.5697293311039973e-09, + "loss": 0.019, + "num_tokens": 43838819.0, + "reward": 1.515625, + "reward_std": 0.15364935994148254, + "rewards/equation_reward_func/mean": 0.5169270634651184, + "rewards/equation_reward_func/std": 0.5000390410423279, + "rewards/format_reward_func/mean": 0.9986979365348816, + "rewards/format_reward_func/std": 0.03608439117670059, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000205039978027, + "sampling/importance_sampling_ratio/min": 0.25808805227279663, + "sampling/sampling_logp_difference/max": 1.5242679119110107, + "sampling/sampling_logp_difference/mean": 0.0099252387881279, + "step": 440 + }, + { + "clip_ratio/high_max": 0.00030947119699299545, + "clip_ratio/high_mean": 0.00030947119699299545, + "clip_ratio/low_mean": 0.0001354989123582426, + "clip_ratio/low_min": 0.0001354989123582426, + "clip_ratio/region_mean": 0.00044497011015967776, + "completions/clipped_ratio": 0.00911458333333337, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 213.32421875, + "completions/mean_terminated_length": 210.57686614990234, + "completions/min_length": 68.5, + "completions/min_terminated_length": 68.5, + "entropy": 0.2844251851240794, + "epoch": 0.1186877667140825, + "frac_reward_zero_std": 0.7031250298023224, + "grad_norm": 0.21089453995227814, + "kl": 0.2540728171666463, + "learning_rate": 4.671987054842841e-10, + "loss": 0.0176, + "num_tokens": 44400757.0, + "reward": 1.4583333730697632, + "reward_std": 0.13112683594226837, + "rewards/equation_reward_func/mean": 0.4674479067325592, + "rewards/equation_reward_func/std": 0.49901847541332245, + "rewards/format_reward_func/mean": 0.9908854365348816, + "rewards/format_reward_func/std": 0.0950961783528328, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.11136713065207005, + "sampling/sampling_logp_difference/max": 2.4416446685791016, + "sampling/sampling_logp_difference/mean": 0.009969823528081179, + "step": 445 + }, + { + "clip_ratio/high_max": 0.00027995540812197656, + "clip_ratio/high_mean": 0.00027995540812197656, + "clip_ratio/low_mean": 0.00016834852350358333, + "clip_ratio/low_min": 0.00016834852350358333, + "clip_ratio/region_mean": 0.0004483039332424394, + "completions/clipped_ratio": 0.0032552083333333703, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.5, + "completions/mean_length": 201.4264373779297, + "completions/mean_terminated_length": 200.39505004882812, + "completions/min_length": 65.5, + "completions/min_terminated_length": 65.5, + "entropy": 0.2818666017717785, + "epoch": 0.12002133712660028, + "frac_reward_zero_std": 0.6458333432674408, + "grad_norm": 0.246941938996315, + "kl": 0.2329849471648534, + "learning_rate": 1.2979707226135061e-11, + "loss": 0.0278, + "num_tokens": 44944436.0, + "reward": 1.5299479961395264, + "reward_std": 0.15937086939811707, + "rewards/equation_reward_func/mean": 0.5338541567325592, + "rewards/equation_reward_func/std": 0.49874016642570496, + "rewards/format_reward_func/mean": 0.99609375, + "rewards/format_reward_func/std": 0.06151263415813446, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999302923679352, + "sampling/importance_sampling_ratio/min": 0.2703121677041054, + "sampling/sampling_logp_difference/max": 1.323049008846283, + "sampling/sampling_logp_difference/mean": 0.009845161344856024, + "step": 450 + }, + { + "epoch": 0.12002133712660028, + "step": 450, + "total_flos": 0.0, + "train_loss": 0.019761702488693925, + "train_runtime": 13024.8044, + "train_samples_per_second": 9.95, + "train_steps_per_second": 0.035 + } + ], + "logging_steps": 5, + "max_steps": 450, + "num_input_tokens_seen": 44944436, + "num_train_epochs": 1, + "save_steps": 25, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}